* add the armv7a conv3x3s1 implementation without overflow, remove old code * fix the bug of conv3x3s2 packed int8 * new int8 implementation, weights quantized per channel, better accuracy * fix the bug of conv3x3s1 packed int8 neon * add the naive c fp32 and int8 winograd F(2,3) * add the neon intrinsic int8 winograd F(2,3) * optimize the armv7a int8 winograd F(2,3) with neon assembly * optimize the armv7a int8 winograd F(2,3) input transform with assembly. * add the requantize layer and int8 relu implementation. * add graph optimization conv1x1s2 -> conv1x1s1, begin optimizing int8 aarch64. * fix int8 bugs * add the c naive im2col with sgemm * add aarch64 int8 winograd f23, conv3x3s2 naive implementation * add the int8 sgemm conv7x7s2 on x86/armv7a platform * optimize the int8 sgemm by neon intrinsic and packed kernel * optimize the int8 sgemm with packed data * optimize the int8 sgemm with armv7a neon assembly * add the int8 sgemm on arm64-v8a platform * prepare to merge latest code from master * add the int8 param files * in class Net, add the fuse_network method tags/20190320
| @@ -32,6 +32,8 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON) | |||
| option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" OFF) | |||
| option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF) | |||
| option(NCNN_VULKAN "vulkan compute support" OFF) | |||
| option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF) | |||
| option(NCNN_IM2COL_SGEMM "im2col sgemm support" OFF) | |||
| if(NCNN_OPENMP) | |||
| find_package(OpenMP) | |||
| @@ -202,7 +202,7 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const | |||
| time_avg /= g_loop_count; | |||
| fprintf(stderr, "%16s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg); | |||
| fprintf(stderr, "%-20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg); | |||
| } | |||
| void squeezenet_init(ncnn::Net& net) | |||
| @@ -210,6 +210,11 @@ void squeezenet_init(ncnn::Net& net) | |||
| net.load_param("squeezenet.param"); | |||
| } | |||
| void squeezenet_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("squeezenet_int8.param"); | |||
| } | |||
| void squeezenet_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -226,6 +231,11 @@ void mobilenet_init(ncnn::Net& net) | |||
| net.load_param("mobilenet.param"); | |||
| } | |||
| void mobilenet_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("mobilenet_int8.param"); | |||
| } | |||
| void mobilenet_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -306,6 +316,11 @@ void googlenet_init(ncnn::Net& net) | |||
| net.load_param("googlenet.param"); | |||
| } | |||
| void googlenet_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("googlenet_int8.param"); | |||
| } | |||
| void googlenet_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -322,6 +337,11 @@ void resnet18_init(ncnn::Net& net) | |||
| net.load_param("resnet18.param"); | |||
| } | |||
| void resnet18_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("resnet18_int8.param"); | |||
| } | |||
| void resnet18_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -354,6 +374,11 @@ void vgg16_init(ncnn::Net& net) | |||
| net.load_param("vgg16.param"); | |||
| } | |||
| void vgg16_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("vgg16_int8.param"); | |||
| } | |||
| void vgg16_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -365,11 +390,37 @@ void vgg16_run(const ncnn::Net& net) | |||
| ex.extract("prob", out); | |||
| } | |||
| void resnet50_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("resnet50.param"); | |||
| } | |||
| void resnet50_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("resnet50_int8.param"); | |||
| } | |||
| void resnet50_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| ncnn::Mat in(224, 224, 3); | |||
| ex.input("data", in); | |||
| ncnn::Mat out; | |||
| ex.extract("prob", out); | |||
| } | |||
| void squeezenet_ssd_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("squeezenet_ssd.param"); | |||
| } | |||
| void squeezenet_ssd_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("squeezenet_ssd_int8.param"); | |||
| } | |||
| void squeezenet_ssd_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -386,6 +437,11 @@ void mobilenet_ssd_init(ncnn::Net& net) | |||
| net.load_param("mobilenet_ssd.param"); | |||
| } | |||
| void mobilenet_ssd_int8_init(ncnn::Net& net) | |||
| { | |||
| net.load_param("mobilenet_ssd_int8.param"); | |||
| } | |||
| void mobilenet_ssd_run(const ncnn::Net& net) | |||
| { | |||
| ncnn::Extractor ex = net.create_extractor(); | |||
| @@ -497,8 +553,12 @@ int main(int argc, char** argv) | |||
| // run | |||
| benchmark("squeezenet", squeezenet_init, squeezenet_run); | |||
| benchmark("squeezenet-int8", squeezenet_int8_init, squeezenet_run); | |||
| benchmark("mobilenet", mobilenet_init, mobilenet_run); | |||
| benchmark("mobilenet-int8", mobilenet_int8_init, mobilenet_run); | |||
| benchmark("mobilenet_v2", mobilenet_v2_init, mobilenet_v2_run); | |||
| benchmark("shufflenet", shufflenet_init, shufflenet_run); | |||
| @@ -509,16 +569,28 @@ int main(int argc, char** argv) | |||
| benchmark("googlenet", googlenet_init, googlenet_run); | |||
| benchmark("googlenet-int8", googlenet_int8_init, googlenet_run); | |||
| benchmark("resnet18", resnet18_init, resnet18_run); | |||
| benchmark("resnet18-int8", resnet18_int8_init, resnet18_run); | |||
| benchmark("alexnet", alexnet_init, alexnet_run); | |||
| benchmark("vgg16", vgg16_init, vgg16_run); | |||
| benchmark("resnet50", resnet50_init, resnet50_run); | |||
| benchmark("resnet50-int8", resnet50_int8_init, resnet50_run); | |||
| benchmark("squeezenet-ssd", squeezenet_ssd_init, squeezenet_ssd_run); | |||
| benchmark("squeezenet-ssd-int8", squeezenet_ssd_int8_init, squeezenet_ssd_run); | |||
| benchmark("mobilenet-ssd", mobilenet_ssd_init, mobilenet_ssd_run); | |||
| benchmark("mobilenet-ssd-int8", mobilenet_ssd_int8_init, mobilenet_ssd_run); | |||
| benchmark("mobilenet-yolo", mobilenet_yolo_init, mobilenet_yolo_run); | |||
| benchmark("mobilenet-yolov3", mobilenet_yolov3_init, mobilenet_yolov3_run); | |||
| @@ -0,0 +1,154 @@ | |||
| 7767517 | |||
| 152 179 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2 | |||
| ReLU conv1/relu_7x7 1 1 conv1/7x7_s2 conv1/7x7_s2_conv1/relu_7x7 | |||
| Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 0=0 1=3 2=2 3=0 4=0 | |||
| LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 0=0 1=5 2=0.000100 3=0.750000 | |||
| Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce 0=64 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU conv2/relu_3x3_reduce 1 1 conv2/3x3_reduce conv2/3x3_reduce_conv2/relu_3x3_reduce | |||
| Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 | |||
| ReLU conv2/relu_3x3 1 1 conv2/3x3 conv2/3x3_conv2/relu_3x3 | |||
| LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 0=0 1=5 2=0.000100 3=0.750000 | |||
| Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 | |||
| Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| ReLU inception_3a/relu_1x1 1 1 inception_3a/1x1 inception_3a/1x1_inception_3a/relu_1x1 | |||
| Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 | |||
| ReLU inception_3a/relu_3x3_reduce 1 1 inception_3a/3x3_reduce inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce | |||
| Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 | |||
| ReLU inception_3a/relu_3x3 1 1 inception_3a/3x3 inception_3a/3x3_inception_3a/relu_3x3 | |||
| Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=3072 8=2 | |||
| ReLU inception_3a/relu_5x5_reduce 1 1 inception_3a/5x5_reduce inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce | |||
| Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5 0=32 1=5 2=1 3=1 4=2 5=1 6=12800 8=2 | |||
| ReLU inception_3a/relu_5x5 1 1 inception_3a/5x5 inception_3a/5x5_inception_3a/relu_5x5 | |||
| Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj 0=32 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 | |||
| ReLU inception_3a/relu_pool_proj 1 1 inception_3a/pool_proj inception_3a/pool_proj_inception_3a/relu_pool_proj | |||
| Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output 0=0 | |||
| Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 | |||
| Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU inception_3b/relu_1x1 1 1 inception_3b/1x1 inception_3b/1x1_inception_3b/relu_1x1 | |||
| Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU inception_3b/relu_3x3_reduce 1 1 inception_3b/3x3_reduce inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce | |||
| Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=221184 8=2 | |||
| ReLU inception_3b/relu_3x3 1 1 inception_3b/3x3 inception_3b/3x3_inception_3b/relu_3x3 | |||
| Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 | |||
| ReLU inception_3b/relu_5x5_reduce 1 1 inception_3b/5x5_reduce inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce | |||
| Convolution inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5 0=96 1=5 2=1 3=1 4=2 5=1 6=76800 8=2 | |||
| ReLU inception_3b/relu_5x5 1 1 inception_3b/5x5 inception_3b/5x5_inception_3b/relu_5x5 | |||
| Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU inception_3b/relu_pool_proj 1 1 inception_3b/pool_proj inception_3b/pool_proj_inception_3b/relu_pool_proj | |||
| Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output 0=0 | |||
| Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 | |||
| Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=92160 8=2 | |||
| ReLU inception_4a/relu_1x1 1 1 inception_4a/1x1 inception_4a/1x1_inception_4a/relu_1x1 | |||
| Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=46080 8=2 | |||
| ReLU inception_4a/relu_3x3_reduce 1 1 inception_4a/3x3_reduce inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce | |||
| Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3 0=208 1=3 2=1 3=1 4=1 5=1 6=179712 8=2 | |||
| ReLU inception_4a/relu_3x3 1 1 inception_4a/3x3 inception_4a/3x3_inception_4a/relu_3x3 | |||
| Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=7680 8=2 | |||
| ReLU inception_4a/relu_5x5_reduce 1 1 inception_4a/5x5_reduce inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce | |||
| Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5 0=48 1=5 2=1 3=1 4=2 5=1 6=19200 8=2 | |||
| ReLU inception_4a/relu_5x5 1 1 inception_4a/5x5 inception_4a/5x5_inception_4a/relu_5x5 | |||
| Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=30720 8=2 | |||
| ReLU inception_4a/relu_pool_proj 1 1 inception_4a/pool_proj inception_4a/pool_proj_inception_4a/relu_pool_proj | |||
| Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output 0=0 | |||
| Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 | |||
| Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1 0=160 1=1 2=1 3=1 4=0 5=1 6=81920 8=2 | |||
| ReLU inception_4b/relu_1x1 1 1 inception_4b/1x1 inception_4b/1x1_inception_4b/relu_1x1 | |||
| Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2 | |||
| ReLU inception_4b/relu_3x3_reduce 1 1 inception_4b/3x3_reduce inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce | |||
| Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3 0=224 1=3 2=1 3=1 4=1 5=1 6=225792 8=2 | |||
| ReLU inception_4b/relu_3x3 1 1 inception_4b/3x3 inception_4b/3x3_inception_4b/relu_3x3 | |||
| Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| ReLU inception_4b/relu_5x5_reduce 1 1 inception_4b/5x5_reduce inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce | |||
| Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2 | |||
| ReLU inception_4b/relu_5x5 1 1 inception_4b/5x5 inception_4b/5x5_inception_4b/relu_5x5 | |||
| Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU inception_4b/relu_pool_proj 1 1 inception_4b/pool_proj inception_4b/pool_proj_inception_4b/relu_pool_proj | |||
| Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output 0=0 | |||
| Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 | |||
| Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 | |||
| ReLU inception_4c/relu_1x1 1 1 inception_4c/1x1 inception_4c/1x1_inception_4c/relu_1x1 | |||
| Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 | |||
| ReLU inception_4c/relu_3x3_reduce 1 1 inception_4c/3x3_reduce inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce | |||
| Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2 | |||
| ReLU inception_4c/relu_3x3 1 1 inception_4c/3x3 inception_4c/3x3_inception_4c/relu_3x3 | |||
| Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| ReLU inception_4c/relu_5x5_reduce 1 1 inception_4c/5x5_reduce inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce | |||
| Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2 | |||
| ReLU inception_4c/relu_5x5 1 1 inception_4c/5x5 inception_4c/5x5_inception_4c/relu_5x5 | |||
| Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU inception_4c/relu_pool_proj 1 1 inception_4c/pool_proj inception_4c/pool_proj_inception_4c/relu_pool_proj | |||
| Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output 0=0 | |||
| Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 | |||
| Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2 | |||
| ReLU inception_4d/relu_1x1 1 1 inception_4d/1x1 inception_4d/1x1_inception_4d/relu_1x1 | |||
| Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce 0=144 1=1 2=1 3=1 4=0 5=1 6=73728 8=2 | |||
| ReLU inception_4d/relu_3x3_reduce 1 1 inception_4d/3x3_reduce inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce | |||
| Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3 0=288 1=3 2=1 3=1 4=1 5=1 6=373248 8=2 | |||
| ReLU inception_4d/relu_3x3 1 1 inception_4d/3x3 inception_4d/3x3_inception_4d/relu_3x3 | |||
| Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU inception_4d/relu_5x5_reduce 1 1 inception_4d/5x5_reduce inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce | |||
| Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=51200 8=2 | |||
| ReLU inception_4d/relu_5x5 1 1 inception_4d/5x5 inception_4d/5x5_inception_4d/relu_5x5 | |||
| Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU inception_4d/relu_pool_proj 1 1 inception_4d/pool_proj inception_4d/pool_proj_inception_4d/relu_pool_proj | |||
| Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output 0=0 | |||
| Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 | |||
| Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=135168 8=2 | |||
| ReLU inception_4e/relu_1x1 1 1 inception_4e/1x1 inception_4e/1x1_inception_4e/relu_1x1 | |||
| Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=84480 8=2 | |||
| ReLU inception_4e/relu_3x3_reduce 1 1 inception_4e/3x3_reduce inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce | |||
| Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2 | |||
| ReLU inception_4e/relu_3x3 1 1 inception_4e/3x3 inception_4e/3x3_inception_4e/relu_3x3 | |||
| Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16896 8=2 | |||
| ReLU inception_4e/relu_5x5_reduce 1 1 inception_4e/5x5_reduce inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce | |||
| Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2 | |||
| ReLU inception_4e/relu_5x5 1 1 inception_4e/5x5 inception_4e/5x5_inception_4e/relu_5x5 | |||
| Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=67584 8=2 | |||
| ReLU inception_4e/relu_pool_proj 1 1 inception_4e/pool_proj inception_4e/pool_proj_inception_4e/relu_pool_proj | |||
| Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output 0=0 | |||
| Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 | |||
| Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=212992 8=2 | |||
| ReLU inception_5a/relu_1x1 1 1 inception_5a/1x1 inception_5a/1x1_inception_5a/relu_1x1 | |||
| Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=133120 8=2 | |||
| ReLU inception_5a/relu_3x3_reduce 1 1 inception_5a/3x3_reduce inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce | |||
| Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2 | |||
| ReLU inception_5a/relu_3x3 1 1 inception_5a/3x3 inception_5a/3x3_inception_5a/relu_3x3 | |||
| Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=26624 8=2 | |||
| ReLU inception_5a/relu_5x5_reduce 1 1 inception_5a/5x5_reduce inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce | |||
| Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2 | |||
| ReLU inception_5a/relu_5x5 1 1 inception_5a/5x5 inception_5a/5x5_inception_5a/relu_5x5 | |||
| Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2 | |||
| ReLU inception_5a/relu_pool_proj 1 1 inception_5a/pool_proj inception_5a/pool_proj_inception_5a/relu_pool_proj | |||
| Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output 0=0 | |||
| Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 | |||
| Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=319488 8=2 | |||
| ReLU inception_5b/relu_1x1 1 1 inception_5b/1x1 inception_5b/1x1_inception_5b/relu_1x1 | |||
| Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce 0=192 1=1 2=1 3=1 4=0 5=1 6=159744 8=2 | |||
| ReLU inception_5b/relu_3x3_reduce 1 1 inception_5b/3x3_reduce inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce | |||
| Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=663552 8=2 | |||
| ReLU inception_5b/relu_3x3 1 1 inception_5b/3x3 inception_5b/3x3_inception_5b/relu_3x3 | |||
| Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce 0=48 1=1 2=1 3=1 4=0 5=1 6=39936 8=2 | |||
| ReLU inception_5b/relu_5x5_reduce 1 1 inception_5b/5x5_reduce inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce | |||
| Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=153600 8=2 | |||
| ReLU inception_5b/relu_5x5 1 1 inception_5b/5x5 inception_5b/5x5_inception_5b/relu_5x5 | |||
| Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 0=0 1=3 2=1 3=1 4=0 | |||
| Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2 | |||
| ReLU inception_5b/relu_pool_proj 1 1 inception_5b/pool_proj inception_5b/pool_proj_inception_5b/relu_pool_proj | |||
| Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output 0=0 | |||
| Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1 0=1 1=7 2=1 3=0 4=0 | |||
| Dropout pool5/drop_7x7_s1 1 1 pool5/7x7_s1 pool5/7x7_s1_pool5/drop_7x7_s1 | |||
| InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000 | |||
| Softmax prob 1 1 loss3/classifier prob 0=0 | |||
| @@ -0,0 +1,114 @@ | |||
| 7767517 | |||
| 112 112 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1 1 1 data conv1 0=32 1=3 2=1 3=2 4=1 5=0 6=864 8=2 | |||
| BatchNorm conv1/bn 1 1 conv1 conv1_conv1/bn 0=32 | |||
| Scale conv1/scale 1 1 conv1_conv1/bn conv1_conv1/scale 0=32 1=1 | |||
| ReLU relu1 1 1 conv1_conv1/scale conv1_relu1 | |||
| ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw 0=32 1=3 2=1 3=1 4=1 5=0 6=288 7=32 8=1 | |||
| BatchNorm conv2_1/dw/bn 1 1 conv2_1/dw conv2_1/dw_conv2_1/dw/bn 0=32 | |||
| Scale conv2_1/dw/scale 1 1 conv2_1/dw_conv2_1/dw/bn conv2_1/dw_conv2_1/dw/scale 0=32 1=1 | |||
| ReLU relu2_1/dw 1 1 conv2_1/dw_conv2_1/dw/scale conv2_1/dw_relu2_1/dw | |||
| Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep 0=64 1=1 2=1 3=1 4=0 5=0 6=2048 8=2 | |||
| BatchNorm conv2_1/sep/bn 1 1 conv2_1/sep conv2_1/sep_conv2_1/sep/bn 0=64 | |||
| Scale conv2_1/sep/scale 1 1 conv2_1/sep_conv2_1/sep/bn conv2_1/sep_conv2_1/sep/scale 0=64 1=1 | |||
| ReLU relu2_1/sep 1 1 conv2_1/sep_conv2_1/sep/scale conv2_1/sep_relu2_1/sep | |||
| ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw 0=64 1=3 2=1 3=2 4=1 5=0 6=576 7=64 8=1 | |||
| BatchNorm conv2_2/dw/bn 1 1 conv2_2/dw conv2_2/dw_conv2_2/dw/bn 0=64 | |||
| Scale conv2_2/dw/scale 1 1 conv2_2/dw_conv2_2/dw/bn conv2_2/dw_conv2_2/dw/scale 0=64 1=1 | |||
| ReLU relu2_2/dw 1 1 conv2_2/dw_conv2_2/dw/scale conv2_2/dw_relu2_2/dw | |||
| Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=8192 8=2 | |||
| BatchNorm conv2_2/sep/bn 1 1 conv2_2/sep conv2_2/sep_conv2_2/sep/bn 0=128 | |||
| Scale conv2_2/sep/scale 1 1 conv2_2/sep_conv2_2/sep/bn conv2_2/sep_conv2_2/sep/scale 0=128 1=1 | |||
| ReLU relu2_2/sep 1 1 conv2_2/sep_conv2_2/sep/scale conv2_2/sep_relu2_2/sep | |||
| ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw 0=128 1=3 2=1 3=1 4=1 5=0 6=1152 7=128 8=1 | |||
| BatchNorm conv3_1/dw/bn 1 1 conv3_1/dw conv3_1/dw_conv3_1/dw/bn 0=128 | |||
| Scale conv3_1/dw/scale 1 1 conv3_1/dw_conv3_1/dw/bn conv3_1/dw_conv3_1/dw/scale 0=128 1=1 | |||
| ReLU relu3_1/dw 1 1 conv3_1/dw_conv3_1/dw/scale conv3_1/dw_relu3_1/dw | |||
| Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm conv3_1/sep/bn 1 1 conv3_1/sep conv3_1/sep_conv3_1/sep/bn 0=128 | |||
| Scale conv3_1/sep/scale 1 1 conv3_1/sep_conv3_1/sep/bn conv3_1/sep_conv3_1/sep/scale 0=128 1=1 | |||
| ReLU relu3_1/sep 1 1 conv3_1/sep_conv3_1/sep/scale conv3_1/sep_relu3_1/sep | |||
| ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw 0=128 1=3 2=1 3=2 4=1 5=0 6=1152 7=128 8=1 | |||
| BatchNorm conv3_2/dw/bn 1 1 conv3_2/dw conv3_2/dw_conv3_2/dw/bn 0=128 | |||
| Scale conv3_2/dw/scale 1 1 conv3_2/dw_conv3_2/dw/bn conv3_2/dw_conv3_2/dw/scale 0=128 1=1 | |||
| ReLU relu3_2/dw 1 1 conv3_2/dw_conv3_2/dw/scale conv3_2/dw_relu3_2/dw | |||
| Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=32768 8=2 | |||
| BatchNorm conv3_2/sep/bn 1 1 conv3_2/sep conv3_2/sep_conv3_2/sep/bn 0=256 | |||
| Scale conv3_2/sep/scale 1 1 conv3_2/sep_conv3_2/sep/bn conv3_2/sep_conv3_2/sep/scale 0=256 1=1 | |||
| ReLU relu3_2/sep 1 1 conv3_2/sep_conv3_2/sep/scale conv3_2/sep_relu3_2/sep | |||
| ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw 0=256 1=3 2=1 3=1 4=1 5=0 6=2304 7=256 8=1 | |||
| BatchNorm conv4_1/dw/bn 1 1 conv4_1/dw conv4_1/dw_conv4_1/dw/bn 0=256 | |||
| Scale conv4_1/dw/scale 1 1 conv4_1/dw_conv4_1/dw/bn conv4_1/dw_conv4_1/dw/scale 0=256 1=1 | |||
| ReLU relu4_1/dw 1 1 conv4_1/dw_conv4_1/dw/scale conv4_1/dw_relu4_1/dw | |||
| Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm conv4_1/sep/bn 1 1 conv4_1/sep conv4_1/sep_conv4_1/sep/bn 0=256 | |||
| Scale conv4_1/sep/scale 1 1 conv4_1/sep_conv4_1/sep/bn conv4_1/sep_conv4_1/sep/scale 0=256 1=1 | |||
| ReLU relu4_1/sep 1 1 conv4_1/sep_conv4_1/sep/scale conv4_1/sep_relu4_1/sep | |||
| ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw 0=256 1=3 2=1 3=2 4=1 5=0 6=2304 7=256 8=1 | |||
| BatchNorm conv4_2/dw/bn 1 1 conv4_2/dw conv4_2/dw_conv4_2/dw/bn 0=256 | |||
| Scale conv4_2/dw/scale 1 1 conv4_2/dw_conv4_2/dw/bn conv4_2/dw_conv4_2/dw/scale 0=256 1=1 | |||
| ReLU relu4_2/dw 1 1 conv4_2/dw_conv4_2/dw/scale conv4_2/dw_relu4_2/dw | |||
| Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=131072 8=2 | |||
| BatchNorm conv4_2/sep/bn 1 1 conv4_2/sep conv4_2/sep_conv4_2/sep/bn 0=512 | |||
| Scale conv4_2/sep/scale 1 1 conv4_2/sep_conv4_2/sep/bn conv4_2/sep_conv4_2/sep/scale 0=512 1=1 | |||
| ReLU relu4_2/sep 1 1 conv4_2/sep_conv4_2/sep/scale conv4_2/sep_relu4_2/sep | |||
| ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_1/dw/bn 1 1 conv5_1/dw conv5_1/dw_conv5_1/dw/bn 0=512 | |||
| Scale conv5_1/dw/scale 1 1 conv5_1/dw_conv5_1/dw/bn conv5_1/dw_conv5_1/dw/scale 0=512 1=1 | |||
| ReLU relu5_1/dw 1 1 conv5_1/dw_conv5_1/dw/scale conv5_1/dw_relu5_1/dw | |||
| Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm conv5_1/sep/bn 1 1 conv5_1/sep conv5_1/sep_conv5_1/sep/bn 0=512 | |||
| Scale conv5_1/sep/scale 1 1 conv5_1/sep_conv5_1/sep/bn conv5_1/sep_conv5_1/sep/scale 0=512 1=1 | |||
| ReLU relu5_1/sep 1 1 conv5_1/sep_conv5_1/sep/scale conv5_1/sep_relu5_1/sep | |||
| ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_2/dw/bn 1 1 conv5_2/dw conv5_2/dw_conv5_2/dw/bn 0=512 | |||
| Scale conv5_2/dw/scale 1 1 conv5_2/dw_conv5_2/dw/bn conv5_2/dw_conv5_2/dw/scale 0=512 1=1 | |||
| ReLU relu5_2/dw 1 1 conv5_2/dw_conv5_2/dw/scale conv5_2/dw_relu5_2/dw | |||
| Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm conv5_2/sep/bn 1 1 conv5_2/sep conv5_2/sep_conv5_2/sep/bn 0=512 | |||
| Scale conv5_2/sep/scale 1 1 conv5_2/sep_conv5_2/sep/bn conv5_2/sep_conv5_2/sep/scale 0=512 1=1 | |||
| ReLU relu5_2/sep 1 1 conv5_2/sep_conv5_2/sep/scale conv5_2/sep_relu5_2/sep | |||
| ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_3/dw/bn 1 1 conv5_3/dw conv5_3/dw_conv5_3/dw/bn 0=512 | |||
| Scale conv5_3/dw/scale 1 1 conv5_3/dw_conv5_3/dw/bn conv5_3/dw_conv5_3/dw/scale 0=512 1=1 | |||
| ReLU relu5_3/dw 1 1 conv5_3/dw_conv5_3/dw/scale conv5_3/dw_relu5_3/dw | |||
| Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm conv5_3/sep/bn 1 1 conv5_3/sep conv5_3/sep_conv5_3/sep/bn 0=512 | |||
| Scale conv5_3/sep/scale 1 1 conv5_3/sep_conv5_3/sep/bn conv5_3/sep_conv5_3/sep/scale 0=512 1=1 | |||
| ReLU relu5_3/sep 1 1 conv5_3/sep_conv5_3/sep/scale conv5_3/sep_relu5_3/sep | |||
| ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_4/dw/bn 1 1 conv5_4/dw conv5_4/dw_conv5_4/dw/bn 0=512 | |||
| Scale conv5_4/dw/scale 1 1 conv5_4/dw_conv5_4/dw/bn conv5_4/dw_conv5_4/dw/scale 0=512 1=1 | |||
| ReLU relu5_4/dw 1 1 conv5_4/dw_conv5_4/dw/scale conv5_4/dw_relu5_4/dw | |||
| Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm conv5_4/sep/bn 1 1 conv5_4/sep conv5_4/sep_conv5_4/sep/bn 0=512 | |||
| Scale conv5_4/sep/scale 1 1 conv5_4/sep_conv5_4/sep/bn conv5_4/sep_conv5_4/sep/scale 0=512 1=1 | |||
| ReLU relu5_4/sep 1 1 conv5_4/sep_conv5_4/sep/scale conv5_4/sep_relu5_4/sep | |||
| ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_5/dw/bn 1 1 conv5_5/dw conv5_5/dw_conv5_5/dw/bn 0=512 | |||
| Scale conv5_5/dw/scale 1 1 conv5_5/dw_conv5_5/dw/bn conv5_5/dw_conv5_5/dw/scale 0=512 1=1 | |||
| ReLU relu5_5/dw 1 1 conv5_5/dw_conv5_5/dw/scale conv5_5/dw_relu5_5/dw | |||
| Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm conv5_5/sep/bn 1 1 conv5_5/sep conv5_5/sep_conv5_5/sep/bn 0=512 | |||
| Scale conv5_5/sep/scale 1 1 conv5_5/sep_conv5_5/sep/bn conv5_5/sep_conv5_5/sep/scale 0=512 1=1 | |||
| ReLU relu5_5/sep 1 1 conv5_5/sep_conv5_5/sep/scale conv5_5/sep_relu5_5/sep | |||
| ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw 0=512 1=3 2=1 3=2 4=1 5=0 6=4608 7=512 8=1 | |||
| BatchNorm conv5_6/dw/bn 1 1 conv5_6/dw conv5_6/dw_conv5_6/dw/bn 0=512 | |||
| Scale conv5_6/dw/scale 1 1 conv5_6/dw_conv5_6/dw/bn conv5_6/dw_conv5_6/dw/scale 0=512 1=1 | |||
| ReLU relu5_6/dw 1 1 conv5_6/dw_conv5_6/dw/scale conv5_6/dw_relu5_6/dw | |||
| Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=524288 8=2 | |||
| BatchNorm conv5_6/sep/bn 1 1 conv5_6/sep conv5_6/sep_conv5_6/sep/bn 0=1024 | |||
| Scale conv5_6/sep/scale 1 1 conv5_6/sep_conv5_6/sep/bn conv5_6/sep_conv5_6/sep/scale 0=1024 1=1 | |||
| ReLU relu5_6/sep 1 1 conv5_6/sep_conv5_6/sep/scale conv5_6/sep_relu5_6/sep | |||
| ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw 0=1024 1=3 2=1 3=1 4=1 5=0 6=9216 7=1024 8=1 | |||
| BatchNorm conv6/dw/bn 1 1 conv6/dw conv6/dw_conv6/dw/bn 0=1024 | |||
| Scale conv6/dw/scale 1 1 conv6/dw_conv6/dw/bn conv6/dw_conv6/dw/scale 0=1024 1=1 | |||
| ReLU relu6/dw 1 1 conv6/dw_conv6/dw/scale conv6/dw_relu6/dw | |||
| Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm conv6/sep/bn 1 1 conv6/sep conv6/sep_conv6/sep/bn 0=1024 | |||
| Scale conv6/sep/scale 1 1 conv6/sep_conv6/sep/bn conv6/sep_conv6/sep/scale 0=1024 1=1 | |||
| ReLU relu6/sep 1 1 conv6/sep_conv6/sep/scale conv6/sep_relu6/sep | |||
| Pooling pool6 1 1 conv6/sep_relu6/sep pool6 0=1 1=0 2=1 3=0 4=1 | |||
| Convolution fc7 1 1 pool6 fc7 0=1000 1=1 2=1 3=1 4=0 5=1 6=1024000 8=2 | |||
| Softmax prob 1 1 fc7 prob 0=0 | |||
| @@ -0,0 +1,129 @@ | |||
| 7767517 | |||
| 127 150 | |||
| Input data 0 1 data 0=300 1=300 2=3 | |||
| Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 | |||
| Convolution conv0 1 1 data_splitncnn_6 conv0 0=32 1=3 2=1 3=2 4=1 5=1 6=864 8=2 | |||
| ReLU conv0/relu 1 1 conv0 conv0_conv0/relu | |||
| ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw 0=32 1=3 2=1 3=1 4=1 5=1 6=288 7=32 8=1 | |||
| ReLU conv1/dw/relu 1 1 conv1/dw conv1/dw_conv1/dw/relu | |||
| Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1 0=64 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 | |||
| ReLU conv1/relu 1 1 conv1 conv1_conv1/relu | |||
| ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw 0=64 1=3 2=1 3=2 4=1 5=1 6=576 7=64 8=1 | |||
| ReLU conv2/dw/relu 1 1 conv2/dw conv2/dw_conv2/dw/relu | |||
| Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2 0=128 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 | |||
| ReLU conv2/relu 1 1 conv2 conv2_conv2/relu | |||
| ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw 0=128 1=3 2=1 3=1 4=1 5=1 6=1152 7=128 8=1 | |||
| ReLU conv3/dw/relu 1 1 conv3/dw conv3/dw_conv3/dw/relu | |||
| Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3 0=128 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU conv3/relu 1 1 conv3 conv3_conv3/relu | |||
| ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw 0=128 1=3 2=1 3=2 4=1 5=1 6=1152 7=128 8=1 | |||
| ReLU conv4/dw/relu 1 1 conv4/dw conv4/dw_conv4/dw/relu | |||
| Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4 0=256 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU conv4/relu 1 1 conv4 conv4_conv4/relu | |||
| ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw 0=256 1=3 2=1 3=1 4=1 5=1 6=2304 7=256 8=1 | |||
| ReLU conv5/dw/relu 1 1 conv5/dw conv5/dw_conv5/dw/relu | |||
| Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5 0=256 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 | |||
| ReLU conv5/relu 1 1 conv5 conv5_conv5/relu | |||
| ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw 0=256 1=3 2=1 3=2 4=1 5=1 6=2304 7=256 8=1 | |||
| ReLU conv6/dw/relu 1 1 conv6/dw conv6/dw_conv6/dw/relu | |||
| Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6 0=512 1=1 2=1 3=1 4=0 5=1 6=131072 8=2 | |||
| ReLU conv6/relu 1 1 conv6 conv6_conv6/relu | |||
| ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv7/dw/relu 1 1 conv7/dw conv7/dw_conv7/dw/relu | |||
| Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv7/relu 1 1 conv7 conv7_conv7/relu | |||
| ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv8/dw/relu 1 1 conv8/dw conv8/dw_conv8/dw/relu | |||
| Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv8/relu 1 1 conv8 conv8_conv8/relu | |||
| ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv9/dw/relu 1 1 conv9/dw conv9/dw_conv9/dw/relu | |||
| Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv9/relu 1 1 conv9 conv9_conv9/relu | |||
| ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv10/dw/relu 1 1 conv10/dw conv10/dw_conv10/dw/relu | |||
| Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv10/relu 1 1 conv10 conv10_conv10/relu | |||
| ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv11/dw/relu 1 1 conv11/dw conv11/dw_conv11/dw/relu | |||
| Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv11/relu 1 1 conv11 conv11_conv11/relu | |||
| Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 | |||
| ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw 0=512 1=3 2=1 3=2 4=1 5=1 6=4608 7=512 8=1 | |||
| ReLU conv12/dw/relu 1 1 conv12/dw conv12/dw_conv12/dw/relu | |||
| Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12 0=1024 1=1 2=1 3=1 4=0 5=1 6=524288 8=2 | |||
| ReLU conv12/relu 1 1 conv12 conv12_conv12/relu | |||
| ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw 0=1024 1=3 2=1 3=1 4=1 5=1 6=9216 7=1024 8=1 | |||
| ReLU conv13/dw/relu 1 1 conv13/dw conv13/dw_conv13/dw/relu | |||
| Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13 0=1024 1=1 2=1 3=1 4=0 5=1 6=1048576 8=2 | |||
| ReLU conv13/relu 1 1 conv13 conv13_conv13/relu | |||
| Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 | |||
| Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1 0=256 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 | |||
| ReLU conv14_1/relu 1 1 conv14_1 conv14_1_conv14_1/relu | |||
| Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2 0=512 1=3 2=1 3=2 4=1 5=1 6=1179648 8=2 | |||
| ReLU conv14_2/relu 1 1 conv14_2 conv14_2_conv14_2/relu | |||
| Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 | |||
| Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 | |||
| ReLU conv15_1/relu 1 1 conv15_1 conv15_1_conv15_1/relu | |||
| Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2 | |||
| ReLU conv15_2/relu 1 1 conv15_2 conv15_2_conv15_2/relu | |||
| Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 | |||
| Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU conv16_1/relu 1 1 conv16_1 conv16_1_conv16_1/relu | |||
| Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2 | |||
| ReLU conv16_2/relu 1 1 conv16_2 conv16_2_conv16_2/relu | |||
| Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 | |||
| Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU conv17_1/relu 1 1 conv17_1 conv17_1_conv17_1/relu | |||
| Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2 0=128 1=3 2=1 3=2 4=1 5=1 6=73728 8=2 | |||
| ReLU conv17_2/relu 1 1 conv17_2 conv17_2_conv17_2/relu | |||
| Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 | |||
| Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 | |||
| Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3 | |||
| Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat | |||
| Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 | |||
| Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3 | |||
| Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat | |||
| PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23301=0 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 | |||
| Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3 | |||
| Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat | |||
| Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=129024 8=2 | |||
| Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3 | |||
| Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat | |||
| PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3 | |||
| Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat | |||
| Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=64512 8=2 | |||
| Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3 | |||
| Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat | |||
| PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 | |||
| Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3 | |||
| Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat | |||
| Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 | |||
| Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3 | |||
| Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat | |||
| PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 | |||
| Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3 | |||
| Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat | |||
| Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 | |||
| Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3 | |||
| Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat | |||
| PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=3072 8=2 | |||
| Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3 | |||
| Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat | |||
| Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=16128 8=2 | |||
| Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3 | |||
| Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat | |||
| PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 | |||
| Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc 0=0 | |||
| Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf 0=0 | |||
| Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1 | |||
| Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0 | |||
| Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 | |||
| Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten | |||
| DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.250000 | |||
| @@ -0,0 +1,103 @@ | |||
| 7767517 | |||
| 101 109 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=0 6=9408 8=2 | |||
| BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 | |||
| Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 | |||
| ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu | |||
| Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 | |||
| Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2 | |||
| BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=64 | |||
| Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=64 1=1 | |||
| Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 | |||
| Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 | |||
| ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu | |||
| Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 | |||
| Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 | |||
| Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -23301=0 | |||
| ReLU res2a_relu 1 1 res2a res2a_res2a_relu | |||
| Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 | |||
| Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 | |||
| Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 | |||
| ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu | |||
| Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 | |||
| Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 | |||
| Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -23301=0 | |||
| ReLU res2b_relu 1 1 res2b res2b_res2b_relu | |||
| Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 | |||
| Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1 0=128 1=1 2=1 3=2 4=0 5=0 6=8192 8=2 | |||
| BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=128 | |||
| Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=128 1=1 | |||
| Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2 | |||
| BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 | |||
| Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 | |||
| ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu | |||
| Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 | |||
| Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 | |||
| Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -23301=0 | |||
| ReLU res3a_relu 1 1 res3a res3a_res3a_relu | |||
| Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 | |||
| Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 | |||
| Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 | |||
| ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu | |||
| Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 | |||
| Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 | |||
| Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -23301=0 | |||
| ReLU res3b_relu 1 1 res3b res3b_res3b_relu | |||
| Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 | |||
| Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1 0=256 1=1 2=1 3=2 4=0 5=0 6=32768 8=2 | |||
| BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=256 | |||
| Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=256 1=1 | |||
| Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2 | |||
| BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 | |||
| Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 | |||
| ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu | |||
| Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 | |||
| Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 | |||
| Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -23301=0 | |||
| ReLU res4a_relu 1 1 res4a res4a_res4a_relu | |||
| Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 | |||
| Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 | |||
| Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 | |||
| ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu | |||
| Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 | |||
| Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 | |||
| Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -23301=0 | |||
| ReLU res4b_relu 1 1 res4b res4b_res4b_relu | |||
| Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 | |||
| Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 | |||
| BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=512 | |||
| Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=512 1=1 | |||
| Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a 0=512 1=3 2=1 3=2 4=1 5=0 6=1179648 8=2 | |||
| BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 | |||
| Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 | |||
| ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu | |||
| Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 | |||
| Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 | |||
| Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -23301=0 | |||
| ReLU res5a_relu 1 1 res5a res5a_res5a_relu | |||
| Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 | |||
| Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 | |||
| Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 | |||
| ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu | |||
| Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 | |||
| Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 | |||
| Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -23301=0 | |||
| ReLU res5b_relu 1 1 res5b res5b_res5b_relu | |||
| Pooling pool5 1 1 res5b_res5b_relu pool5 0=1 1=7 2=1 3=0 4=0 | |||
| InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=512000 | |||
| Softmax prob 1 1 fc1000 prob 0=0 | |||
| @@ -0,0 +1,247 @@ | |||
| 7767517 | |||
| 245 261 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 | |||
| BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 | |||
| Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 | |||
| ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu | |||
| Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 | |||
| Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256 | |||
| Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1 | |||
| Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 | |||
| BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 | |||
| Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 | |||
| ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu | |||
| Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 | |||
| BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 | |||
| Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 | |||
| ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu | |||
| Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256 | |||
| Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1 | |||
| Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0 | |||
| ReLU res2a_relu 1 1 res2a res2a_res2a_relu | |||
| Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 | |||
| Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 | |||
| Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 | |||
| ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu | |||
| Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 | |||
| BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 | |||
| Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 | |||
| ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu | |||
| Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256 | |||
| Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1 | |||
| Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0 | |||
| ReLU res2b_relu 1 1 res2b res2b_res2b_relu | |||
| Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 | |||
| Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64 | |||
| Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1 | |||
| ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu | |||
| Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 | |||
| BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64 | |||
| Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1 | |||
| ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu | |||
| Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 | |||
| BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256 | |||
| Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1 | |||
| Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0 | |||
| ReLU res2c_relu 1 1 res2c res2c_res2c_relu | |||
| Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 | |||
| Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 | |||
| BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512 | |||
| Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1 | |||
| Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 | |||
| BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 | |||
| Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 | |||
| ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu | |||
| Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 | |||
| BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 | |||
| Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 | |||
| ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu | |||
| Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512 | |||
| Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1 | |||
| Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0 | |||
| ReLU res3a_relu 1 1 res3a res3a_res3a_relu | |||
| Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 | |||
| Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 | |||
| Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 | |||
| ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu | |||
| Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 | |||
| BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 | |||
| Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 | |||
| ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu | |||
| Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512 | |||
| Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1 | |||
| Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0 | |||
| ReLU res3b_relu 1 1 res3b res3b_res3b_relu | |||
| Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 | |||
| Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128 | |||
| Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1 | |||
| ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu | |||
| Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 | |||
| BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128 | |||
| Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1 | |||
| ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu | |||
| Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512 | |||
| Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1 | |||
| Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0 | |||
| ReLU res3c_relu 1 1 res3c res3c_res3c_relu | |||
| Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 | |||
| Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128 | |||
| Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1 | |||
| ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu | |||
| Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 | |||
| BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128 | |||
| Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1 | |||
| ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu | |||
| Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 | |||
| BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512 | |||
| Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1 | |||
| Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0 | |||
| ReLU res3d_relu 1 1 res3d res3d_res3d_relu | |||
| Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 | |||
| Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 | |||
| BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024 | |||
| Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1 | |||
| Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 | |||
| BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 | |||
| Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 | |||
| ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu | |||
| Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 | |||
| Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 | |||
| ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu | |||
| Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024 | |||
| Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1 | |||
| Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0 | |||
| ReLU res4a_relu 1 1 res4a res4a_res4a_relu | |||
| Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 | |||
| Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 | |||
| Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 | |||
| ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu | |||
| Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 | |||
| Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 | |||
| ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu | |||
| Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024 | |||
| Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1 | |||
| Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0 | |||
| ReLU res4b_relu 1 1 res4b res4b_res4b_relu | |||
| Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 | |||
| Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256 | |||
| Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1 | |||
| ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu | |||
| Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256 | |||
| Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1 | |||
| ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu | |||
| Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024 | |||
| Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1 | |||
| Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0 | |||
| ReLU res4c_relu 1 1 res4c res4c_res4c_relu | |||
| Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 | |||
| Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256 | |||
| Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1 | |||
| ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu | |||
| Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256 | |||
| Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1 | |||
| ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu | |||
| Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024 | |||
| Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1 | |||
| Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0 | |||
| ReLU res4d_relu 1 1 res4d res4d_res4d_relu | |||
| Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 | |||
| Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256 | |||
| Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1 | |||
| ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu | |||
| Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256 | |||
| Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1 | |||
| ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu | |||
| Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024 | |||
| Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1 | |||
| Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0 | |||
| ReLU res4e_relu 1 1 res4e res4e_res4e_relu | |||
| Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 | |||
| Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256 | |||
| Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1 | |||
| ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu | |||
| Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 | |||
| BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256 | |||
| Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1 | |||
| ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu | |||
| Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 | |||
| BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024 | |||
| Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1 | |||
| Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0 | |||
| ReLU res4f_relu 1 1 res4f res4f_res4f_relu | |||
| Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 | |||
| Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 | |||
| BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048 | |||
| Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1 | |||
| Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 | |||
| BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 | |||
| Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 | |||
| ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu | |||
| Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 | |||
| BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 | |||
| Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 | |||
| ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu | |||
| Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 | |||
| BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048 | |||
| Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1 | |||
| Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0 | |||
| ReLU res5a_relu 1 1 res5a res5a_res5a_relu | |||
| Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 | |||
| Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 | |||
| BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 | |||
| Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 | |||
| ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu | |||
| Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 | |||
| BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 | |||
| Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 | |||
| ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu | |||
| Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 | |||
| BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048 | |||
| Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1 | |||
| Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0 | |||
| ReLU res5b_relu 1 1 res5b res5b_res5b_relu | |||
| Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 | |||
| Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 | |||
| BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512 | |||
| Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1 | |||
| ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu | |||
| Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 | |||
| BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512 | |||
| Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1 | |||
| ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu | |||
| Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 | |||
| BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048 | |||
| Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1 | |||
| Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0 | |||
| ReLU res5c_relu 1 1 res5c res5c_res5c_relu | |||
| Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0 | |||
| InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 | |||
| Softmax prob 1 1 fc1000 prob 0=0 | |||
| @@ -0,0 +1,247 @@ | |||
| 7767517 | |||
| 245 261 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2 | |||
| BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 | |||
| Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 | |||
| ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu | |||
| Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 | |||
| Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 | |||
| Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256 | |||
| Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1 | |||
| Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2 | |||
| BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 | |||
| Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 | |||
| ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu | |||
| Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 | |||
| Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 | |||
| ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu | |||
| Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256 | |||
| Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1 | |||
| Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0 | |||
| ReLU res2a_relu 1 1 res2a res2a_res2a_relu | |||
| Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 | |||
| Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 | |||
| Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 | |||
| ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu | |||
| Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 | |||
| Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 | |||
| ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu | |||
| Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256 | |||
| Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1 | |||
| Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0 | |||
| ReLU res2b_relu 1 1 res2b res2b_res2b_relu | |||
| Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 | |||
| Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64 | |||
| Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1 | |||
| ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu | |||
| Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 | |||
| BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64 | |||
| Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1 | |||
| ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu | |||
| Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256 | |||
| Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1 | |||
| Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0 | |||
| ReLU res2c_relu 1 1 res2c res2c_res2c_relu | |||
| Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 | |||
| Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 | |||
| BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512 | |||
| Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1 | |||
| Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 8=2 | |||
| BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 | |||
| Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 | |||
| ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu | |||
| Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 | |||
| Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 | |||
| ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu | |||
| Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512 | |||
| Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1 | |||
| Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0 | |||
| ReLU res3a_relu 1 1 res3a res3a_res3a_relu | |||
| Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 | |||
| Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 | |||
| Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 | |||
| ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu | |||
| Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 | |||
| Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 | |||
| ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu | |||
| Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512 | |||
| Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1 | |||
| Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0 | |||
| ReLU res3b_relu 1 1 res3b res3b_res3b_relu | |||
| Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 | |||
| Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128 | |||
| Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1 | |||
| ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu | |||
| Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128 | |||
| Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1 | |||
| ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu | |||
| Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512 | |||
| Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1 | |||
| Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0 | |||
| ReLU res3c_relu 1 1 res3c res3c_res3c_relu | |||
| Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 | |||
| Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128 | |||
| Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1 | |||
| ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu | |||
| Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 | |||
| BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128 | |||
| Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1 | |||
| ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu | |||
| Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 | |||
| BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512 | |||
| Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1 | |||
| Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0 | |||
| ReLU res3d_relu 1 1 res3d res3d_res3d_relu | |||
| Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 | |||
| Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 8=2 | |||
| BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024 | |||
| Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1 | |||
| Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 | |||
| BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 | |||
| Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 | |||
| ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu | |||
| Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 | |||
| Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 | |||
| ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu | |||
| Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024 | |||
| Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1 | |||
| Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0 | |||
| ReLU res4a_relu 1 1 res4a res4a_res4a_relu | |||
| Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 | |||
| Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 | |||
| Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 | |||
| ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu | |||
| Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 | |||
| Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 | |||
| ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu | |||
| Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024 | |||
| Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1 | |||
| Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0 | |||
| ReLU res4b_relu 1 1 res4b res4b_res4b_relu | |||
| Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 | |||
| Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256 | |||
| Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1 | |||
| ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu | |||
| Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256 | |||
| Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1 | |||
| ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu | |||
| Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024 | |||
| Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1 | |||
| Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0 | |||
| ReLU res4c_relu 1 1 res4c res4c_res4c_relu | |||
| Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 | |||
| Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256 | |||
| Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1 | |||
| ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu | |||
| Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256 | |||
| Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1 | |||
| ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu | |||
| Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024 | |||
| Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1 | |||
| Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0 | |||
| ReLU res4d_relu 1 1 res4d res4d_res4d_relu | |||
| Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 | |||
| Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256 | |||
| Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1 | |||
| ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu | |||
| Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256 | |||
| Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1 | |||
| ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu | |||
| Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024 | |||
| Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1 | |||
| Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0 | |||
| ReLU res4e_relu 1 1 res4e res4e_res4e_relu | |||
| Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 | |||
| Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256 | |||
| Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1 | |||
| ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu | |||
| Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 | |||
| BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256 | |||
| Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1 | |||
| ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu | |||
| Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 | |||
| BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024 | |||
| Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1 | |||
| Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0 | |||
| ReLU res4f_relu 1 1 res4f res4f_res4f_relu | |||
| Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 | |||
| Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 8=2 | |||
| BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048 | |||
| Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1 | |||
| Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 8=2 | |||
| BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 | |||
| Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 | |||
| ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu | |||
| Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 | |||
| Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 | |||
| ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu | |||
| Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048 | |||
| Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1 | |||
| Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0 | |||
| ReLU res5a_relu 1 1 res5a res5a_res5a_relu | |||
| Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 | |||
| Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 | |||
| Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 | |||
| ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu | |||
| Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 | |||
| Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 | |||
| ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu | |||
| Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048 | |||
| Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1 | |||
| Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0 | |||
| ReLU res5b_relu 1 1 res5b res5b_res5b_relu | |||
| Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 | |||
| Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512 | |||
| Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1 | |||
| ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu | |||
| Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 | |||
| BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512 | |||
| Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1 | |||
| ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu | |||
| Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 | |||
| BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048 | |||
| Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1 | |||
| Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0 | |||
| ReLU res5c_relu 1 1 res5c res5c_res5c_relu | |||
| Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0 | |||
| InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 | |||
| Softmax prob 1 1 fc1000 prob 0=0 | |||
| @@ -0,0 +1,77 @@ | |||
| 7767517 | |||
| 75 83 | |||
| Input data 0 1 data 0=227 1=227 2=3 | |||
| Convolution conv1 1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2 | |||
| ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 | |||
| Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 | |||
| Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 | |||
| Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 | |||
| ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 | |||
| Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0 | |||
| Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 | |||
| ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 | |||
| Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 | |||
| Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 | |||
| ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 | |||
| Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0 | |||
| Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 | |||
| Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 | |||
| Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 | |||
| Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0 | |||
| Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 | |||
| ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 | |||
| Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 | |||
| Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 | |||
| Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0 | |||
| Pooling pool5 1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 | |||
| Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 | |||
| ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 | |||
| Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 | |||
| ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 | |||
| Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0 | |||
| Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 | |||
| ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 | |||
| Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 | |||
| ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 | |||
| Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 | |||
| ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 | |||
| Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0 | |||
| Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 | |||
| ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 | |||
| Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 | |||
| Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 | |||
| ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 | |||
| Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0 | |||
| Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 | |||
| Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 | |||
| Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 | |||
| ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 | |||
| Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0 | |||
| Dropout drop9 1 1 fire9/concat fire9/concat_drop9 | |||
| Convolution conv10 1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 8=2 | |||
| ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 | |||
| Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1 | |||
| Softmax prob 1 1 pool10 prob 0=0 | |||
| @@ -0,0 +1,181 @@ | |||
| 7767517 | |||
| 179 212 | |||
| Input data 0 1 data 0=300 1=300 2=3 | |||
| Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 | |||
| Convolution conv1 1 1 data_splitncnn_6 conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2 | |||
| ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 | |||
| Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 | |||
| Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 | |||
| Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 | |||
| ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 | |||
| Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0 | |||
| Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 | |||
| ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 | |||
| Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 | |||
| ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 | |||
| Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 | |||
| ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 | |||
| Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0 | |||
| Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 | |||
| Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 | |||
| Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 | |||
| Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0 | |||
| Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 | |||
| ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 | |||
| Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 | |||
| ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 | |||
| Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 | |||
| Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0 | |||
| Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 | |||
| Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 | |||
| ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 | |||
| Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 | |||
| ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 | |||
| Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 | |||
| ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 | |||
| Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0 | |||
| Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 | |||
| ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 | |||
| Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 | |||
| ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 | |||
| Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 | |||
| ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 | |||
| Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0 | |||
| Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 | |||
| ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 | |||
| Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 | |||
| Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 | |||
| ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 | |||
| Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0 | |||
| Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 | |||
| ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 | |||
| Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 | |||
| BatchNorm fire9/expand1x1/bn 1 1 fire9/expand1x1 fire9/expand1x1_fire9/expand1x1/bn 0=256 | |||
| Scale fire9/expand1x1/scale 1 1 fire9/expand1x1_fire9/expand1x1/bn fire9/expand1x1_fire9/expand1x1/scale 0=256 1=1 | |||
| ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1_fire9/expand1x1/scale fire9/expand1x1_fire9/relu_expand1x1 | |||
| Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 | |||
| BatchNorm fire9/expand3x3/bn 1 1 fire9/expand3x3 fire9/expand3x3_fire9/expand3x3/bn 0=256 | |||
| Scale fire9/expand3x3/scale 1 1 fire9/expand3x3_fire9/expand3x3/bn fire9/expand3x3_fire9/expand3x3/scale 0=256 1=1 | |||
| ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3_fire9/expand3x3/scale fire9/expand3x3_fire9/relu_expand3x3 | |||
| Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0 | |||
| Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 | |||
| Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=49152 8=2 | |||
| BatchNorm fire10/squeeze1x1/bn 1 1 fire10/squeeze1x1 fire10/squeeze1x1_fire10/squeeze1x1/bn 0=96 | |||
| Scale fire10/squeeze1x1/scale 1 1 fire10/squeeze1x1_fire10/squeeze1x1/bn fire10/squeeze1x1_fire10/squeeze1x1/scale 0=96 1=1 | |||
| ReLU fire10/relu_squeeze1x1 1 1 fire10/squeeze1x1_fire10/squeeze1x1/scale fire10/squeeze1x1_fire10/relu_squeeze1x1 | |||
| Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2 | |||
| BatchNorm fire10/expand1x1/bn 1 1 fire10/expand1x1 fire10/expand1x1_fire10/expand1x1/bn 0=384 | |||
| Scale fire10/expand1x1/scale 1 1 fire10/expand1x1_fire10/expand1x1/bn fire10/expand1x1_fire10/expand1x1/scale 0=384 1=1 | |||
| ReLU fire10/relu_expand1x1 1 1 fire10/expand1x1_fire10/expand1x1/scale fire10/expand1x1_fire10/relu_expand1x1 | |||
| Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2 | |||
| BatchNorm fire10/expand3x3/bn 1 1 fire10/expand3x3 fire10/expand3x3_fire10/expand3x3/bn 0=384 | |||
| Scale fire10/expand3x3/scale 1 1 fire10/expand3x3_fire10/expand3x3/bn fire10/expand3x3_fire10/expand3x3/scale 0=384 1=1 | |||
| ReLU fire10/relu_expand3x3 1 1 fire10/expand3x3_fire10/expand3x3/scale fire10/expand3x3_fire10/relu_expand3x3 | |||
| Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat 0=0 | |||
| Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 | |||
| Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 0=0 1=3 2=2 3=0 4=0 | |||
| Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=73728 8=2 | |||
| BatchNorm fire11/squeeze1x1/bn 1 1 fire11/squeeze1x1 fire11/squeeze1x1_fire11/squeeze1x1/bn 0=96 | |||
| Scale fire11/squeeze1x1/scale 1 1 fire11/squeeze1x1_fire11/squeeze1x1/bn fire11/squeeze1x1_fire11/squeeze1x1/scale 0=96 1=1 | |||
| ReLU fire11/relu_squeeze1x1 1 1 fire11/squeeze1x1_fire11/squeeze1x1/scale fire11/squeeze1x1_fire11/relu_squeeze1x1 | |||
| Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 | |||
| Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2 | |||
| BatchNorm fire11/expand1x1/bn 1 1 fire11/expand1x1 fire11/expand1x1_fire11/expand1x1/bn 0=384 | |||
| Scale fire11/expand1x1/scale 1 1 fire11/expand1x1_fire11/expand1x1/bn fire11/expand1x1_fire11/expand1x1/scale 0=384 1=1 | |||
| ReLU fire11/relu_expand1x1 1 1 fire11/expand1x1_fire11/expand1x1/scale fire11/expand1x1_fire11/relu_expand1x1 | |||
| Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2 | |||
| BatchNorm fire11/expand3x3/bn 1 1 fire11/expand3x3 fire11/expand3x3_fire11/expand3x3/bn 0=384 | |||
| Scale fire11/expand3x3/scale 1 1 fire11/expand3x3_fire11/expand3x3/bn fire11/expand3x3_fire11/expand3x3/scale 0=384 1=1 | |||
| ReLU fire11/relu_expand3x3 1 1 fire11/expand3x3_fire11/expand3x3/scale fire11/expand3x3_fire11/relu_expand3x3 | |||
| Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat 0=0 | |||
| Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 | |||
| Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1 0=128 1=1 2=1 3=1 4=0 5=0 6=98304 8=2 | |||
| BatchNorm conv12_1/bn 1 1 conv12_1 conv12_1_conv12_1/bn 0=128 | |||
| Scale conv12_1/scale 1 1 conv12_1_conv12_1/bn conv12_1_conv12_1/scale 0=128 1=1 | |||
| ReLU conv12_1/relu 1 1 conv12_1_conv12_1/scale conv12_1_conv12_1/relu | |||
| Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2 | |||
| BatchNorm conv12_2/bn 1 1 conv12_2 conv12_2_conv12_2/bn 0=256 | |||
| Scale conv12_2/scale 1 1 conv12_2_conv12_2/bn conv12_2_conv12_2/scale 0=256 1=1 | |||
| ReLU conv12_2/relu 1 1 conv12_2_conv12_2/scale conv12_2_conv12_2/relu | |||
| Split splitncnn_15 1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 | |||
| Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 | |||
| BatchNorm conv13_1/bn 1 1 conv13_1 conv13_1_conv13_1/bn 0=64 | |||
| Scale conv13_1/scale 1 1 conv13_1_conv13_1/bn conv13_1_conv13_1/scale 0=64 1=1 | |||
| ReLU conv13_1/relu 1 1 conv13_1_conv13_1/scale conv13_1_conv13_1/relu | |||
| Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2 | |||
| BatchNorm conv13_2/bn 1 1 conv13_2 conv13_2_conv13_2/bn 0=128 | |||
| Scale conv13_2/scale 1 1 conv13_2_conv13_2/bn conv13_2_conv13_2/scale 0=128 1=1 | |||
| ReLU conv13_2/relu 1 1 conv13_2_conv13_2/scale conv13_2_conv13_2/relu | |||
| Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 | |||
| BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal 0=256 | |||
| Scale fire5/scale 1 1 fire5/normal fire5/normal_fire5/scale 0=256 1=1 | |||
| Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 | |||
| Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3 | |||
| Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat | |||
| Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=193536 8=2 | |||
| Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3 | |||
| Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat | |||
| PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000 | |||
| Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 | |||
| Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3 | |||
| Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat | |||
| Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=580608 8=2 | |||
| Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3 | |||
| Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat | |||
| PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000 | |||
| Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2 | |||
| Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3 | |||
| Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat | |||
| Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2 | |||
| Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3 | |||
| Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat | |||
| PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000 | |||
| Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2 | |||
| Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3 | |||
| Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat | |||
| Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2 | |||
| Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3 | |||
| Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat | |||
| PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000 | |||
| Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=55296 8=2 | |||
| Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3 | |||
| Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat | |||
| Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=290304 8=2 | |||
| Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3 | |||
| Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat | |||
| PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000 | |||
| Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=18432 8=2 | |||
| Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3 | |||
| Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat | |||
| Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=96768 8=2 | |||
| Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3 | |||
| Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat | |||
| PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000 | |||
| Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc 0=0 | |||
| Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf 0=0 | |||
| Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1 | |||
| Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0 | |||
| Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 | |||
| Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten | |||
| DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.050000 | |||
| @@ -0,0 +1,42 @@ | |||
| 7767517 | |||
| 40 40 | |||
| Input data 0 1 data 0=224 1=224 2=3 | |||
| Convolution conv1_1 1 1 data conv1_1 0=64 1=3 2=1 3=1 4=1 5=1 6=1728 8=2 | |||
| ReLU relu1_1 1 1 conv1_1 conv1_1_relu1_1 | |||
| Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2 0=64 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 | |||
| ReLU relu1_2 1 1 conv1_2 conv1_2_relu1_2 | |||
| Pooling pool1 1 1 conv1_2_relu1_2 pool1 0=0 1=2 2=2 3=0 4=0 | |||
| Convolution conv2_1 1 1 pool1 conv2_1 0=128 1=3 2=1 3=1 4=1 5=1 6=73728 8=2 | |||
| ReLU relu2_1 1 1 conv2_1 conv2_1_relu2_1 | |||
| Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2 0=128 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 | |||
| ReLU relu2_2 1 1 conv2_2 conv2_2_relu2_2 | |||
| Pooling pool2 1 1 conv2_2_relu2_2 pool2 0=0 1=2 2=2 3=0 4=0 | |||
| Convolution conv3_1 1 1 pool2 conv3_1 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2 | |||
| ReLU relu3_1 1 1 conv3_1 conv3_1_relu3_1 | |||
| Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2 | |||
| ReLU relu3_2 1 1 conv3_2 conv3_2_relu3_2 | |||
| Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2 | |||
| ReLU relu3_3 1 1 conv3_3 conv3_3_relu3_3 | |||
| Pooling pool3 1 1 conv3_3_relu3_3 pool3 0=0 1=2 2=2 3=0 4=0 | |||
| Convolution conv4_1 1 1 pool3 conv4_1 0=512 1=3 2=1 3=1 4=1 5=1 6=1179648 8=2 | |||
| ReLU relu4_1 1 1 conv4_1 conv4_1_relu4_1 | |||
| Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 | |||
| ReLU relu4_2 1 1 conv4_2 conv4_2_relu4_2 | |||
| Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 | |||
| ReLU relu4_3 1 1 conv4_3 conv4_3_relu4_3 | |||
| Pooling pool4 1 1 conv4_3_relu4_3 pool4 0=0 1=2 2=2 3=0 4=0 | |||
| Convolution conv5_1 1 1 pool4 conv5_1 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 | |||
| ReLU relu5_1 1 1 conv5_1 conv5_1_relu5_1 | |||
| Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 | |||
| ReLU relu5_2 1 1 conv5_2 conv5_2_relu5_2 | |||
| Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 | |||
| ReLU relu5_3 1 1 conv5_3 conv5_3_relu5_3 | |||
| Pooling pool5 1 1 conv5_3_relu5_3 pool5 0=0 1=2 2=2 3=0 4=0 | |||
| InnerProduct fc6 1 1 pool5 fc6 0=4096 1=1 2=102760448 | |||
| ReLU relu6 1 1 fc6 fc6_relu6 | |||
| Dropout drop6 1 1 fc6_relu6 fc6_drop6 | |||
| InnerProduct fc7 1 1 fc6_drop6 fc7 0=4096 1=1 2=16777216 | |||
| ReLU relu7 1 1 fc7 fc7_relu7 | |||
| Dropout drop7 1 1 fc7_relu7 fc7_drop7 | |||
| InnerProduct fc8 1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 | |||
| Softmax prob 1 1 fc8 prob 0=0 | |||
| @@ -183,6 +183,7 @@ ncnn_add_layer(Yolov3DetectionOutput) | |||
| ncnn_add_layer(PSROIPooling) | |||
| ncnn_add_layer(ROIAlign OFF) | |||
| ncnn_add_layer(Packing) | |||
| ncnn_add_layer(Requantize) | |||
| # message("SHADER_SPV_HEX_FILES = ${SHADER_SPV_HEX_FILES}") | |||
| add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES}) | |||
| @@ -55,14 +55,14 @@ double get_current_time() | |||
| /* Writes one timing line for a layer to stderr: layer type, layer name, and the value end - start printed with an ms suffix. */ void benchmark(const Layer* layer, double start, double end) | |||
| { | |||
| /* NOTE(review): this call and the next one differ only in the width of the name column (24 vs 30); presumably they are the old and new sides of the diff hunk rather than two intended prints - confirm. */ fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); | |||
| fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); | |||
| /* Trailing column separator, then the line terminator. */ fprintf(stderr, " |"); | |||
| fprintf(stderr, "\n"); | |||
| } | |||
| void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end) | |||
| { | |||
| fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); | |||
| fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); | |||
| fprintf(stderr, " | feature_map: %4d x %-4d inch: %4d outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c); | |||
| if (layer->type == "Convolution") | |||
| { | |||
| @@ -0,0 +1,35 @@ | |||
| // SenseNets is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| // Int8 convolution entry point for a 5x5 kernel with stride 1 in both directions. | |||
| // All work is done by conv_im2col_sgemm_int8_neon; this wrapper only fixes the geometry. | |||
| static void conv5x5s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
|     int ksize = 5; // used for both kernel width and kernel height | |||
|     int step = 1;  // used for both horizontal and vertical stride | |||
|     conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt); | |||
| } | |||
| // Int8 convolution entry point for a 5x5 kernel with stride 2 in both directions. | |||
| // All work is done by conv_im2col_sgemm_int8_neon; this wrapper only fixes the geometry. | |||
| static void conv5x5s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
|     int ksize = 5; // used for both kernel width and kernel height | |||
|     int step = 2;  // used for both horizontal and vertical stride | |||
|     conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt); | |||
| } | |||
| @@ -0,0 +1,35 @@ | |||
| // SenseNets is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| // Int8 convolution entry point for a 7x7 kernel with stride 1 in both directions. | |||
| // All work is done by conv_im2col_sgemm_int8_neon; this wrapper only fixes the geometry. | |||
| static void conv7x7s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
|     int ksize = 7; // used for both kernel width and kernel height | |||
|     int step = 1;  // used for both horizontal and vertical stride | |||
|     conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt); | |||
| } | |||
| // Int8 convolution entry point for a 7x7 kernel with stride 2 in both directions. | |||
| // All work is done by conv_im2col_sgemm_int8_neon; this wrapper only fixes the geometry. | |||
| static void conv7x7s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
|     int ksize = 7; // used for both kernel width and kernel height | |||
|     int step = 2;  // used for both horizontal and vertical stride | |||
|     conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt); | |||
| } | |||
| @@ -14,6 +14,8 @@ | |||
| #include "convolution_arm.h" | |||
| #include "benchmark.h" | |||
| namespace ncnn { | |||
| #include "convolution_1x1.h" | |||
| @@ -24,8 +26,11 @@ namespace ncnn { | |||
| #include "convolution_7x7.h" | |||
| #if __ARM_NEON | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_1x1_int8.h" | |||
| #include "convolution_3x3_int8.h" | |||
| #include "convolution_5x5_int8.h" | |||
| #include "convolution_7x7_int8.h" | |||
| #endif // __ARM_NEON | |||
| DEFINE_LAYER_CREATOR(Convolution_arm) | |||
| @@ -66,9 +71,12 @@ int Convolution_arm::load_model(const ModelBin& mb) | |||
| if (use_int8_inference) | |||
| { | |||
| #if __ARM_NEON | |||
| #if !__aarch64__ | |||
| if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| if (use_winograd3x3) | |||
| { | |||
| int num_input = weight_data_size / 9 / num_output; | |||
| conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| int num_input = weight_data_size / 9 / num_output; | |||
| conv3x3s1_transform_kernel_int8_neon(weight_data, weight_3x3s1_int8_data, num_input, num_output); | |||
| @@ -78,16 +86,15 @@ int Convolution_arm::load_model(const ModelBin& mb) | |||
| { | |||
| int num_input = weight_data_size / 9 / num_output; | |||
| conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output); | |||
| } | |||
| } | |||
| if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| int num_input = weight_data_size / num_output; | |||
| conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output); | |||
| use_sgemm1x1 = true; | |||
| } | |||
| #endif // !__aarch64__ | |||
| #endif // __ARM_NEON | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -233,7 +240,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| } | |||
| const int kernel_size = kernel_w; | |||
| const int stride = stride_w; | |||
| //const int stride = stride_w; | |||
| int stride = stride_w; | |||
| if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h) | |||
| { | |||
| @@ -293,43 +301,50 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| #if __ARM_NEON | |||
| // kernel_size x stride | |||
| conv_int8_func conv_int8_func_table[5][5] = | |||
| conv_int8_func conv_int8_func_table[7][4] = | |||
| { | |||
| { | |||
| conv1x1s1_int8_neon, | |||
| conv1x1s2_int8_neon, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 1 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 2 | |||
| { | |||
| conv3x3s1_int8_neon, | |||
| conv3x3s2_int8_neon, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 3 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 4 | |||
| { | |||
| conv5x5s1_int8_neon, | |||
| conv5x5s2_int8_neon, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 5 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 5 | |||
| }, // kernel_size = 6 | |||
| { | |||
| conv7x7s1_int8_neon, | |||
| conv7x7s2_int8_neon, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 7 | |||
| }; | |||
| #endif // __ARM_NEON | |||
| @@ -384,9 +399,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| opt_g.blob_allocator = bottom_blob_int8.allocator; | |||
| quantize->forward(bottom_blob, bottom_blob_int8, opt_g); | |||
| } | |||
| } | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| } | |||
| Mat bottom_blob_bordered = bottom_blob_unbordered; | |||
| @@ -423,34 +438,90 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| if (use_int8_inference) | |||
| { | |||
| #if __ARM_NEON | |||
| #if !__aarch64__ | |||
| if (use_sgemm1x1) | |||
| { | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| if (use_int8_requantize == true) | |||
| { | |||
| conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt); | |||
| Mat top_blob_tm; | |||
| top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); | |||
| if (top_blob_tm.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (use_sgemm1x1) | |||
| { | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_1x1s1_sgemm_int8_data, opt); | |||
| } | |||
| else if (use_winograd3x3) | |||
| { | |||
| conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s1_int8_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt); | |||
| } | |||
| else | |||
| { | |||
| conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| } | |||
| // requantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1); | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g); | |||
| } | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt); | |||
| } | |||
| else | |||
| #endif // !__aarch64__ | |||
| #endif // __ARM_NEON | |||
| { | |||
| conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // dequantize, reverse scale inplace | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| if (use_sgemm1x1) | |||
| { | |||
| conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt); | |||
| } | |||
| else if (use_winograd3x3) | |||
| { | |||
| conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt); | |||
| } | |||
| else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) | |||
| { | |||
| conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt); | |||
| } | |||
| else | |||
| { | |||
| conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| dequantize->forward_inplace(top_blob, opt_g); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| dequantize_ops[p]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -40,6 +40,8 @@ public: | |||
| Mat weight_3x3s1_int8_data; | |||
| Mat weight_3x3s2_int8_data; | |||
| Mat weight_1x1s1_sgemm_int8_data; | |||
| Mat weight_3x3_winograd23_data; | |||
| std::vector<Mat> weight_3x3_winograd23_int8_data; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -16,347 +16,6 @@ | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| #if __aarch64__ | |||
| // Widens the eight int8 x int8 products v[i] * k[i] and adds them to a pair of int32x4 accumulators | |||
| // (acc_lo receives lanes 0-3, acc_hi lanes 4-7). A single product always fits in int16, but the sum | |||
| // of the nine taps of a 3x3 window (up to 9 * 127 * 127 = 145161) does not, so sums are kept in 32 bits. | |||
| static inline void convdw3x3_mac_s32(int32x4_t& acc_lo, int32x4_t& acc_hi, int8x8_t v, int8x8_t k) | |||
| { | |||
|     int16x8_t prod = vmull_s8(v, k); | |||
|     acc_lo = vaddw_s16(acc_lo, vget_low_s16(prod)); | |||
|     acc_hi = vaddw_s16(acc_hi, vget_high_s16(prod)); | |||
| } | |||
| // Depthwise 3x3, stride 1, int8 convolution with 32-bit integer output. | |||
| // | |||
| // For every channel p, the signed 8-bit plane bottom_blob.channel(p) is filtered with the nine | |||
| // signed 8-bit weights stored at _kernel + p*9 (row-major), and the raw int sums are written to | |||
| // top_blob.channel(p). Output rows are produced two at a time, eight columns per vector step, | |||
| // with a scalar loop for the remaining columns and a single-row loop for an odd last row. | |||
| // | |||
| // The row-end pointer advance (+2) assumes bottom_blob.w == top_blob.w + 2. | |||
| // NOTE(review): each vector step loads 16 bytes per input row although only 10 are consumed, so | |||
| // memory past the end of a row must be readable - confirm against the caller's padding. | |||
| // | |||
| // The vector path accumulates in int32 lanes: accumulating the nine products in int16 lanes can | |||
| // wrap around and would then disagree with the scalar tail, which sums in int. | |||
| static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
|     int w = bottom_blob.w; | |||
|     int outw = top_blob.w; | |||
|     int outh = top_blob.h; | |||
|     int outch = top_blob.c; | |||
|     #pragma omp parallel for num_threads(opt.num_threads) | |||
|     for (int p = 0; p < outch; p++) | |||
|     { | |||
|         Mat out = top_blob.channel(p); | |||
|         const signed char* kernel = (const signed char *)_kernel + p*9; | |||
|         int* outptr0 = out; | |||
|         int* outptr0n = outptr0 + outw; | |||
|         const signed char* img0 = bottom_blob.channel(p); | |||
|         const signed char* r0 = img0; | |||
|         const signed char* r1 = img0 + w; | |||
|         const signed char* r2 = img0 + w*2; | |||
|         const signed char* r3 = img0 + w*3; | |||
|         int i = 0; | |||
|         // each weight broadcast to all eight lanes | |||
|         int8x8_t _k0 = vdup_n_s8(kernel[0]); | |||
|         int8x8_t _k1 = vdup_n_s8(kernel[1]); | |||
|         int8x8_t _k2 = vdup_n_s8(kernel[2]); | |||
|         int8x8_t _k3 = vdup_n_s8(kernel[3]); | |||
|         int8x8_t _k4 = vdup_n_s8(kernel[4]); | |||
|         int8x8_t _k5 = vdup_n_s8(kernel[5]); | |||
|         int8x8_t _k6 = vdup_n_s8(kernel[6]); | |||
|         int8x8_t _k7 = vdup_n_s8(kernel[7]); | |||
|         int8x8_t _k8 = vdup_n_s8(kernel[8]); | |||
|         // two output rows per iteration, sharing input rows r1 and r2 | |||
|         for (; i+1 < outh; i+=2) | |||
|         { | |||
|             int nn = outw >> 3; | |||
|             int remain = outw & 7; | |||
|             for (; nn > 0; nn--) | |||
|             { | |||
|                 int32x4_t _sum0 = vdupq_n_s32(0); | |||
|                 int32x4_t _sum0n = vdupq_n_s32(0); | |||
|                 int32x4_t _sum1 = vdupq_n_s32(0); | |||
|                 int32x4_t _sum1n = vdupq_n_s32(0); | |||
|                 // input row 0: kernel row 0 of the upper output row | |||
|                 int8x8_t _r0 = vld1_s8(r0); | |||
|                 int8x8_t _r0n = vld1_s8(r0+8); | |||
|                 int8x8_t _r01 = vext_s8(_r0, _r0n, 1); | |||
|                 int8x8_t _r02 = vext_s8(_r0, _r0n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r0, _k0); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r01, _k1); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r02, _k2); | |||
|                 // input row 1: kernel row 1 of the upper row, kernel row 0 of the lower row | |||
|                 int8x8_t _r1 = vld1_s8(r1); | |||
|                 int8x8_t _r1n = vld1_s8(r1+8); | |||
|                 int8x8_t _r11 = vext_s8(_r1, _r1n, 1); | |||
|                 int8x8_t _r12 = vext_s8(_r1, _r1n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r1, _k3); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r11, _k4); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r12, _k5); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r1, _k0); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r11, _k1); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r12, _k2); | |||
|                 // input row 2: kernel row 2 of the upper row, kernel row 1 of the lower row | |||
|                 int8x8_t _r2 = vld1_s8(r2); | |||
|                 int8x8_t _r2n = vld1_s8(r2+8); | |||
|                 int8x8_t _r21 = vext_s8(_r2, _r2n, 1); | |||
|                 int8x8_t _r22 = vext_s8(_r2, _r2n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r2, _k6); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r21, _k7); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r22, _k8); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r2, _k3); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r21, _k4); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r22, _k5); | |||
|                 // input row 3: kernel row 2 of the lower output row | |||
|                 int8x8_t _r3 = vld1_s8(r3); | |||
|                 int8x8_t _r3n = vld1_s8(r3+8); | |||
|                 int8x8_t _r31 = vext_s8(_r3, _r3n, 1); | |||
|                 int8x8_t _r32 = vext_s8(_r3, _r3n, 2); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r3, _k6); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r31, _k7); | |||
|                 convdw3x3_mac_s32(_sum1, _sum1n, _r32, _k8); | |||
|                 vst1q_s32(outptr0, _sum0); | |||
|                 vst1q_s32(outptr0+4, _sum0n); | |||
|                 vst1q_s32(outptr0n, _sum1); | |||
|                 vst1q_s32(outptr0n+4, _sum1n); | |||
|                 r0 += 8; | |||
|                 r1 += 8; | |||
|                 r2 += 8; | |||
|                 r3 += 8; | |||
|                 outptr0 += 8; | |||
|                 outptr0n += 8; | |||
|             } | |||
|             // scalar tail for the last (outw & 7) columns of both rows | |||
|             for (; remain > 0; remain--) | |||
|             { | |||
|                 int sum0 = 0; | |||
|                 int sum0n = 0; | |||
|                 for (int t = 0; t < 3; t++) | |||
|                 { | |||
|                     sum0 += (int)r0[t] * kernel[t] + (int)r1[t] * kernel[3+t] + (int)r2[t] * kernel[6+t]; | |||
|                     sum0n += (int)r1[t] * kernel[t] + (int)r2[t] * kernel[3+t] + (int)r3[t] * kernel[6+t]; | |||
|                 } | |||
|                 *outptr0 = sum0; | |||
|                 *outptr0n = sum0n; | |||
|                 r0++; | |||
|                 r1++; | |||
|                 r2++; | |||
|                 r3++; | |||
|                 outptr0++; | |||
|                 outptr0n++; | |||
|             } | |||
|             // skip the 2 border columns, then one more input row (two output rows were produced) | |||
|             r0 += 2 + w; | |||
|             r1 += 2 + w; | |||
|             r2 += 2 + w; | |||
|             r3 += 2 + w; | |||
|             outptr0 += outw; | |||
|             outptr0n += outw; | |||
|         } | |||
|         // odd last output row, one row per iteration | |||
|         for (; i < outh; i++) | |||
|         { | |||
|             int nn = outw >> 3; | |||
|             int remain = outw & 7; | |||
|             for (; nn > 0; nn--) | |||
|             { | |||
|                 int32x4_t _sum0 = vdupq_n_s32(0); | |||
|                 int32x4_t _sum0n = vdupq_n_s32(0); | |||
|                 int8x8_t _r0 = vld1_s8(r0); | |||
|                 int8x8_t _r0n = vld1_s8(r0+8); | |||
|                 int8x8_t _r01 = vext_s8(_r0, _r0n, 1); | |||
|                 int8x8_t _r02 = vext_s8(_r0, _r0n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r0, _k0); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r01, _k1); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r02, _k2); | |||
|                 int8x8_t _r1 = vld1_s8(r1); | |||
|                 int8x8_t _r1n = vld1_s8(r1+8); | |||
|                 int8x8_t _r11 = vext_s8(_r1, _r1n, 1); | |||
|                 int8x8_t _r12 = vext_s8(_r1, _r1n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r1, _k3); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r11, _k4); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r12, _k5); | |||
|                 int8x8_t _r2 = vld1_s8(r2); | |||
|                 int8x8_t _r2n = vld1_s8(r2+8); | |||
|                 int8x8_t _r21 = vext_s8(_r2, _r2n, 1); | |||
|                 int8x8_t _r22 = vext_s8(_r2, _r2n, 2); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r2, _k6); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r21, _k7); | |||
|                 convdw3x3_mac_s32(_sum0, _sum0n, _r22, _k8); | |||
|                 vst1q_s32(outptr0, _sum0); | |||
|                 vst1q_s32(outptr0+4, _sum0n); | |||
|                 r0 += 8; | |||
|                 r1 += 8; | |||
|                 r2 += 8; | |||
|                 outptr0 += 8; | |||
|             } | |||
|             for (; remain > 0; remain--) | |||
|             { | |||
|                 int sum = 0; | |||
|                 for (int t = 0; t < 3; t++) | |||
|                 { | |||
|                     sum += (int)r0[t] * kernel[t] + (int)r1[t] * kernel[3+t] + (int)r2[t] * kernel[6+t]; | |||
|                 } | |||
|                 *outptr0 = sum; | |||
|                 r0++; | |||
|                 r1++; | |||
|                 r2++; | |||
|                 outptr0++; | |||
|             } | |||
|             // skip the 2 border columns to reach the start of the next input row | |||
|             r0 += 2; | |||
|             r1 += 2; | |||
|             r2 += 2; | |||
|         } | |||
|     } | |||
| } | |||
| static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2*outw + w; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const signed char* kernel = (const signed char*)_kernel + p*9; | |||
| int* outptr = out; | |||
| const signed char* img = bottom_blob.channel(p); | |||
| const signed char* r0 = img; | |||
| const signed char* r1 = img + w; | |||
| const signed char* r2 = img + w*2; | |||
| int i = 0; | |||
| int8x8_t _k0 = vdup_n_s8(kernel[0]); | |||
| int8x8_t _k1 = vdup_n_s8(kernel[1]); | |||
| int8x8_t _k2 = vdup_n_s8(kernel[2]); | |||
| int8x8_t _k3 = vdup_n_s8(kernel[3]); | |||
| int8x8_t _k4 = vdup_n_s8(kernel[4]); | |||
| int8x8_t _k5 = vdup_n_s8(kernel[5]); | |||
| int8x8_t _k6 = vdup_n_s8(kernel[6]); | |||
| int8x8_t _k7 = vdup_n_s8(kernel[7]); | |||
| int8x8_t _k8 = vdup_n_s8(kernel[8]); | |||
| for (; i < outh; i++) | |||
| { | |||
| int nn = outw >> 3; | |||
| int remain = outw & 7; | |||
| for (; nn > 0; nn--) | |||
| { | |||
| int8x8x2_t _r0 = vld2_s8(r0); | |||
| int8x8x2_t _r0n = vld2_s8(r0+16); | |||
| int8x8_t _r00 = _r0.val[0]; | |||
| int8x8_t _r01 = _r0.val[1]; | |||
| int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1); | |||
| int16x8_t _sum = vmull_s8(_r00, _k0); | |||
| _sum = vmlal_s8(_sum, _r01, _k1); | |||
| _sum = vmlal_s8(_sum, _r02, _k2); | |||
| int8x8x2_t _r1 = vld2_s8(r1); | |||
| int8x8x2_t _r1n = vld2_s8(r1+16); | |||
| int8x8_t _r10 = _r1.val[0]; | |||
| int8x8_t _r11 = _r1.val[1]; | |||
| int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1); | |||
| _sum = vmlal_s8(_sum, _r10, _k3); | |||
| _sum = vmlal_s8(_sum, _r11, _k4); | |||
| _sum = vmlal_s8(_sum, _r12, _k5); | |||
| int8x8x2_t _r2 = vld2_s8(r2); | |||
| int8x8x2_t _r2n = vld2_s8(r2+16); | |||
| int8x8_t _r20 = _r2.val[0]; | |||
| int8x8_t _r21 = _r2.val[1]; | |||
| int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1); | |||
| _sum = vmlal_s8(_sum, _r20, _k6); | |||
| _sum = vmlal_s8(_sum, _r21, _k7); | |||
| _sum = vmlal_s8(_sum, _r22, _k8); | |||
| int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum)); | |||
| int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum)); | |||
| vst1q_s32(outptr, sum0_s32); | |||
| vst1q_s32(outptr+4, sum0n_s32); | |||
| r0 += 16; | |||
| r1 += 16; | |||
| r2 += 16; | |||
| outptr += 8; | |||
| } | |||
| for (; remain>0; remain--) | |||
| { | |||
| int sum = 0; | |||
| sum += (int)r0[0] * kernel[0]; | |||
| sum += (int)r0[1] * kernel[1]; | |||
| sum += (int)r0[2] * kernel[2]; | |||
| sum += (int)r1[0] * kernel[3]; | |||
| sum += (int)r1[1] * kernel[4]; | |||
| sum += (int)r1[2] * kernel[5]; | |||
| sum += (int)r2[0] * kernel[6]; | |||
| sum += (int)r2[1] * kernel[7]; | |||
| sum += (int)r2[2] * kernel[8]; | |||
| *outptr = sum; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| } | |||
| } | |||
| } | |||
| #else // __aarch64__ | |||
| static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -824,5 +483,3 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| @@ -13,7 +13,7 @@ | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "convolutiondepthwise_arm.h" | |||
| #include "benchmark.h" | |||
| #ifdef _OPENMP | |||
| #include <omp.h> | |||
| #endif | |||
| @@ -147,6 +147,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| Mat bottom_blob_unbordered = bottom_blob; | |||
| if (use_int8_inference && elemsize != 1) | |||
| { | |||
| // start = ncnn::get_current_time(); | |||
| Mat bottom_blob_int8; | |||
| bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); | |||
| if (bottom_blob_int8.empty()) | |||
| @@ -167,8 +169,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g); | |||
| } | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| } | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| } | |||
| Mat bottom_blob_bordered = bottom_blob_unbordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| @@ -211,25 +213,67 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) | |||
| { | |||
| if (stride_w == 1 && stride_h == 1) | |||
| if (use_int8_requantize) | |||
| { | |||
| convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| Mat top_blob_tm; | |||
| top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); | |||
| if (top_blob_tm.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| } | |||
| // requantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1); | |||
| Mat top_blob_g = top_blob.channel_range(g, 1); | |||
| requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g); | |||
| } | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| else | |||
| { | |||
| convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel_range(g, 1); | |||
| dequantize_ops[g]->forward_inplace(top_blob_g, opt_g); | |||
| // start = ncnn::get_current_time(); | |||
| top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel_range(g, 1); | |||
| dequantize_ops[g]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -31,19 +31,6 @@ static inline signed char float2int8(float v) | |||
| int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| #if !__aarch64__ && __ARM_NEON | |||
| int FPSCR_value = 0; | |||
| asm volatile( | |||
| "vmrs %0, FPSCR \n" | |||
| "bic r10, %0, #0x00c00000 \n" | |||
| "vmsr FPSCR, r10 \n" | |||
| : "=r"(FPSCR_value) | |||
| : | |||
| : "memory", "r10" | |||
| ); | |||
| #endif | |||
| int dims = bottom_blob.dims; | |||
| if (dims == 1) | |||
| @@ -200,15 +187,6 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| } | |||
| } | |||
| #if !__aarch64__ && __ARM_NEON | |||
| asm volatile( | |||
| "vmsr FPSCR, %0 \n" | |||
| : | |||
| : "r"(FPSCR_value) | |||
| : "memory" | |||
| ); | |||
| #endif | |||
| return 0; | |||
| } | |||
| @@ -22,8 +22,92 @@ namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(ReLU_arm) | |||
| int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| if (slope == 0.f) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| signed char* ptr = bottom_top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 4; | |||
| int remain = size - (nn << 4); | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| #if __ARM_NEON | |||
| #if __aarch64__ | |||
| int8x16_t _zero = vdupq_n_s8(0); | |||
| for (; nn>0; nn--) | |||
| { | |||
| int8x16_t _p = vld1q_s8(ptr); | |||
| _p = vmaxq_s8(_p, _zero); | |||
| vst1q_s8(ptr, _p); | |||
| ptr += 16; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "veor q1, q0, q0 \n" | |||
| "0: \n" | |||
| "pld [%1, #128] \n" | |||
| "vld1.s8 {d0-d1}, [%1 :128] \n" | |||
| "vmax.s8 q0, q0, q1 \n" | |||
| "subs %0, #1 \n" | |||
| "vst1.s8 {d0-d1}, [%1 :128]! \n" | |||
| "bne 0b \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(ptr) // %1 | |||
| : "0"(nn), | |||
| "1"(ptr) | |||
| : "cc", "memory", "q0", "q1" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #endif // __ARM_NEON | |||
| for (; remain>0; remain--) | |||
| { | |||
| if (*ptr < 0) | |||
| *ptr = 0; | |||
| ptr++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // TODO | |||
| // #pragma omp parallel for num_threads(opt.num_threads) | |||
| // for (int q=0; q<channels; q++) | |||
| // { | |||
| // float* ptr = bottom_top_blob.channel(q); | |||
| // for (int i=0; i<size; i++) | |||
| // { | |||
| // if (ptr[i] < 0) | |||
| // ptr[i] *= slope; | |||
| // } | |||
| // } | |||
| } | |||
| return 0; | |||
| } | |||
| int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| if (bottom_top_blob.elemsize == 1u) | |||
| return ReLU_arm::forward_inplace_int8(bottom_top_blob, opt); | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| @@ -23,6 +23,7 @@ class ReLU_arm : public ReLU | |||
| { | |||
| public: | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,325 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "requantize_arm.h" | |||
| #include <math.h> | |||
| #if __ARM_NEON | |||
| #include <arm_neon.h> | |||
| #endif // __ARM_NEON | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Requantize_arm) | |||
// Rounds v to the nearest integer (halfway cases away from zero, as round()
// does) and saturates the result to the signed 8-bit range [-128, 127].
//
// The clamp is performed on the floating-point value, before any conversion
// to an integer type: converting an out-of-range or NaN floating-point value
// to int is undefined behaviour. NaN is mapped to 0.
static inline signed char float2int8(float v)
{
    const double rounded = round(v);
    if (rounded >= 127.0) return 127;
    if (rounded <= -128.0) return -128;
    if (rounded != rounded) return 0; // NaN compares unequal to itself
    return (signed char)(int)rounded;
}
| int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_blob.dims; | |||
| if (dims == 1) | |||
| { | |||
| int w = bottom_blob.w; | |||
| const int* intptr = bottom_blob; | |||
| signed char * ptr = top_blob; | |||
| if (bias_term) | |||
| { | |||
| if (bias_data_size > 1) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| float bias = bias_data[0]; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(intptr[i] * scale_in * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| if (bias_term) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| const int* intptr = bottom_blob.row<const int>(i); | |||
| signed char* ptr = top_blob.row<signed char>(i); | |||
| float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0]; | |||
| for (int j=0; j<w; j++) | |||
| { | |||
| ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out); | |||
| if (fusion_relu && ptr[j] < 0) | |||
| ptr[j] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| const int* intptr = bottom_blob.row<const int>(i); | |||
| signed char* ptr = top_blob.row<signed char>(i); | |||
| for (int j=0; j<w; j++) | |||
| { | |||
| ptr[j] = float2int8(intptr[j] * scale_in * scale_out); | |||
| if (fusion_relu && ptr[j] < 0) | |||
| ptr[j] = 0; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (dims == 3) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| if (bias_term) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const int* intptr = bottom_blob.channel(q); | |||
| signed char* ptr = top_blob.channel(q); | |||
| float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0]; | |||
| #if __ARM_NEON | |||
| int nn = size >> 3; | |||
| int remain = size & 7; | |||
| #if __aarch64__ | |||
| for (; nn>0; nn--) | |||
| { | |||
| ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out); | |||
| ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out); | |||
| ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out); | |||
| ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out); | |||
| ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out); | |||
| ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out); | |||
| ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out); | |||
| ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out); | |||
| ptr += 8; | |||
| intptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%1, #256] \n" | |||
| "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data | |||
| "vdup.f32 q10, %6 \n" //q10 scale_in | |||
| "vdup.f32 q11, %7 \n" //q11 scale_out | |||
| "vdup.f32 q12, %8 \n" //q12 bias | |||
| "0: \n" | |||
| // top_s32 -> top_f32 | |||
| "vcvt.f32.s32 q0, q0 \n" | |||
| "vcvt.f32.s32 q1, q1 \n" | |||
| // top_f32 = top_f32 * scale_int | |||
| "vmul.f32 q0, q0, q10 \n" | |||
| "vmul.f32 q1, q1, q10 \n" | |||
| // top_f32 = top_f32 + bias | |||
| "vadd.f32 q0, q0, q12 \n" | |||
| "vadd.f32 q1, q1, q12 \n" | |||
| // top_f32 = top_f32 * scale_out | |||
| "vmul.f32 q0, q0, q11 \n" | |||
| "vmul.f32 q1, q1, q11 \n" | |||
| // top_f32 -> top_s32 | |||
| "vcvtr.s32.f32 s0, s0 \n" | |||
| "vcvtr.s32.f32 s1, s1 \n" | |||
| "vcvtr.s32.f32 s2, s2 \n" | |||
| "vcvtr.s32.f32 s3, s3 \n" | |||
| "vcvtr.s32.f32 s4, s4 \n" | |||
| "vcvtr.s32.f32 s5, s5 \n" | |||
| "vcvtr.s32.f32 s6, s6 \n" | |||
| "vcvtr.s32.f32 s7, s7 \n" | |||
| // top_s32 -> top_s16 | |||
| "vqmovn.s32 d4, q0 \n" | |||
| "vqmovn.s32 d5, q1 \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data | |||
| // top_s16 -> top_s8 | |||
| "vqmovn.s16 d4, q2 \n" | |||
| // save top_s8 | |||
| "vst1.8 {d4}, [%2:64]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %1, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(intptr), // %1 | |||
| "=r"(ptr) // %2 | |||
| : "0"(nn), | |||
| "1"(intptr), | |||
| "2"(ptr), | |||
| "r"(scale_in), // %6 | |||
| "r"(scale_out), // %7 | |||
| "r"(bias) // %8 | |||
| : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| for (; remain > 0; remain--) | |||
| { | |||
| *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out); | |||
| intptr++; | |||
| ptr ++; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const int* intptr = bottom_blob.channel(q); | |||
| signed char* ptr = top_blob.channel(q); | |||
| #if __ARM_NEON | |||
| int nn = size >> 3; | |||
| int remain = size & 7; | |||
| #if __aarch64__ | |||
| //TODO | |||
| for (; nn>0; nn--) | |||
| { | |||
| ptr[0] = float2int8(intptr[0] * scale_in * scale_out); | |||
| ptr[1] = float2int8(intptr[1] * scale_in * scale_out); | |||
| ptr[2] = float2int8(intptr[2] * scale_in * scale_out); | |||
| ptr[3] = float2int8(intptr[3] * scale_in * scale_out); | |||
| ptr[4] = float2int8(intptr[4] * scale_in * scale_out); | |||
| ptr[5] = float2int8(intptr[5] * scale_in * scale_out); | |||
| ptr[6] = float2int8(intptr[6] * scale_in * scale_out); | |||
| ptr[7] = float2int8(intptr[7] * scale_in * scale_out); | |||
| ptr += 8; | |||
| intptr += 8; | |||
| } | |||
| #else | |||
| if (nn > 0) | |||
| { | |||
| asm volatile( | |||
| "pld [%1, #256] \n" | |||
| "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data | |||
| "vdup.f32 q10, %6 \n" //q10 scale_in | |||
| "vdup.f32 q11, %7 \n" //q11 scale_out | |||
| "0: \n" | |||
| // top_s32 -> top_f32 | |||
| "vcvt.f32.s32 q0, q0 \n" | |||
| "vcvt.f32.s32 q1, q1 \n" | |||
| // top_f32 = top_f32 * scale_int | |||
| "vmul.f32 q0, q0, q10 \n" | |||
| "vmul.f32 q1, q1, q10 \n" | |||
| // top_f32 = top_f32 * scale_out | |||
| "vmul.f32 q0, q0, q11 \n" | |||
| "vmul.f32 q1, q1, q11 \n" | |||
| // top_f32 -> top_s32 | |||
| "vcvtr.s32.f32 s0, s0 \n" | |||
| "vcvtr.s32.f32 s1, s1 \n" | |||
| "vcvtr.s32.f32 s2, s2 \n" | |||
| "vcvtr.s32.f32 s3, s3 \n" | |||
| "vcvtr.s32.f32 s4, s4 \n" | |||
| "vcvtr.s32.f32 s5, s5 \n" | |||
| "vcvtr.s32.f32 s6, s6 \n" | |||
| "vcvtr.s32.f32 s7, s7 \n" | |||
| // top_s32 -> top_s16 | |||
| "vqmovn.s32 d4, q0 \n" | |||
| "vqmovn.s32 d5, q1 \n" | |||
| "pld [%1, #256] \n" | |||
| "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data | |||
| // top_s16 -> top_s8 | |||
| "vqmovn.s16 d4, q2 \n" | |||
| // save top_s8 | |||
| "vst1.8 {d4}, [%2:64]! \n" | |||
| "subs %0, #1 \n" | |||
| "bne 0b \n" | |||
| "sub %1, #32 \n" | |||
| : "=r"(nn), // %0 | |||
| "=r"(intptr), // %1 | |||
| "=r"(ptr) // %2 | |||
| : "0"(nn), | |||
| "1"(intptr), | |||
| "2"(ptr), | |||
| "r"(scale_in), // %6 | |||
| "r"(scale_out) // %7 | |||
| : "cc", "memory", "q0", "q1", "q2", "q10", "q11" | |||
| ); | |||
| } | |||
| #endif // __aarch64__ | |||
| #else | |||
| int remain = size; | |||
| #endif // __ARM_NEON | |||
| for (; remain > 0; remain--) | |||
| { | |||
| *ptr = float2int8(*intptr * scale_in * scale_out); | |||
| intptr++; | |||
| ptr ++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,30 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_REQUANTIZE_ARM_H | |||
| #define LAYER_REQUANTIZE_ARM_H | |||
| #include "requantize.h" | |||
| namespace ncnn { | |||
// ARM variant of the Requantize layer: derives from Requantize and overrides
// forward() only.
| class Requantize_arm : public Requantize | |||
| { | |||
| public: | |||
// Reads bottom_blob and writes the result to top_blob; returns an int status.
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_REQUANTIZE_ARM_H | |||
| @@ -25,6 +25,7 @@ Convolution::Convolution() | |||
| one_blob_only = true; | |||
| support_inplace = false; | |||
| support_vulkan = true; | |||
| use_int8_requantize = false; | |||
| #if NCNN_VULKAN | |||
| padding = 0; | |||
| @@ -42,7 +43,6 @@ Convolution::Convolution() | |||
| #endif // NCNN_VULKAN | |||
| quantize = 0; | |||
| dequantize = 0; | |||
| } | |||
| Convolution::~Convolution() | |||
| @@ -52,7 +52,14 @@ Convolution::~Convolution() | |||
| #endif // NCNN_VULKAN | |||
| delete quantize; | |||
| delete dequantize; | |||
| for (int i=0; i<(int)dequantize_ops.size(); i++) | |||
| delete dequantize_ops[i]; | |||
| dequantize_ops.clear(); | |||
| for (int i=0; i<(int)requantize_ops.size(); i++) | |||
| delete requantize_ops[i]; | |||
| requantize_ops.clear(); | |||
| } | |||
| int Convolution::load_param(const ParamDict& pd) | |||
| @@ -113,10 +120,18 @@ int Convolution::load_model(const ModelBin& mb) | |||
| if (int8_scale_term) | |||
| { | |||
| weight_data_int8_scale = mb.load(1, 1)[0]; | |||
| weight_data_int8_scales = mb.load(num_output, 1); | |||
| bottom_blob_int8_scale = mb.load(1, 1)[0]; | |||
| } | |||
| for (int i=0; i<(int)dequantize_ops.size(); i++) | |||
| delete dequantize_ops[i]; | |||
| dequantize_ops.clear(); | |||
| for (int i=0; i<(int)requantize_ops.size(); i++) | |||
| delete requantize_ops[i]; | |||
| requantize_ops.clear(); | |||
| bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u); | |||
| bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); | |||
| @@ -126,27 +141,39 @@ int Convolution::load_model(const ModelBin& mb) | |||
| return -1; | |||
| } | |||
| // runtime quantize the weight data | |||
| if (weight_data_is_float32 && use_int8_inference) | |||
| { | |||
| // quantize weight to int8 | |||
| Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| Mat int8_weight_data(weight_data_size, (size_t)1u); | |||
| if (int8_weight_data.empty()) | |||
| return -100; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, weight_data_int8_scale);// scale | |||
| const int weight_data_size_output = weight_data_size / num_output; | |||
| for (int n=0; n<num_output; n++) | |||
| { | |||
| Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| op->load_param(pd); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, weight_data_int8_scales[n]);// scale | |||
| Mat int8_weight_data; | |||
| op->forward(weight_data, int8_weight_data); | |||
| op->load_param(pd); | |||
| delete op; | |||
| ncnn::Option opt = ncnn::get_default_option(); | |||
| opt.blob_allocator = int8_weight_data.allocator; | |||
| if (int8_weight_data.empty()) | |||
| return -100; | |||
| const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output); | |||
| Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output); | |||
| op->forward(weight_data_n, int8_weight_data_n, opt); | |||
| delete op; | |||
| } | |||
| weight_data = int8_weight_data; | |||
| } | |||
| // initial the quantize,dequantize op layer | |||
| if (use_int8_inference) | |||
| { | |||
| quantize = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| @@ -157,22 +184,74 @@ int Convolution::load_model(const ModelBin& mb) | |||
| quantize->load_param(pd); | |||
| } | |||
| dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| dequantize_ops.resize(num_output); | |||
| for (int n=0; n<num_output; n++) | |||
| { | |||
| float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale); | |||
| dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| float top_rescale = 1.f; | |||
| if (weight_data_int8_scales[n] == 0) | |||
| top_rescale = 0; | |||
| else | |||
| top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, num_output);// bias_data_size | |||
| pd.set(1, bias_term); // bias_term | |||
| pd.set(2, 1); // bias_data_size | |||
| dequantize->load_param(pd); | |||
| dequantize_ops[n]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = bias_data; | |||
| weights[0] = bias_data.range(n, 1); | |||
| dequantize->load_model(ModelBinFromMatArray(weights)); | |||
| dequantize_ops[n]->load_model(ModelBinFromMatArray(weights)); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution::create_requantize_op(void) | |||
| { | |||
| if (!use_int8_requantize) | |||
| { | |||
| fprintf(stderr, "requantized op set but use_int8_requantize disabled\n"); | |||
| return -1; | |||
| } | |||
| requantize_ops.resize(num_output); | |||
| for (int n=0; n<num_output; n++) | |||
| { | |||
| requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize); | |||
| float scale_in = 1.f; | |||
| float scale_out = 1.f; | |||
| if (weight_data_int8_scales[n] == 0) | |||
| { | |||
| scale_in = 0; | |||
| } | |||
| else | |||
| { | |||
| scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]); | |||
| } | |||
| scale_out = top_blob_int8_scale; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, scale_in); // scale in | |||
| pd.set(1, scale_out); // scale_out | |||
| pd.set(2, bias_term); // bias_term | |||
| pd.set(3, 1); // bias_data_size | |||
| requantize_ops[n]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = bias_data.range(n, 1); | |||
| requantize_ops[n]->load_model(ModelBinFromMatArray(weights)); | |||
| } | |||
| return 0; | |||
| @@ -210,7 +289,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| if (int8_scale_term) | |||
| { | |||
| weights[2] = Mat(1, (size_t)4u, (void*)&weight_data_int8_scale); | |||
| weights[2] = weight_data_int8_scales; | |||
| weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale); | |||
| } | |||
| @@ -309,50 +388,118 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op | |||
| if (use_int8_inference) | |||
| { | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| if (use_int8_requantize == true) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| Mat top_blob_tm; | |||
| top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); | |||
| if (top_blob_tm.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| for (int i = 0; i < outh; i++) | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| int sum = 0; | |||
| int* outptr = top_blob_tm.channel(p); | |||
| const signed char* kptr = (const signed char*)weight_data + maxk * channels * p; | |||
| // channels | |||
| for (int q=0; q<channels; q++) | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w; | |||
| int sum = 0; | |||
| const signed char* kptr = (const signed char*)weight_data + maxk * channels * p; | |||
| for (int k = 0; k < maxk; k++) | |||
| // channels | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| int val = sptr[ space_ofs[k] ]; | |||
| int w = kptr[k]; | |||
| sum += val * w; | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| int val = sptr[ space_ofs[k] ]; | |||
| int w = kptr[k]; | |||
| sum += val * w; | |||
| } | |||
| kptr += maxk; | |||
| } | |||
| kptr += maxk; | |||
| outptr[j] = sum; | |||
| } | |||
| outptr[j] = sum; | |||
| outptr += outw; | |||
| } | |||
| outptr += outw; | |||
| // requantize, reverse scale inplace | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1); | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g); | |||
| } | |||
| } | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| else | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| // num_output | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| int* outptr = top_blob.channel(p); | |||
| dequantize->forward_inplace(top_blob, opt_g); | |||
| } | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| for (int j = 0; j < outw; j++) | |||
| { | |||
| int sum = 0; | |||
| const signed char* kptr = (const signed char*)weight_data + maxk * channels * p; | |||
| // channels | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const Mat m = bottom_blob_bordered.channel(q); | |||
| const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w; | |||
| for (int k = 0; k < maxk; k++) | |||
| { | |||
| int val = sptr[ space_ofs[k] ]; | |||
| int w = kptr[k]; | |||
| sum += val * w; | |||
| } | |||
| kptr += maxk; | |||
| } | |||
| outptr[j] = sum; | |||
| } | |||
| outptr += outw; | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| dequantize_ops[p]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| @@ -29,6 +29,8 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int create_requantize_op(void); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #if NCNN_VULKAN | |||
| @@ -91,13 +93,16 @@ public: | |||
| Pipeline* pipeline_innerproduct_pack4to1; | |||
| #endif // NCNN_VULKAN | |||
| float weight_data_int8_scale; | |||
| Mat weight_data_int8_scales; | |||
| float bottom_blob_int8_scale; | |||
| float top_blob_int8_scale; | |||
| bool use_int8_inference; | |||
| bool use_int8_requantize; | |||
| ncnn::Layer* quantize; | |||
| ncnn::Layer* dequantize; | |||
| std::vector<ncnn::Layer*> dequantize_ops; | |||
| std::vector<ncnn::Layer*> requantize_ops; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -25,6 +25,7 @@ ConvolutionDepthWise::ConvolutionDepthWise() | |||
| one_blob_only = true; | |||
| support_inplace = false; | |||
| support_vulkan = true; | |||
| use_int8_requantize = false; | |||
| #if NCNN_VULKAN | |||
| padding = 0; | |||
| @@ -58,6 +59,11 @@ ConvolutionDepthWise::~ConvolutionDepthWise() | |||
| delete dequantize_ops[i]; | |||
| dequantize_ops.clear(); | |||
| for (int i=0; i<(int)requantize_ops.size(); i++) | |||
| delete requantize_ops[i]; | |||
| requantize_ops.clear(); | |||
| } | |||
| int ConvolutionDepthWise::load_param(const ParamDict& pd) | |||
| @@ -150,7 +156,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| if (int8_scale_term == 1) | |||
| { | |||
| weight_data_int8_scales = mb.load(group, 1); | |||
| bottom_blob_int8_scales = mb.load(group, 1); | |||
| bottom_blob_int8_scales = mb.load(1, 1); | |||
| float bottom_blob_int8_scale = bottom_blob_int8_scales[0]; | |||
| bottom_blob_int8_scales = Mat(group); | |||
| bottom_blob_int8_scales.fill(bottom_blob_int8_scale); | |||
| } | |||
| else if (int8_scale_term == 2) | |||
| { | |||
| @@ -177,6 +187,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| dequantize_ops.clear(); | |||
| for (int i=0; i<(int)requantize_ops.size(); i++) | |||
| delete requantize_ops[i]; | |||
| requantize_ops.clear(); | |||
| bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u); | |||
| bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); | |||
| @@ -236,7 +251,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| { | |||
| dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| float top_rescale = 1.f; | |||
| if (weight_data_int8_scales[g] == 0) | |||
| top_rescale = 0; | |||
| else | |||
| top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| @@ -255,6 +274,50 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise::create_requantize_op(void) | |||
| { | |||
| if (!use_int8_requantize) | |||
| { | |||
| fprintf(stderr, "requantized op set but use_int8_requantize disabled\n"); | |||
| return -1; | |||
| } | |||
| requantize_ops.resize(group); | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| requantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Requantize); | |||
| float scale_in = 1.f; | |||
| float scale_out = 1.f; | |||
| if (weight_data_int8_scales[g] == 0) | |||
| { | |||
| scale_in = 0; | |||
| } | |||
| else | |||
| { | |||
| scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); | |||
| } | |||
| scale_out = top_blob_int8_scale; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, scale_in); // scale in | |||
| pd.set(1, scale_out); // scale_out | |||
| pd.set(2, bias_term); // bias_term | |||
| pd.set(3, 1); // bias_data_size | |||
| requantize_ops[g]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = bias_data.range(g, 1); | |||
| requantize_ops[g]->load_model(ModelBinFromMatArray(weights)); | |||
| } | |||
| return 0; | |||
| } | |||
| int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| // convolv with NxN kernel | |||
| @@ -29,6 +29,8 @@ public: | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int create_requantize_op(void); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| #if NCNN_VULKAN | |||
| @@ -92,11 +94,14 @@ public: | |||
| Mat weight_data_int8_scales; | |||
| Mat bottom_blob_int8_scales; | |||
| float top_blob_int8_scale; | |||
| bool use_int8_inference; | |||
| bool use_int8_requantize; | |||
| std::vector<ncnn::Layer*> quantize_ops; | |||
| std::vector<ncnn::Layer*> dequantize_ops; | |||
| std::vector<ncnn::Layer*> requantize_ops; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -36,7 +36,6 @@ InnerProduct::InnerProduct() | |||
| #endif // NCNN_VULKAN | |||
| quantize = 0; | |||
| dequantize = 0; | |||
| } | |||
| InnerProduct::~InnerProduct() | |||
| @@ -46,7 +45,11 @@ InnerProduct::~InnerProduct() | |||
| #endif // NCNN_VULKAN | |||
| delete quantize; | |||
| delete dequantize; | |||
| for (int i=0; i<(int)dequantize_ops.size(); i++) | |||
| delete dequantize_ops[i]; | |||
| dequantize_ops.clear(); | |||
| } | |||
| int InnerProduct::load_param(const ParamDict& pd) | |||
| @@ -92,7 +95,7 @@ int InnerProduct::load_model(const ModelBin& mb) | |||
| if (int8_scale_term) | |||
| { | |||
| weight_data_int8_scale = mb.load(1, 1)[0]; | |||
| weight_data_int8_scales = mb.load(num_output, 1); | |||
| bottom_blob_int8_scale = mb.load(1, 1)[0]; | |||
| } | |||
| @@ -105,25 +108,71 @@ int InnerProduct::load_model(const ModelBin& mb) | |||
| return -1; | |||
| } | |||
| // initial the quantize,dequantize op layer | |||
| if (use_int8_inference) | |||
| { | |||
| quantize = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scale);// scale | |||
| quantize->load_param(pd); | |||
| } | |||
| dequantize_ops.resize(num_output); | |||
| for (int n=0; n<num_output; n++) | |||
| { | |||
| dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize); | |||
| float top_rescale = 1.f; | |||
| if (weight_data_int8_scales[n] == 0) | |||
| top_rescale = 0; | |||
| else | |||
| top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term); // bias_term | |||
| pd.set(2, 1); // bias_data_size | |||
| dequantize_ops[n]->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = bias_data.range(n, 1); | |||
| dequantize_ops[n]->load_model(ModelBinFromMatArray(weights)); | |||
| } | |||
| } | |||
| // runtime quantize the weight data | |||
| if (weight_data_is_float32 && use_int8_inference) | |||
| { | |||
| // quantize weight to int8 | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, weight_data_int8_scale);// scale | |||
| Mat int8_weight_data(weight_data_size, (size_t)1u); | |||
| if (int8_weight_data.empty()) | |||
| return -100; | |||
| quantize->load_param(pd); | |||
| const int weight_data_size_output = weight_data_size / num_output; | |||
| Mat int8_weight_data; | |||
| quantize->forward(weight_data, int8_weight_data); | |||
| for (int n=0; n<num_output; n++) | |||
| { | |||
| Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize); | |||
| if (int8_weight_data.empty()) | |||
| return -100; | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, weight_data_int8_scales[n]);// scale | |||
| op->load_param(pd); | |||
| ncnn::Option opt = ncnn::get_default_option(); | |||
| opt.blob_allocator = int8_weight_data.allocator; | |||
| const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output); | |||
| Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output); | |||
| op->forward(weight_data_n, int8_weight_data_n, opt); | |||
| delete op; | |||
| } | |||
| weight_data = int8_weight_data; | |||
| } | |||
| @@ -152,12 +201,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| // quantize, scale and round to nearest | |||
| { | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, bottom_blob_int8_scale);// scale | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = bottom_blob_int8.allocator; | |||
| quantize->load_param(pd); | |||
| quantize->forward(bottom_blob, bottom_blob_int8, opt); | |||
| quantize->forward(bottom_blob, bottom_blob_int8, opt_g); | |||
| } | |||
| // num_output | |||
| @@ -179,26 +226,24 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o | |||
| } | |||
| } | |||
| out[p] = sum; | |||
| out[p] = sum; | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale); | |||
| ncnn::ParamDict pd; | |||
| pd.set(0, top_rescale);// scale | |||
| pd.set(1, bias_term);// bias_term | |||
| pd.set(2, num_output);// bias_data_size | |||
| dequantize->load_param(pd); | |||
| ncnn::Mat weights[1]; | |||
| weights[0] = bias_data; | |||
| dequantize->load_model(ModelBinFromMatArray(weights)); | |||
| dequantize->forward_inplace(top_blob, opt); | |||
| int* out_s32 = top_blob; | |||
| float* out_f32 = top_blob; | |||
| float top_rescale = 1.f; | |||
| if (weight_data_int8_scales[p] == 0) | |||
| top_rescale = 0; | |||
| else | |||
| top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]); | |||
| if (bias_term) | |||
| out_f32[p] = out_s32[p] * top_rescale + bias_data[p]; | |||
| else | |||
| out_f32[p] = out_s32[p] * top_rescale; | |||
| } | |||
| return 0; | |||
| @@ -76,13 +76,13 @@ public: | |||
| Pipeline* pipeline_innerproduct_pack4to1; | |||
| #endif // NCNN_VULKAN | |||
| float weight_data_int8_scale; | |||
| Mat weight_data_int8_scales; | |||
| float bottom_blob_int8_scale; | |||
| bool use_int8_inference; | |||
| ncnn::Layer* quantize; | |||
| ncnn::Layer* dequantize; | |||
| std::vector<ncnn::Layer*> dequantize_ops; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -38,8 +38,51 @@ int ReLU::load_param(const ParamDict& pd) | |||
| return 0; | |||
| } | |||
| int ReLU::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| int size = w * h; | |||
| if (slope == 0.f) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| signed char* ptr = bottom_top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| if (ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // TODO | |||
| // #pragma omp parallel for num_threads(opt.num_threads) | |||
| // for (int q=0; q<channels; q++) | |||
| // { | |||
| // float* ptr = bottom_top_blob.channel(q); | |||
| // for (int i=0; i<size; i++) | |||
| // { | |||
| // if (ptr[i] < 0) | |||
| // ptr[i] *= slope; | |||
| // } | |||
| // } | |||
| } | |||
| return 0; | |||
| } | |||
| int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const | |||
| { | |||
| if (bottom_top_blob.elemsize == 1u) | |||
| return ReLU::forward_inplace_int8(bottom_top_blob, opt); | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| @@ -27,6 +27,7 @@ public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const; | |||
| virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const; | |||
| #if NCNN_VULKAN | |||
| virtual int create_pipeline(); | |||
| @@ -0,0 +1,195 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #include "requantize.h" | |||
| #include <math.h> | |||
| namespace ncnn { | |||
| DEFINE_LAYER_CREATOR(Requantize) | |||
| Requantize::Requantize() | |||
| { | |||
| one_blob_only = true; | |||
| support_inplace = false; | |||
| fusion_relu = false; | |||
| } | |||
// Round v to the nearest integer (halfway cases away from zero) and saturate
// the result to the int8 range [-128, 127]. NaN maps to 0.
static inline signed char float2int8(float v)
{
    // Saturate while still in floating point: converting a NaN or a value
    // outside the range of int to an integer is undefined behaviour, and in
    // practice can turn a huge positive input into -128.
    if (v != v) return 0; // NaN
    if (v >= 127.f) return 127;
    if (v <= -128.f) return -128;

    // here -128 < v < 127, so the rounded value always fits
    return (signed char)(int)round(v);
}
| int Requantize::load_param(const ParamDict& pd) | |||
| { | |||
| scale_in = pd.get(0, 1.f); // bottom_blob_scale * weight_scale | |||
| scale_out = pd.get(1, 1.f); // top_blob_scale | |||
| bias_term = pd.get(2, 0); | |||
| bias_data_size = pd.get(3, 0); | |||
| fusion_relu = pd.get(4, 0); | |||
| return 0; | |||
| } | |||
| int Requantize::load_model(const ModelBin& mb) | |||
| { | |||
| if (bias_term) | |||
| { | |||
| bias_data = mb.load(bias_data_size, 1); | |||
| if (bias_data.empty()) | |||
| return -100; | |||
| } | |||
| return 0; | |||
| } | |||
| int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const | |||
| { | |||
| int dims = bottom_blob.dims; | |||
| if (dims == 1) | |||
| { | |||
| int w = bottom_blob.w; | |||
| const int* intptr = bottom_blob; | |||
| signed char * ptr = top_blob; | |||
| if (bias_term) | |||
| { | |||
| if (bias_data_size > 1) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| float bias = bias_data[0]; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<w; i++) | |||
| { | |||
| ptr[i] = float2int8(intptr[i] * scale_in * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| if (dims == 2) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| if (bias_term) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| const int* intptr = bottom_blob.row<const int>(i); | |||
| signed char* ptr = top_blob.row<signed char>(i); | |||
| float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0]; | |||
| for (int j=0; j<w; j++) | |||
| { | |||
| ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out); | |||
| if (fusion_relu && ptr[j] < 0) | |||
| ptr[j] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=0; i<h; i++) | |||
| { | |||
| const int* intptr = bottom_blob.row<const int>(i); | |||
| signed char* ptr = top_blob.row<signed char>(i); | |||
| for (int j=0; j<w; j++) | |||
| { | |||
| ptr[j] = float2int8(intptr[j] * scale_in * scale_out); | |||
| if (fusion_relu && ptr[j] < 0) | |||
| ptr[j] = 0; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (dims == 3) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int channels = bottom_blob.c; | |||
| int size = w * h; | |||
| if (bias_term) | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const int* intptr = bottom_blob.channel(q); | |||
| signed char* ptr = top_blob.channel(q); | |||
| float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0]; | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| else | |||
| { | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<channels; q++) | |||
| { | |||
| const int* intptr = bottom_blob.channel(q); | |||
| signed char* ptr = top_blob.channel(q); | |||
| for (int i=0; i<size; i++) | |||
| { | |||
| ptr[i] = float2int8(intptr[i] * scale_in * scale_out); | |||
| if (fusion_relu && ptr[i] < 0) | |||
| ptr[i] = 0; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| } // namespace ncnn | |||
| @@ -0,0 +1,46 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #ifndef LAYER_REQUANTIZE_H | |||
| #define LAYER_REQUANTIZE_H | |||
| #include "layer.h" | |||
| namespace ncnn { | |||
| /* Layer that turns int32 values into saturated int8 values. | |||
| forward() computes out = int8(round((in * scale_in + bias) * scale_out)) | |||
| and, when fusion_relu is set, replaces negative results by 0. | |||
| */ class Requantize : public Layer | |||
| { | |||
| public: | |||
| Requantize(); | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int load_model(const ModelBin& mb); | |||
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
| public: | |||
| float scale_in; // factor applied to the int32 input; the depthwise convolution passes 1 / (bottom_blob_scale * weight_scale) | |||
| float scale_out;// factor applied after the bias and before rounding to int8; the depthwise convolution passes top_blob_scale | |||
| int bias_term; /* non-zero when bias_data is loaded and added */ | |||
| int bias_data_size; /* number of values in bias_data; more than one selects the bias by element, row or channel */ | |||
| bool fusion_relu; /* when true, negative int8 results are replaced by 0 */ | |||
| Mat bias_data; /* bias values, loaded by load_model() only when bias_term is set */ | |||
| }; | |||
| } // namespace ncnn | |||
| #endif // LAYER_REQUANTIZE_H | |||
| @@ -1,6 +1,7 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| @@ -138,3 +139,496 @@ static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_transform_kernel_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch) | |||
| { | |||
| kernel_tm.create(4*4, inch, outch); | |||
| // G | |||
| const float ktm[4][3] = { | |||
| { 1.0f, 0.0f, 0.0f}, | |||
| { 1.0f/2, 1.0f/2, 1.0f/2}, | |||
| { 1.0f/2, -1.0f/2, 1.0f/2}, | |||
| { 0.0f, 0.0f, 1.0f} | |||
| }; | |||
| #pragma omp parallel for | |||
| for (int p = 0; p<outch; p++) | |||
| { | |||
| for (int q = 0; q<inch; q++) | |||
| { | |||
| const float* kernel0 = (const float*)kernel + p*inch * 9 + q * 9; | |||
| float* kernel_tm0 = kernel_tm.channel(p).row(q); | |||
| // transform kernel | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 3; | |||
| const float* k2 = kernel0 + 6; | |||
| // h | |||
| float tmp[4][3]; | |||
| for (int i=0; i<4; i++) | |||
| { | |||
| tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; | |||
| tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; | |||
| tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; | |||
| } | |||
| // U | |||
| for (int j=0; j<4; j++) | |||
| { | |||
| float* tmpp = &tmp[j][0]; | |||
| for (int i=0; i<4; i++) | |||
| { | |||
| kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /* 3x3 stride-1 convolution computed with Winograd F(2,3), in three stages: | |||
| 1. input transform: every 4x4 input window (window origin moves by 2) becomes 16 floats | |||
| 2. dot: per output channel and tile, the element-wise products with the transformed kernel are summed over the input channels | |||
| 3. output transform: every 16-float tile becomes a 2x2 output block, with the bias added | |||
| top_blob must already have its final size: its w, h and c are read here to size everything. | |||
| kernel_tm is read as 16 floats per row, one row per input channel and one channel per output channel | |||
| (presumably the result of conv3x3s1_winograd23_transform_kernel_sse - confirm at the call site). | |||
| _bias may be empty, in which case 0 is used as bias. | |||
| */ static void conv3x3s1_winograd23_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| // pad to 2n+2, winograd F(2,3) | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| /* round the output size up to an even number: every tile produces a 2x2 output block */ outw = (outw + 1) / 2 * 2; | |||
| outh = (outh + 1) / 2 * 2; | |||
| /* a 4x4 window is read for every 2x2 output block, so the padded input is 2 larger than the padded output */ w = outw + 2; | |||
| h = outh + 2; | |||
| /* grow the input to w x h; presumably the extra rows/columns go to the bottom/right and are filled with 0.f - confirm against copy_make_border */ copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| const float* bias = _bias; | |||
| // BEGIN transform input | |||
| Mat bottom_blob_tm; | |||
| { | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| /* despite the names, nColBlocks counts the tiles vertically and nRowBlocks counts them horizontally */ int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| const int tiles = nColBlocks * nRowBlocks; | |||
| /* one row of 16 floats per tile, one channel per input channel */ bottom_blob_tm.create(4*4, tiles, inch, 4u, opt.workspace_allocator); | |||
| // BT | |||
| // const float itm[4][4] = { | |||
| // {1.0f, 0.0f, -1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.00f, 0.0f}, | |||
| // {0.0f, -1.0f, 1.00f, 0.0f}, | |||
| // {0.0f, -1.0f, 0.00f, 1.0f} | |||
| // }; | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| const float* img = bottom_blob_bordered.channel(q); | |||
| float* out_tm0 = bottom_blob_tm.channel(q); | |||
| for (int j = 0; j < nColBlocks; j++) | |||
| { | |||
| /* the four input rows of tile row j; consecutive tile rows overlap by two input rows */ const float* r0 = img + w * j * 2; | |||
| const float* r1 = r0 + w; | |||
| const float* r2 = r1 + w; | |||
| const float* r3 = r2 + w; | |||
| for (int i = 0; i < nRowBlocks; i++) | |||
| { | |||
| float d0[4],d1[4],d2[4],d3[4]; | |||
| float w0[4],w1[4],w2[4],w3[4]; | |||
| float t0[4],t1[4],t2[4],t3[4]; | |||
| // load | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| d0[n] = r0[n]; | |||
| d1[n] = r1[n]; | |||
| d2[n] = r2[n]; | |||
| d3[n] = r3[n]; | |||
| } | |||
| // w = B_t * d | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| w0[n] = d0[n] - d2[n]; | |||
| w1[n] = d1[n] + d2[n]; | |||
| w2[n] = d2[n] - d1[n]; | |||
| w3[n] = d3[n] - d1[n]; | |||
| } | |||
| // transpose d to d_t | |||
| { | |||
| t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3]; | |||
| t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3]; | |||
| t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3]; | |||
| t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3]; | |||
| } | |||
| // d = B_t * d_t | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| d0[n] = t0[n] - t2[n]; | |||
| d1[n] = t1[n] + t2[n]; | |||
| d2[n] = t2[n] - t1[n]; | |||
| d3[n] = t3[n] - t1[n]; | |||
| } | |||
| // save to out_tm | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| out_tm0[n ] = d0[n]; | |||
| out_tm0[n+ 4] = d1[n]; | |||
| out_tm0[n+ 8] = d2[n]; | |||
| out_tm0[n+12] = d3[n]; | |||
| } | |||
| /* the next window starts two columns further right, so neighbouring windows share two columns */ r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| r3 += 2; | |||
| out_tm0 += 16; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /* the padded input is not read again below */ bottom_blob_bordered = Mat(); | |||
| // BEGIN dot | |||
| Mat top_blob_tm; | |||
| { | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| const int tiles = nColBlocks * nRowBlocks; | |||
| top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator); | |||
| /* output channels are handled four at a time here; the channels left over are handled by the second loop */ int nn_outch = outch >> 2; | |||
| int remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| Mat out0_tm = top_blob_tm.channel(p); | |||
| Mat out1_tm = top_blob_tm.channel(p+1); | |||
| Mat out2_tm = top_blob_tm.channel(p+2); | |||
| Mat out3_tm = top_blob_tm.channel(p+3); | |||
| const Mat kernel0_tm = kernel_tm.channel(p); | |||
| const Mat kernel1_tm = kernel_tm.channel(p+1); | |||
| const Mat kernel2_tm = kernel_tm.channel(p+2); | |||
| const Mat kernel3_tm = kernel_tm.channel(p+3); | |||
| for (int i=0; i<tiles; i++) | |||
| { | |||
| float* output0_tm = out0_tm.row(i); | |||
| float* output1_tm = out1_tm.row(i); | |||
| float* output2_tm = out2_tm.row(i); | |||
| float* output3_tm = out3_tm.row(i); | |||
| float sum0[16] = {0.0f}; | |||
| float sum1[16] = {0.0f}; | |||
| float sum2[16] = {0.0f}; | |||
| float sum3[16] = {0.0f}; | |||
| int q = 0; | |||
| /* four input channels per iteration */ for (; q+3<inch; q+=4) | |||
| { | |||
| const float* r0 = bottom_blob_tm.channel(q).row(i); | |||
| const float* r1 = bottom_blob_tm.channel(q+1).row(i); | |||
| const float* r2 = bottom_blob_tm.channel(q+2).row(i); | |||
| const float* r3 = bottom_blob_tm.channel(q+3).row(i); | |||
| const float* k0 = kernel0_tm.row(q); | |||
| const float* k1 = kernel1_tm.row(q); | |||
| const float* k2 = kernel2_tm.row(q); | |||
| const float* k3 = kernel3_tm.row(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += r0[n] * k0[n]; | |||
| /* step to the kernel row of the next input channel; assumes consecutive kernel rows are 16 floats apart */ k0 += 16; | |||
| sum0[n] += r1[n] * k0[n]; | |||
| k0 += 16; | |||
| sum0[n] += r2[n] * k0[n]; | |||
| k0 += 16; | |||
| sum0[n] += r3[n] * k0[n]; | |||
| /* back to the kernel row of input channel q for the next n */ k0 -= 16 * 3; | |||
| sum1[n] += r0[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += r1[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += r2[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += r3[n] * k1[n]; | |||
| k1 -= 16 * 3; | |||
| sum2[n] += r0[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += r1[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += r2[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += r3[n] * k2[n]; | |||
| k2 -= 16 * 3; | |||
| sum3[n] += r0[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += r1[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += r2[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += r3[n] * k3[n]; | |||
| k3 -= 16 * 3; | |||
| } | |||
| } | |||
| /* input channels left over when inch is not a multiple of 4 */ for (; q<inch; q++) | |||
| { | |||
| const float* r0 = bottom_blob_tm.channel(q).row(i); | |||
| const float* k0 = kernel0_tm.row(q); | |||
| const float* k1 = kernel1_tm.row(q); | |||
| const float* k2 = kernel2_tm.row(q); | |||
| const float* k3 = kernel3_tm.row(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += r0[n] * k0[n]; | |||
| sum1[n] += r0[n] * k1[n]; | |||
| sum2[n] += r0[n] * k2[n]; | |||
| sum3[n] += r0[n] * k3[n]; | |||
| } | |||
| } | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| output0_tm[n] = sum0[n]; | |||
| output1_tm[n] = sum1[n]; | |||
| output2_tm[n] = sum2[n]; | |||
| output3_tm[n] = sum3[n]; | |||
| } | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| /* same accumulation for the output channels that did not fill a group of four */ for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out0_tm = top_blob_tm.channel(p); | |||
| const Mat kernel0_tm = kernel_tm.channel(p); | |||
| for (int i=0; i<tiles; i++) | |||
| { | |||
| float* output0_tm = out0_tm.row(i); | |||
| float sum0[16] = {0.0f}; | |||
| int q = 0; | |||
| for (; q+3<inch; q+=4) | |||
| { | |||
| const float* r0 = bottom_blob_tm.channel(q).row(i); | |||
| const float* r1 = bottom_blob_tm.channel(q+1).row(i); | |||
| const float* r2 = bottom_blob_tm.channel(q+2).row(i); | |||
| const float* r3 = bottom_blob_tm.channel(q+3).row(i); | |||
| const float* k0 = kernel0_tm.row(q); | |||
| const float* k1 = kernel0_tm.row(q+1); | |||
| const float* k2 = kernel0_tm.row(q+2); | |||
| const float* k3 = kernel0_tm.row(q+3); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += r0[n] * k0[n]; | |||
| sum0[n] += r1[n] * k1[n]; | |||
| sum0[n] += r2[n] * k2[n]; | |||
| sum0[n] += r3[n] * k3[n]; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| const float* r0 = bottom_blob_tm.channel(q).row(i); | |||
| const float* k0 = kernel0_tm.row(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += r0[n] * k0[n]; | |||
| } | |||
| } | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| output0_tm[n] = sum0[n]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| /* the transformed input is not read again below */ bottom_blob_tm = Mat(); | |||
| // END dot | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| /* sized with the rounded-up outw/outh; the surplus is cut off at the end of the function */ top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); | |||
| { | |||
| // AT | |||
| // const float itm[2][4] = { | |||
| // {1.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 1.0f} | |||
| // }; | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out_tm = top_blob_tm.channel(p); | |||
| Mat out = top_blob_bordered.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| for (int j=0; j<nColBlocks; j++) | |||
| { | |||
| /* every tile row fills two consecutive output rows */ float* outRow0 = out.row(j*2); | |||
| float* outRow1 = out.row(j*2+1); | |||
| for(int i=0; i<nRowBlocks; i++) | |||
| { | |||
| float* out_tile = out_tm.row(j*nRowBlocks + i); | |||
| float s0[4],s1[4],s2[4],s3[4]; | |||
| float w0[4],w1[4]; | |||
| float d0[2],d1[2],d2[2],d3[2]; | |||
| float o0[2],o1[2]; | |||
| // load | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| s0[n] = out_tile[n]; | |||
| s1[n] = out_tile[n+ 4]; | |||
| s2[n] = out_tile[n+ 8]; | |||
| s3[n] = out_tile[n+12]; | |||
| } | |||
| // w = A_T * W | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| w0[n] = s0[n] + s1[n] + s2[n]; | |||
| w1[n] = s1[n] - s2[n] + s3[n]; | |||
| } | |||
| // transpose w to w_t | |||
| { | |||
| d0[0] = w0[0]; d0[1] = w1[0]; | |||
| d1[0] = w0[1]; d1[1] = w1[1]; | |||
| d2[0] = w0[2]; d2[1] = w1[2]; | |||
| d3[0] = w0[3]; d3[1] = w1[3]; | |||
| } | |||
| // Y = A_T * w_t | |||
| for (int n = 0; n < 2; n++) | |||
| { | |||
| o0[n] = d0[n] + d1[n] + d2[n] + bias0; | |||
| o1[n] = d1[n] - d2[n] + d3[n] + bias0; | |||
| } | |||
| // save to top blob tm | |||
| /* o0 goes to the upper output row and o1 to the lower one */ outRow0[0] = o0[0]; | |||
| outRow0[1] = o0[1]; | |||
| outRow1[0] = o1[0]; | |||
| outRow1[1] = o1[1]; | |||
| outRow0 += 2; | |||
| outRow1 += 2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // END transform output | |||
| // cut result pad | |||
| /* remove the rows/columns that only exist because the output size was rounded up to even */ copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); | |||
| } | |||
| static void conv3x3s2_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Mat& _bias, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const int tailstep = w - 2 * outw + w; | |||
| const float* kernel = _kernel; | |||
| const float* bias = _bias; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p = 0; p < outch; p++) | |||
| { | |||
| Mat out = top_blob.channel(p); | |||
| const float bias0 = bias ? bias[p] : 0.f; | |||
| out.fill(bias0); | |||
| for (int q = 0; q < inch; q++) | |||
| { | |||
| float *outptr = out; | |||
| const float *img = bottom_blob.channel(q); | |||
| const float* kernel0 = kernel + p*inch*9 + q*9; | |||
| const float *r0 = img; | |||
| const float *r1 = img + w; | |||
| const float *r2 = img + w * 2; | |||
| const float* k0 = kernel0; | |||
| const float* k1 = kernel0 + 3; | |||
| const float* k2 = kernel0 + 6; | |||
| for (int i = 0; i < outh; i++) | |||
| { | |||
| int remain = outw; | |||
| for (; remain > 0; remain--) | |||
| { | |||
| float sum = 0; | |||
| sum += r0[0] * k0[0]; | |||
| sum += r0[1] * k0[1]; | |||
| sum += r0[2] * k0[2]; | |||
| sum += r1[0] * k1[0]; | |||
| sum += r1[1] * k1[1]; | |||
| sum += r1[2] * k1[2]; | |||
| sum += r2[0] * k2[0]; | |||
| sum += r2[1] * k2[1]; | |||
| sum += r2[2] * k2[2]; | |||
| *outptr += sum; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| outptr++; | |||
| } | |||
| r0 += tailstep; | |||
| r1 += tailstep; | |||
| r2 += tailstep; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -11,12 +11,6 @@ | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
// Clamp a 32-bit value into the signed 16-bit range [-32768, 32767].
static inline short saturate2int16(int v)
{
    const int kMax = 32767;
    const int kMin = -32768;

    const int clamped = (v > kMax) ? kMax : ((v < kMin) ? kMin : v);
    return (short)clamped;
}
| static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| @@ -84,6 +78,424 @@ static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch) | |||
| { | |||
| kernel_tm.create(4*4, inch, outch, 2ul); | |||
| // G | |||
| const short ktm[4][3] = { | |||
| { 2, 0, 0}, | |||
| { 1, 1, 1}, | |||
| { 1, -1, 1}, | |||
| { 0, 0, 2} | |||
| }; | |||
| #pragma omp parallel for | |||
| for (int p = 0; p<outch; p++) | |||
| { | |||
| for (int q = 0; q<inch; q++) | |||
| { | |||
| const signed char* kernel0 = (const signed char*)kernel + p*inch * 9 + q * 9; | |||
| short* kernel_tm0 = kernel_tm.channel(p).row<short>(q); | |||
| // transform kernel | |||
| const signed char* k0 = kernel0; | |||
| const signed char* k1 = kernel0 + 3; | |||
| const signed char* k2 = kernel0 + 6; | |||
| // h | |||
| short tmp[4][3]; | |||
| for (int i=0; i<4; i++) | |||
| { | |||
| tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; | |||
| tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; | |||
| tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; | |||
| } | |||
| // U | |||
| for (int j=0; j<4; j++) | |||
| { | |||
| short* tmpp = &tmp[j][0]; | |||
| for (int i=0; i<4; i++) | |||
| { | |||
| kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int h = bottom_blob.h; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| // pad to 2n+2, winograd F(2,3) | |||
| Mat bottom_blob_bordered = bottom_blob; | |||
| outw = (outw + 1) / 2 * 2; | |||
| outh = (outh + 1) / 2 * 2; | |||
| w = outw + 2; | |||
| h = outh + 2; | |||
| copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); | |||
| // BEGIN transform input | |||
| Mat bottom_blob_tm; | |||
| { | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| const int tiles = nColBlocks * nRowBlocks; | |||
| bottom_blob_tm.create(4*4, tiles, inch, 2u, opt.workspace_allocator); | |||
| // BT | |||
| // const float itm[4][4] = { | |||
| // {1.0f, 0.0f, -1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, 1.00f, 0.0f}, | |||
| // {0.0f, -1.0f, 1.00f, 0.0f}, | |||
| // {0.0f, -1.0f, 0.00f, 1.0f} | |||
| // }; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int q=0; q<inch; q++) | |||
| { | |||
| const signed char* img = bottom_blob_bordered.channel(q); | |||
| short* out_tm0 = bottom_blob_tm.channel(q); | |||
| for (int j = 0; j < nColBlocks; j++) | |||
| { | |||
| const signed char* r0 = img + w * j * 2; | |||
| const signed char* r1 = r0 + w; | |||
| const signed char* r2 = r1 + w; | |||
| const signed char* r3 = r2 + w; | |||
| for (int i = 0; i < nRowBlocks; i++) | |||
| { | |||
| short d0[4],d1[4],d2[4],d3[4]; | |||
| short w0[4],w1[4],w2[4],w3[4]; | |||
| short t0[4],t1[4],t2[4],t3[4]; | |||
| // load | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| d0[n] = r0[n]; | |||
| d1[n] = r1[n]; | |||
| d2[n] = r2[n]; | |||
| d3[n] = r3[n]; | |||
| } | |||
| // w = B_t * d | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| w0[n] = d0[n] - d2[n]; | |||
| w1[n] = d1[n] + d2[n]; | |||
| w2[n] = d2[n] - d1[n]; | |||
| w3[n] = d3[n] - d1[n]; | |||
| } | |||
| // transpose d to d_t | |||
| { | |||
| t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3]; | |||
| t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3]; | |||
| t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3]; | |||
| t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3]; | |||
| } | |||
| // U = B_t * d_t | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| d0[n] = t0[n] - t2[n]; | |||
| d1[n] = t1[n] + t2[n]; | |||
| d2[n] = t2[n] - t1[n]; | |||
| d3[n] = t3[n] - t1[n]; | |||
| } | |||
| // save to out_tm | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| out_tm0[n ] = d0[n]; | |||
| out_tm0[n+ 4] = d1[n]; | |||
| out_tm0[n+ 8] = d2[n]; | |||
| out_tm0[n+12] = d3[n]; | |||
| } | |||
| r0 += 2; | |||
| r1 += 2; | |||
| r2 += 2; | |||
| r3 += 2; | |||
| out_tm0 += 16; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_bordered = Mat(); | |||
| // BEGIN dot | |||
| Mat top_blob_tm; | |||
| { | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| const int tiles = nColBlocks * nRowBlocks; | |||
| top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator); | |||
| int nn_outch = outch >> 2; | |||
| int remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| Mat out0_tm = top_blob_tm.channel(p); | |||
| Mat out1_tm = top_blob_tm.channel(p+1); | |||
| Mat out2_tm = top_blob_tm.channel(p+2); | |||
| Mat out3_tm = top_blob_tm.channel(p+3); | |||
| const Mat kernel0_tm = kernel_tm.channel(p); | |||
| const Mat kernel1_tm = kernel_tm.channel(p+1); | |||
| const Mat kernel2_tm = kernel_tm.channel(p+2); | |||
| const Mat kernel3_tm = kernel_tm.channel(p+3); | |||
| for (int i=0; i<tiles; i++) | |||
| { | |||
| int* output0_tm = out0_tm.row<int>(i); | |||
| int* output1_tm = out1_tm.row<int>(i); | |||
| int* output2_tm = out2_tm.row<int>(i); | |||
| int* output3_tm = out3_tm.row<int>(i); | |||
| int sum0[16] = {0}; | |||
| int sum1[16] = {0}; | |||
| int sum2[16] = {0}; | |||
| int sum3[16] = {0}; | |||
| int q = 0; | |||
| for (; q+3<inch; q+=4) | |||
| { | |||
| const short* r0 = bottom_blob_tm.channel(q).row<short>(i); | |||
| const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i); | |||
| const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i); | |||
| const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i); | |||
| const short* k0 = kernel0_tm.row<short>(q); | |||
| const short* k1 = kernel1_tm.row<short>(q); | |||
| const short* k2 = kernel2_tm.row<short>(q); | |||
| const short* k3 = kernel3_tm.row<short>(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += (int)r0[n] * k0[n]; | |||
| k0 += 16; | |||
| sum0[n] += (int)r1[n] * k0[n]; | |||
| k0 += 16; | |||
| sum0[n] += (int)r2[n] * k0[n]; | |||
| k0 += 16; | |||
| sum0[n] += (int)r3[n] * k0[n]; | |||
| k0 -= 16 * 3; | |||
| sum1[n] += (int)r0[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += (int)r1[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += (int)r2[n] * k1[n]; | |||
| k1 += 16; | |||
| sum1[n] += (int)r3[n] * k1[n]; | |||
| k1 -= 16 * 3; | |||
| sum2[n] += (int)r0[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += (int)r1[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += (int)r2[n] * k2[n]; | |||
| k2 += 16; | |||
| sum2[n] += (int)r3[n] * k2[n]; | |||
| k2 -= 16 * 3; | |||
| sum3[n] += (int)r0[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += (int)r1[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += (int)r2[n] * k3[n]; | |||
| k3 += 16; | |||
| sum3[n] += (int)r3[n] * k3[n]; | |||
| k3 -= 16 * 3; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| const short* r0 = bottom_blob_tm.channel(q).row<short>(i); | |||
| const short* k0 = kernel0_tm.row<short>(q); | |||
| const short* k1 = kernel1_tm.row<short>(q); | |||
| const short* k2 = kernel2_tm.row<short>(q); | |||
| const short* k3 = kernel3_tm.row<short>(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += (int)r0[n] * k0[n]; | |||
| sum1[n] += (int)r0[n] * k1[n]; | |||
| sum2[n] += (int)r0[n] * k2[n]; | |||
| sum3[n] += (int)r0[n] * k3[n]; | |||
| } | |||
| } | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| output0_tm[n] = sum0[n]; | |||
| output1_tm[n] = sum1[n]; | |||
| output2_tm[n] = sum2[n]; | |||
| output3_tm[n] = sum3[n]; | |||
| } | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| Mat out0_tm = top_blob_tm.channel(p); | |||
| const Mat kernel0_tm = kernel_tm.channel(p); | |||
| for (int i=0; i<tiles; i++) | |||
| { | |||
| int* output0_tm = out0_tm.row<int>(i); | |||
| int sum0[16] = {0}; | |||
| int q = 0; | |||
| for (; q+3<inch; q+=4) | |||
| { | |||
| const short* r0 = bottom_blob_tm.channel(q).row<short>(i); | |||
| const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i); | |||
| const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i); | |||
| const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i); | |||
| const short* k0 = kernel0_tm.row<short>(q); | |||
| const short* k1 = kernel0_tm.row<short>(q+1); | |||
| const short* k2 = kernel0_tm.row<short>(q+2); | |||
| const short* k3 = kernel0_tm.row<short>(q+3); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += (int)r0[n] * k0[n]; | |||
| sum0[n] += (int)r1[n] * k1[n]; | |||
| sum0[n] += (int)r2[n] * k2[n]; | |||
| sum0[n] += (int)r3[n] * k3[n]; | |||
| } | |||
| } | |||
| for (; q<inch; q++) | |||
| { | |||
| const short* r0 = bottom_blob_tm.channel(q).row<short>(i); | |||
| const short* k0 = kernel0_tm.row<short>(q); | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| sum0[n] += (int)r0[n] * k0[n]; | |||
| } | |||
| } | |||
| for (int n=0; n<16; n++) | |||
| { | |||
| output0_tm[n] = sum0[n]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| bottom_blob_tm = Mat(); | |||
| // END dot | |||
| // BEGIN transform output | |||
| Mat top_blob_bordered; | |||
| top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); | |||
| { | |||
| // AT | |||
| // const float itm[2][4] = { | |||
| // {1.0f, 1.0f, 1.0f, 0.0f}, | |||
| // {0.0f, 1.0f, -1.0f, 1.0f} | |||
| // }; | |||
| int w_tm = outw / 2 * 4; | |||
| int h_tm = outh / 2 * 4; | |||
| int nColBlocks = h_tm/4; // may be the block num in Feathercnn | |||
| int nRowBlocks = w_tm/4; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<outch; p++) | |||
| { | |||
| Mat out_tm = top_blob_tm.channel(p); | |||
| Mat out = top_blob_bordered.channel(p); | |||
| for (int j=0; j<nColBlocks; j++) | |||
| { | |||
| int* outRow0 = out.row<int>(j*2); | |||
| int* outRow1 = out.row<int>(j*2+1); | |||
| for(int i=0; i<nRowBlocks; i++) | |||
| { | |||
| int* out_tile = out_tm.row<int>(j*nRowBlocks + i); | |||
| int s0[4],s1[4],s2[4],s3[4]; | |||
| int w0[4],w1[4]; | |||
| int d0[2],d1[2],d2[2],d3[2]; | |||
| int o0[2],o1[2]; | |||
| // load | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| s0[n] = out_tile[n]; | |||
| s1[n] = out_tile[n+ 4]; | |||
| s2[n] = out_tile[n+ 8]; | |||
| s3[n] = out_tile[n+12]; | |||
| } | |||
| // w = A_T * W | |||
| for (int n = 0; n < 4; n++) | |||
| { | |||
| w0[n] = s0[n] + s1[n] + s2[n]; | |||
| w1[n] = s1[n] - s2[n] + s3[n]; | |||
| } | |||
| // transpose w to w_t | |||
| { | |||
| d0[0] = w0[0]; d0[1] = w1[0]; | |||
| d1[0] = w0[1]; d1[1] = w1[1]; | |||
| d2[0] = w0[2]; d2[1] = w1[2]; | |||
| d3[0] = w0[3]; d3[1] = w1[3]; | |||
| } | |||
| // Y = A_T * w_t | |||
| for (int n = 0; n < 2; n++) | |||
| { | |||
| o0[n] = d0[n] + d1[n] + d2[n]; | |||
| o1[n] = d1[n] - d2[n] + d3[n]; | |||
| } | |||
| // save to top blob tm,why right 2,because the G' = G*2 | |||
| outRow0[0] = o0[0] >> 2; | |||
| outRow0[1] = o0[1] >> 2; | |||
| outRow1[0] = o1[0] >> 2; | |||
| outRow1[1] = o1[1] >> 2; | |||
| outRow0 += 2; | |||
| outRow1 += 2; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| // END transform output | |||
| // cut result pad | |||
| copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); | |||
| } | |||
| static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -122,23 +534,19 @@ static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat | |||
| for (; remain > 0; remain--) | |||
| { | |||
| short sum0 = 0; | |||
| short sum1 = 0; | |||
| short sum2 = 0; | |||
| sum0 += (short)r0[0] * kernel0[0]; | |||
| sum0 += (short)r0[1] * kernel0[1]; | |||
| sum0 += (short)r0[2] * kernel0[2]; | |||
| sum1 += (short)r1[0] * kernel0[3]; | |||
| sum1 += (short)r1[1] * kernel0[4]; | |||
| sum1 += (short)r1[2] * kernel0[5]; | |||
| sum2 += (short)r2[0] * kernel0[6]; | |||
| sum2 += (short)r2[1] * kernel0[7]; | |||
| sum2 += (short)r2[2] * kernel0[8]; | |||
| *outptr0 = saturate2int16(*outptr0 + sum0); | |||
| *outptr0 = saturate2int16(*outptr0 + sum1); | |||
| *outptr0 = saturate2int16(*outptr0 + sum2); | |||
| int sum0 = 0; | |||
| sum0 += (int)r0[0] * kernel0[0]; | |||
| sum0 += (int)r0[1] * kernel0[1]; | |||
| sum0 += (int)r0[2] * kernel0[2]; | |||
| sum0 += (int)r1[0] * kernel0[3]; | |||
| sum0 += (int)r1[1] * kernel0[4]; | |||
| sum0 += (int)r1[2] * kernel0[5]; | |||
| sum0 += (int)r2[0] * kernel0[6]; | |||
| sum0 += (int)r2[1] * kernel0[7]; | |||
| sum0 += (int)r2[2] * kernel0[8]; | |||
| *outptr0 += sum0; | |||
| r0 += 2; | |||
| r1 += 2; | |||
| @@ -0,0 +1,35 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv5x5s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int kernel_w = 5; | |||
| int kernel_h = 5; | |||
| int stride_w = 1; | |||
| int stride_h = 1; | |||
| conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); | |||
| } | |||
| static void conv5x5s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int kernel_w = 5; | |||
| int kernel_h = 5; | |||
| int stride_w = 2; | |||
| int stride_h = 2; | |||
| conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); | |||
| } | |||
| @@ -0,0 +1,35 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv7x7s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int kernel_w = 7; | |||
| int kernel_h = 7; | |||
| int stride_w = 1; | |||
| int stride_h = 1; | |||
| conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); | |||
| } | |||
| static void conv7x7s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) | |||
| { | |||
| int kernel_w = 7; | |||
| int kernel_h = 7; | |||
| int stride_w = 2; | |||
| int stride_h = 2; | |||
| conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); | |||
| } | |||
| @@ -0,0 +1,381 @@ | |||
| // SenseNets is pleased to support the open source community by supporting ncnn available. | |||
| // | |||
| // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| static void conv_im2col_sgemm_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \ | |||
| const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt) | |||
| { | |||
| int w = bottom_blob.w; | |||
| int inch = bottom_blob.c; | |||
| int outw = top_blob.w; | |||
| int outh = top_blob.h; | |||
| int outch = top_blob.c; | |||
| const signed char *kernel = _kernel; | |||
| // im2col | |||
| Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator); | |||
| { | |||
| const int stride = kernel_h*kernel_w*outw*outh; | |||
| signed char* ret = (signed char*)bottom_im2col; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<inch; p++) | |||
| { | |||
| const signed char* input = bottom_blob.channel(p); | |||
| int retID = stride * p; | |||
| for (int u=0; u<kernel_h; u++) | |||
| { | |||
| for (int v=0; v<kernel_w; v++) | |||
| { | |||
| for (int i=0; i<outh; i++) | |||
| { | |||
| for (int j=0; j<outw; j++) | |||
| { | |||
| int row = u + i * stride_h; | |||
| int col = v + j * stride_w; | |||
| int index = row * w + col; | |||
| ret[retID] = input[index]; | |||
| retID++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| int kernel_size = kernel_w * kernel_h; | |||
| int out_size = outw * outh; | |||
| // bottom_im2col memory packed 4 x 8 | |||
| Mat bottom_tm(8*kernel_size, inch, out_size/8 + out_size%8, (size_t)1u, opt.workspace_allocator); | |||
| { | |||
| int nn_size = out_size >> 3; | |||
| int remain_size_start = nn_size << 3; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int ii=0; ii<nn_size; ii++) | |||
| { | |||
| int i = ii * 8; | |||
| const signed char* img0 = bottom_im2col.channel(0); | |||
| img0 += i; | |||
| signed char* tmpptr = bottom_tm.channel(i/8); | |||
| for (int q=0; q<inch*kernel_size; q++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr[1] = img0[1]; | |||
| tmpptr[2] = img0[2]; | |||
| tmpptr[3] = img0[3]; | |||
| tmpptr[4] = img0[4]; | |||
| tmpptr[5] = img0[5]; | |||
| tmpptr[6] = img0[6]; | |||
| tmpptr[7] = img0[7]; | |||
| tmpptr += 8; | |||
| img0 += out_size; | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=remain_size_start; i<out_size; i++) | |||
| { | |||
| const signed char* img0 = bottom_im2col.channel(0); | |||
| img0 += i; | |||
| signed char* tmpptr = bottom_tm.channel(i/8 + i%8); | |||
| for (int q=0; q<inch*kernel_size; q++) | |||
| { | |||
| tmpptr[0] = img0[0]; | |||
| tmpptr += 1; | |||
| img0 += out_size; | |||
| } | |||
| } | |||
| } | |||
| // kernel memory packed 4 x 8 | |||
| Mat kernel_tm(4*kernel_size, inch, outch/4 + outch%4, (size_t)1u, opt.workspace_allocator); | |||
| { | |||
| int nn_outch = 0; | |||
| int remain_outch_start = 0; | |||
| nn_outch = outch >> 2; | |||
| remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int p = pp * 4; | |||
| const signed char* k0 = kernel + (p+0)*inch*kernel_size; | |||
| const signed char* k1 = kernel + (p+1)*inch*kernel_size; | |||
| const signed char* k2 = kernel + (p+2)*inch*kernel_size; | |||
| const signed char* k3 = kernel + (p+3)*inch*kernel_size; | |||
| signed char* ktmp = kernel_tm.channel(p/4); | |||
| for (int q=0; q<inch*kernel_size; q++) | |||
| { | |||
| ktmp[0] = k0[0]; | |||
| ktmp[1] = k1[0]; | |||
| ktmp[2] = k2[0]; | |||
| ktmp[3] = k3[0]; | |||
| ktmp += 4; | |||
| k0 += 1; | |||
| k1 += 1; | |||
| k2 += 1; | |||
| k3 += 1; | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=remain_outch_start; p<outch; p++) | |||
| { | |||
| const signed char* k0 = kernel + (p+0)*inch*kernel_size; | |||
| signed char* ktmp = kernel_tm.channel(p/4 + p%4); | |||
| for (int q=0; q<inch*kernel_size; q++) | |||
| { | |||
| ktmp[0] = k0[0]; | |||
| ktmp++; | |||
| k0++; | |||
| } | |||
| } | |||
| } | |||
| // sgemm(int M, int N, int L, float* A, float* B, float* C) | |||
| { | |||
| // int M = outch; // outch | |||
| int N = outw * outh; // outsize or out stride | |||
| int L = kernel_w * kernel_h * inch; // ksize * inch | |||
| int nn_outch = 0; | |||
| int remain_outch_start = 0; | |||
| nn_outch = outch >> 2; | |||
| remain_outch_start = nn_outch << 2; | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int pp=0; pp<nn_outch; pp++) | |||
| { | |||
| int i = pp * 4; | |||
| int* output0 = top_blob.channel(i); | |||
| int* output1 = top_blob.channel(i+1); | |||
| int* output2 = top_blob.channel(i+2); | |||
| int* output3 = top_blob.channel(i+3); | |||
| int j=0; | |||
| for (; j+7<N; j=j+8) | |||
| { | |||
| signed char* vb = bottom_tm.channel(j/8); | |||
| signed char* va = kernel_tm.channel(i/4); | |||
| int sum0[8] = {0}; | |||
| int sum1[8] = {0}; | |||
| int sum2[8] = {0}; | |||
| int sum3[8] = {0}; | |||
| int k=0; | |||
| for (; k+7<L; k=k+8) | |||
| { | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| sum0[n] += (int)va[0] * vb[n]; | |||
| sum1[n] += (int)va[1] * vb[n]; | |||
| sum2[n] += (int)va[2] * vb[n]; | |||
| sum3[n] += (int)va[3] * vb[n]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+8]; | |||
| sum1[n] += (int)va[1] * vb[n+8]; | |||
| sum2[n] += (int)va[2] * vb[n+8]; | |||
| sum3[n] += (int)va[3] * vb[n+8]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+16]; | |||
| sum1[n] += (int)va[1] * vb[n+16]; | |||
| sum2[n] += (int)va[2] * vb[n+16]; | |||
| sum3[n] += (int)va[3] * vb[n+16]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+24]; | |||
| sum1[n] += (int)va[1] * vb[n+24]; | |||
| sum2[n] += (int)va[2] * vb[n+24]; | |||
| sum3[n] += (int)va[3] * vb[n+24]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+32]; | |||
| sum1[n] += (int)va[1] * vb[n+32]; | |||
| sum2[n] += (int)va[2] * vb[n+32]; | |||
| sum3[n] += (int)va[3] * vb[n+32]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+40]; | |||
| sum1[n] += (int)va[1] * vb[n+40]; | |||
| sum2[n] += (int)va[2] * vb[n+40]; | |||
| sum3[n] += (int)va[3] * vb[n+40]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+48]; | |||
| sum1[n] += (int)va[1] * vb[n+48]; | |||
| sum2[n] += (int)va[2] * vb[n+48]; | |||
| sum3[n] += (int)va[3] * vb[n+48]; | |||
| va += 4; | |||
| sum0[n] += (int)va[0] * vb[n+56]; | |||
| sum1[n] += (int)va[1] * vb[n+56]; | |||
| sum2[n] += (int)va[2] * vb[n+56]; | |||
| sum3[n] += (int)va[3] * vb[n+56]; | |||
| va -= 28; | |||
| } | |||
| va += 32; | |||
| vb += 64; | |||
| } | |||
| for (; k<L; k++) | |||
| { | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| sum0[n] += (int)va[0] * vb[n]; | |||
| sum1[n] += (int)va[1] * vb[n]; | |||
| sum2[n] += (int)va[2] * vb[n]; | |||
| sum3[n] += (int)va[3] * vb[n]; | |||
| } | |||
| va += 4; | |||
| vb += 8; | |||
| } | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| output0[n] = sum0[n]; | |||
| output1[n] = sum1[n]; | |||
| output2[n] = sum2[n]; | |||
| output3[n] = sum3[n]; | |||
| } | |||
| output0 += 8; | |||
| output1 += 8; | |||
| output2 += 8; | |||
| output3 += 8; | |||
| } | |||
| for (; j<N; j++) | |||
| { | |||
| int sum0 = 0; | |||
| int sum1 = 0; | |||
| int sum2 = 0; | |||
| int sum3 = 0; | |||
| signed char* vb = bottom_tm.channel(j/8 + j%8); | |||
| signed char* va = kernel_tm.channel(i/4); | |||
| for (int k=0; k<L; k++) | |||
| { | |||
| sum0 += (int)va[0] * vb[0]; | |||
| sum1 += (int)va[1] * vb[0]; | |||
| sum2 += (int)va[2] * vb[0]; | |||
| sum3 += (int)va[3] * vb[0]; | |||
| va += 4; | |||
| vb += 1; | |||
| } | |||
| output0[0] = sum0; | |||
| output1[0] = sum1; | |||
| output2[0] = sum2; | |||
| output3[0] = sum3; | |||
| output0++; | |||
| output1++; | |||
| output2++; | |||
| output3++; | |||
| } | |||
| } | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int i=remain_outch_start; i<outch; i++) | |||
| { | |||
| int* output = top_blob.channel(i); | |||
| int j=0; | |||
| for (; j+7<N; j=j+8) | |||
| { | |||
| signed char* vb = bottom_tm.channel(j/8); | |||
| signed char* va = kernel_tm.channel(i/4 + i%4); | |||
| int sum[8] = {0}; | |||
| int k=0; | |||
| for (; k+7<L; k=k+8) | |||
| { | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| sum[n] += (int)va[0] * vb[n]; | |||
| sum[n] += (int)va[1] * vb[n+8]; | |||
| sum[n] += (int)va[2] * vb[n+16]; | |||
| sum[n] += (int)va[3] * vb[n+24]; | |||
| sum[n] += (int)va[4] * vb[n+32]; | |||
| sum[n] += (int)va[5] * vb[n+40]; | |||
| sum[n] += (int)va[6] * vb[n+48]; | |||
| sum[n] += (int)va[7] * vb[n+56]; | |||
| } | |||
| va += 8; | |||
| vb += 64; | |||
| } | |||
| for (; k<L; k++) | |||
| { | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| sum[n] += (int)va[0] * vb[n]; | |||
| } | |||
| va += 1; | |||
| vb += 8; | |||
| } | |||
| for (int n=0; n<8; n++) | |||
| { | |||
| output[n] = sum[n]; | |||
| } | |||
| output += 8; | |||
| } | |||
| for (; j<N; j++) | |||
| { | |||
| int sum = 0; | |||
| signed char* vb = bottom_tm.channel(j/8 + j%8); | |||
| signed char* va = kernel_tm.channel(i/4 + i%4); | |||
| for (int k=0; k<L; k++) | |||
| { | |||
| sum += (int)va[0] * vb[0]; | |||
| va += 1; | |||
| vb += 1; | |||
| } | |||
| output[0] = sum; | |||
| output++; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| @@ -14,17 +14,61 @@ | |||
| #include "convolution_x86.h" | |||
| #include "layer_type.h" | |||
| #include "benchmark.h" | |||
| namespace ncnn { | |||
| #include "convolution_1x1.h" | |||
| #include "convolution_3x3.h" | |||
| #include "convolution_5x5.h" | |||
| #include "convolution_sgemm_int8.h" | |||
| #include "convolution_1x1_int8.h" | |||
| #include "convolution_3x3_int8.h" | |||
| #include "convolution_5x5_int8.h" | |||
| #include "convolution_7x7_int8.h" | |||
| DEFINE_LAYER_CREATOR(Convolution_x86) | |||
| int Convolution_x86::load_param(const ParamDict& pd) | |||
| { | |||
| int ret = Convolution::load_param(pd); | |||
| if (ret != 0) | |||
| return ret; | |||
| use_winograd3x3 = false; | |||
| if (pd.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) | |||
| { | |||
| int num_input = weight_data_size / 9 / num_output; | |||
| // winograd is slow on small channel count | |||
| if(num_input >= 16 && num_output >= 16) | |||
| use_winograd3x3 = true; | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution_x86::load_model(const ModelBin& mb) | |||
| { | |||
| int ret = Convolution::load_model(mb); | |||
| if (ret != 0) | |||
| return ret; | |||
| if (use_winograd3x3) | |||
| { | |||
| int num_input = weight_data_size / 9 / num_output; | |||
| if (use_int8_inference) | |||
| conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output); | |||
| else | |||
| conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output); | |||
| } | |||
| return 0; | |||
| } | |||
| int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const | |||
| { | |||
| int w = bottom_blob.w; | |||
| @@ -147,7 +191,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| const int kernel_size = kernel_w; | |||
| const int stride = stride_w; | |||
| if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h) | |||
| if (kernel_size > 7 || stride > 7 || dilation_w != dilation_h) | |||
| { | |||
| return Convolution::forward(bottom_blob, top_blob, opt); | |||
| } | |||
| @@ -155,26 +199,23 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| conv_func conv_func_table[5][5] = | |||
| conv_func conv_func_table[7][4] = | |||
| { | |||
| { | |||
| conv1x1s1_sse, | |||
| conv1x1s2_sse, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 1 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 2 | |||
| { | |||
| conv3x3s1_sse, | |||
| 0, | |||
| 0, | |||
| conv3x3s2_sse, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 3 | |||
| @@ -182,35 +223,43 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 4 | |||
| { | |||
| conv5x5s1_sse, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 5 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 6 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 5 | |||
| } // kernel_size = 7 | |||
| }; | |||
| typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&); | |||
| // kernel_size x stride | |||
| conv_int8_func conv_int8_func_table[5][5] = | |||
| conv_int8_func conv_int8_func_table[7][4] = | |||
| { | |||
| { | |||
| conv1x1s1_int8_sse, | |||
| conv1x1s2_int8_sse, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 1 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 2 | |||
| { | |||
| @@ -218,22 +267,31 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| conv3x3s2_int8_sse, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 3 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 4 | |||
| { | |||
| conv5x5s1_int8_sse, | |||
| conv5x5s2_int8_sse, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 5 | |||
| { | |||
| 0, | |||
| 0, | |||
| 0, | |||
| 0 | |||
| }, // kernel_size = 6 | |||
| { | |||
| conv7x7s1_int8_sse, | |||
| conv7x7s2_int8_sse, | |||
| 0, | |||
| 0 | |||
| } // kernel_size = 5 | |||
| } // kernel_size = 7 | |||
| }; | |||
| conv_func conv = 0; | |||
| @@ -322,21 +380,69 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option | |||
| if (use_int8_inference) | |||
| { | |||
| conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| // dequantize, reverse scale inplace | |||
| if (use_int8_requantize == true) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm; | |||
| top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); | |||
| if (top_blob_tm.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (use_winograd3x3) | |||
| conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt); | |||
| else | |||
| conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| dequantize->forward_inplace(top_blob, opt_g); | |||
| // requantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1); | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (use_winograd3x3) | |||
| conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt); | |||
| else | |||
| conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int p=0; p<num_output; p++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel_range(p, 1); | |||
| dequantize_ops[p]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| } | |||
| return 0; | |||
| } | |||
| conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| if (use_winograd3x3) | |||
| { | |||
| conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt); | |||
| } | |||
| else | |||
| conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); | |||
| return 0; | |||
| } | |||
| @@ -24,8 +24,16 @@ typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option | |||
// x86 specialization of the Convolution layer.
| class Convolution_x86 : public Convolution | |||
| { | |||
| public: | |||
| virtual int load_param(const ParamDict& pd); | |||
| virtual int load_model(const ModelBin& mb); | |||
// Runs the convolution on bottom_blob and writes the result to top_blob.
| virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; | |||
// Variant of forward that receives the convolution kernel function to use as `conv`.
// NOTE(review): presumably taken for dilated convolutions - confirm against the implementation.
| virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const; | |||
| public: | |||
// When true, forward() calls the conv3x3s1 winograd23 kernels
// instead of the kernel picked from the kernel_size x stride tables.
| bool use_winograd3x3; | |||
// Weight blob handed to the winograd23 kernels in place of weight_data.
| Mat weight_3x3_winograd23_data; | |||
| }; | |||
| } // namespace ncnn | |||
| @@ -134,7 +134,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| } | |||
| const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; | |||
| Mat bottom_blob_unbordered = bottom_blob; | |||
| if (use_int8_inference && elemsize != 1) | |||
| @@ -159,8 +159,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g); | |||
| } | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| } | |||
| bottom_blob_unbordered = bottom_blob_int8; | |||
| } | |||
| Mat bottom_blob_bordered = bottom_blob_unbordered; | |||
| if (pad_w > 0 || pad_h > 0) | |||
| @@ -203,25 +203,65 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con | |||
| { | |||
| if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) | |||
| { | |||
| if (stride_w == 1 && stride_h == 1) | |||
| if (use_int8_requantize) | |||
| { | |||
| convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| Mat top_blob_tm; | |||
| top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); | |||
| if (top_blob_tm.empty()) | |||
| return -100; | |||
| top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt); | |||
| } | |||
| // requantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1); | |||
| Mat top_blob_g = top_blob.channel_range(g, 1); | |||
| requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g); | |||
| } | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| else | |||
| { | |||
| convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel(g); | |||
| dequantize_ops[g]->forward_inplace(top_blob_g, opt_g); | |||
| top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); | |||
| if (top_blob.empty()) | |||
| return -100; | |||
| if (stride_w == 1 && stride_h == 1) | |||
| { | |||
| convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| else if (stride_w == 2 && stride_h == 2) | |||
| { | |||
| convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); | |||
| } | |||
| // dequantize, reverse scale inplace | |||
| #pragma omp parallel for num_threads(opt.num_threads) | |||
| for (int g=0; g<group; g++) | |||
| { | |||
| ncnn::Option opt_g = opt; | |||
| opt_g.num_threads = 1; | |||
| opt_g.blob_allocator = top_blob.allocator; | |||
| Mat top_blob_g = top_blob.channel(g); | |||
| dequantize_ops[g]->forward_inplace(top_blob_g, opt_g); | |||
| } | |||
| } | |||
| return 0; | |||
| @@ -16,6 +16,9 @@ | |||
| #include "layer_type.h" | |||
| #include "modelbin.h" | |||
| #include "paramdict.h" | |||
| #include "convolution.h" | |||
| #include "convolutiondepthwise.h" | |||
| #include "relu.h" | |||
| #include <stdarg.h> | |||
| #include <stdio.h> | |||
| @@ -679,6 +682,8 @@ int Net::load_model(FILE* fp) | |||
| } | |||
| #endif // NCNN_VULKAN | |||
| fuse_network(); | |||
| return ret; | |||
| } | |||
| @@ -898,6 +903,110 @@ int Net::load_model(const unsigned char* _mem) | |||
| return mem - _mem; | |||
| } | |||
| void Net::fuse_network() | |||
| { | |||
| // set the int8 op fusion:requantize | |||
| #if NCNN_STRING && NCNN_REQUANT | |||
| // fprintf(stderr, "Test op fusion to int8 implement:\n"); | |||
| for (size_t i=0; i<layers.size(); i++) | |||
| { | |||
| Layer* layer = layers[i]; | |||
| if (layer->type == "Convolution" || layer->type == "ConvolutionDepthWise") | |||
| { | |||
| if (((Convolution*)layer)->use_int8_inference == false) | |||
| continue; | |||
| for (size_t n=0; n<blobs[layer->tops[0]].consumers.size(); n++) | |||
| { | |||
| int layer_next_index = blobs[layer->tops[0]].consumers[n]; | |||
| Layer* layer_next = layers[layer_next_index]; | |||
| if (layer_next->type == "ReLU") | |||
| { | |||
| int layer_next_2_index = blobs[layer_next->tops[0]].consumers[0]; | |||
| Layer* layer_next_2 = layers[layer_next_2_index]; | |||
| if (layer_next_2->type == "Convolution" || layer_next_2->type == "ConvolutionDepthWise") | |||
| { | |||
| // fprintf(stderr, "%s, %s, %s\n", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); | |||
| if (layer->type == "Convolution" && layer_next_2->type == "Convolution") | |||
| { | |||
| ((Convolution*)layer)->use_int8_requantize = true; | |||
| ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; | |||
| ((Convolution*)layer)->create_requantize_op(); | |||
| } | |||
| else if (layer->type == "ConvolutionDepthWise" && layer_next_2->type == "Convolution") | |||
| { | |||
| ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; | |||
| ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; | |||
| ((ConvolutionDepthWise*)layer)->create_requantize_op(); | |||
| } | |||
| else if (layer->type == "Convolution" && layer_next_2->type == "ConvolutionDepthWise") | |||
| { | |||
| ((Convolution*)layer)->use_int8_requantize = true; | |||
| ((Convolution*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; | |||
| ((Convolution*)layer)->create_requantize_op(); | |||
| } | |||
| else | |||
| { | |||
| ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; | |||
| ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; | |||
| ((ConvolutionDepthWise*)layer)->create_requantize_op(); | |||
| } | |||
| } | |||
| else if (layer_next_2->type == "Split") | |||
| { | |||
| bool all_conv = true; | |||
| for (size_t i=0; i<layer_next_2->tops.size(); i++) | |||
| { | |||
| int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; | |||
| if (layers[layer_next_3_index]->type != "Convolution" && layers[layer_next_3_index]->type != "ConvolutionDepthWise" && layers[layer_next_3_index]->type != "PriorBox" ) | |||
| { | |||
| // fprintf(stderr, "%s, %s, %s, %s\n", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str(), layers[layer_next_3_index]->name.c_str()); | |||
| all_conv = false; | |||
| } | |||
| } | |||
| if (all_conv == true && layer_next_2->tops.size() >= size_t(2)) | |||
| { | |||
| // fprintf(stderr, "%s, %s, %s, ", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); | |||
| for (size_t i=0; i<layer_next_2->tops.size(); i++) | |||
| { | |||
| int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; | |||
| Layer* layer_next_3 = layers[layer_next_3_index]; | |||
| // fprintf(stderr, "%s, ", layer_next_3->name.c_str()); | |||
| if (layer_next_3->type == "Convolution") | |||
| { | |||
| ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_3)->bottom_blob_int8_scale; | |||
| } | |||
| } | |||
| ((Convolution*)layer)->use_int8_requantize = true; | |||
| ((Convolution*)layer)->create_requantize_op(); | |||
| // fprintf(stderr, "\n"); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| // fprintf(stderr, "%s, %s\n", layer->name.c_str(), layer_next->name.c_str()); | |||
| } | |||
| } | |||
| else if (layer_next->type == "Pooling") | |||
| { | |||
| // ToDo | |||
| } | |||
| else | |||
| { | |||
| // fprintf(stderr, "%s\n", layer->name.c_str()); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| void Net::clear() | |||
| { | |||
| blobs.clear(); | |||
| @@ -76,6 +76,10 @@ public: | |||
| // return bytes consumed | |||
| int load_model(const unsigned char* mem); | |||
| // parse the network structure and, where possible, | |||
| // fuse an int8 dequantize + quantize pair into a single requantize | |||
| void fuse_network(); | |||
| // unload network structure and weight data | |||
| void clear(); | |||
| @@ -22,5 +22,7 @@ | |||
| #cmakedefine01 NCNN_PIXEL | |||
| #cmakedefine01 NCNN_PIXEL_ROTATE | |||
| #cmakedefine01 NCNN_VULKAN | |||
| #cmakedefine01 NCNN_REQUANT | |||
| #cmakedefine01 NCNN_IM2COL_SGEMM | |||
| #endif // NCNN_PLATFORM_H | |||
| @@ -685,7 +685,7 @@ int main(int argc, char** argv) | |||
| if (int8_scale_term) | |||
| { | |||
| if ((int)weight_int8scale.size() == num_group && (int)blob_int8scale.size() == num_group) | |||
| if ((int)weight_int8scale.size() == num_group) | |||
| { | |||
| fprintf(pp, " 8=1"); | |||
| } | |||