diff --git a/CMakeLists.txt b/CMakeLists.txt index c430db591..15b8ba95a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,6 +32,8 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON) option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" OFF) option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF) option(NCNN_VULKAN "vulkan compute support" OFF) +option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF) +option(NCNN_IM2COL_SGEMM "im2col sgemm support" OFF) if(NCNN_OPENMP) find_package(OpenMP) diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index fa7978c19..454e086f9 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -202,7 +202,7 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const time_avg /= g_loop_count; - fprintf(stderr, "%16s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg); + fprintf(stderr, "%-20s min = %7.2f max = %7.2f avg = %7.2f\n", comment, time_min, time_max, time_avg); } void squeezenet_init(ncnn::Net& net) @@ -210,6 +210,11 @@ void squeezenet_init(ncnn::Net& net) net.load_param("squeezenet.param"); } +void squeezenet_int8_init(ncnn::Net& net) +{ + net.load_param("squeezenet_int8.param"); +} + void squeezenet_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -226,6 +231,11 @@ void mobilenet_init(ncnn::Net& net) net.load_param("mobilenet.param"); } +void mobilenet_int8_init(ncnn::Net& net) +{ + net.load_param("mobilenet_int8.param"); +} + void mobilenet_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -306,6 +316,11 @@ void googlenet_init(ncnn::Net& net) net.load_param("googlenet.param"); } +void googlenet_int8_init(ncnn::Net& net) +{ + net.load_param("googlenet_int8.param"); +} + void googlenet_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -322,6 +337,11 @@ void resnet18_init(ncnn::Net& net) net.load_param("resnet18.param"); } +void resnet18_int8_init(ncnn::Net& net) +{ + net.load_param("resnet18_int8.param"); +} + void resnet18_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -354,6 +374,11 @@ void vgg16_init(ncnn::Net& net) net.load_param("vgg16.param"); } +void vgg16_int8_init(ncnn::Net& net) +{ + net.load_param("vgg16_int8.param"); +} + void vgg16_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -365,11 +390,37 @@ void vgg16_run(const ncnn::Net& net) ex.extract("prob", out); } +void resnet50_init(ncnn::Net& net) +{ + net.load_param("resnet50.param"); +} + +void resnet50_int8_init(ncnn::Net& net) +{ + net.load_param("resnet50_int8.param"); +} + +void resnet50_run(const ncnn::Net& net) +{ + ncnn::Extractor ex = net.create_extractor(); + + ncnn::Mat in(224, 224, 3); + ex.input("data", in); + + ncnn::Mat out; + ex.extract("prob", out); +} + void squeezenet_ssd_init(ncnn::Net& net) { net.load_param("squeezenet_ssd.param"); } +void squeezenet_ssd_int8_init(ncnn::Net& net) +{ + net.load_param("squeezenet_ssd_int8.param"); +} + void squeezenet_ssd_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -386,6 +437,11 @@ void mobilenet_ssd_init(ncnn::Net& net) net.load_param("mobilenet_ssd.param"); } +void mobilenet_ssd_int8_init(ncnn::Net& net) +{ + net.load_param("mobilenet_ssd_int8.param"); +} + void mobilenet_ssd_run(const ncnn::Net& net) { ncnn::Extractor ex = net.create_extractor(); @@ -497,8 +553,12 @@ int main(int argc, char** argv) // run benchmark("squeezenet", squeezenet_init, squeezenet_run); + benchmark("squeezenet-int8", squeezenet_int8_init, squeezenet_run); + benchmark("mobilenet", mobilenet_init, mobilenet_run); + benchmark("mobilenet-int8", mobilenet_int8_init, mobilenet_run); + benchmark("mobilenet_v2", mobilenet_v2_init, mobilenet_v2_run); benchmark("shufflenet", shufflenet_init, shufflenet_run); @@ -509,16 +569,28 @@ int main(int argc, char** argv) benchmark("googlenet", googlenet_init, googlenet_run); + benchmark("googlenet-int8", googlenet_int8_init, googlenet_run); + benchmark("resnet18", resnet18_init, resnet18_run); + benchmark("resnet18-int8", resnet18_int8_init, resnet18_run); + benchmark("alexnet", alexnet_init, alexnet_run); benchmark("vgg16", vgg16_init, vgg16_run); + benchmark("resnet50", resnet50_init, resnet50_run); + + benchmark("resnet50-int8", resnet50_int8_init, resnet50_run); + benchmark("squeezenet-ssd", squeezenet_ssd_init, squeezenet_ssd_run); + benchmark("squeezenet-ssd-int8", squeezenet_ssd_int8_init, squeezenet_ssd_run); + benchmark("mobilenet-ssd", mobilenet_ssd_init, mobilenet_ssd_run); + benchmark("mobilenet-ssd-int8", mobilenet_ssd_int8_init, mobilenet_ssd_run); + benchmark("mobilenet-yolo", mobilenet_yolo_init, mobilenet_yolo_run); benchmark("mobilenet-yolov3", mobilenet_yolov3_init, mobilenet_yolov3_run); diff --git a/benchmark/googlenet_int8.param b/benchmark/googlenet_int8.param new file mode 100755 index 000000000..d2dfeadc1 --- /dev/null +++ b/benchmark/googlenet_int8.param @@ -0,0 +1,154 @@ +7767517 +152 179 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1/7x7_s2 1 1 data conv1/7x7_s2 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2 +ReLU conv1/relu_7x7 1 1 conv1/7x7_s2 conv1/7x7_s2_conv1/relu_7x7 +Pooling pool1/3x3_s2 1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 0=0 1=3 2=2 3=0 4=0 +LRN pool1/norm1 1 1 pool1/3x3_s2 pool1/norm1 0=0 1=5 2=0.000100 3=0.750000 +Convolution conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce 0=64 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU conv2/relu_3x3_reduce 1 1 conv2/3x3_reduce conv2/3x3_reduce_conv2/relu_3x3_reduce +Convolution conv2/3x3 1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 +ReLU conv2/relu_3x3 1 1 conv2/3x3 conv2/3x3_conv2/relu_3x3 +LRN conv2/norm2 1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 0=0 1=5 2=0.000100 3=0.750000 +Pooling pool2/3x3_s2 1 1 conv2/norm2 pool2/3x3_s2 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_0 1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3 +Convolution inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +ReLU inception_3a/relu_1x1 1 1 inception_3a/1x1 inception_3a/1x1_inception_3a/relu_1x1 +Convolution inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 +ReLU inception_3a/relu_3x3_reduce 1 1 inception_3a/3x3_reduce inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce +Convolution inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 +ReLU inception_3a/relu_3x3 1 1 inception_3a/3x3 inception_3a/3x3_inception_3a/relu_3x3 +Convolution inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=3072 8=2 +ReLU inception_3a/relu_5x5_reduce 1 1 inception_3a/5x5_reduce inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce +Convolution inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5 0=32 1=5 2=1 3=1 4=2 5=1 6=12800 8=2 +ReLU inception_3a/relu_5x5 1 1 inception_3a/5x5 inception_3a/5x5_inception_3a/relu_5x5 +Pooling inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj 0=32 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 +ReLU inception_3a/relu_pool_proj 1 1 inception_3a/pool_proj inception_3a/pool_proj_inception_3a/relu_pool_proj +Concat inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output 0=0 +Split splitncnn_1 1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3 +Convolution inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU inception_3b/relu_1x1 1 1 inception_3b/1x1 inception_3b/1x1_inception_3b/relu_1x1 +Convolution inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU inception_3b/relu_3x3_reduce 1 1 inception_3b/3x3_reduce inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce +Convolution inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=221184 8=2 +ReLU inception_3b/relu_3x3 1 1 inception_3b/3x3 inception_3b/3x3_inception_3b/relu_3x3 +Convolution inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 +ReLU inception_3b/relu_5x5_reduce 1 1 inception_3b/5x5_reduce inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce +Convolution inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5 0=96 1=5 2=1 3=1 4=2 5=1 6=76800 8=2 +ReLU inception_3b/relu_5x5 1 1 inception_3b/5x5 inception_3b/5x5_inception_3b/relu_5x5 +Pooling inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU inception_3b/relu_pool_proj 1 1 inception_3b/pool_proj inception_3b/pool_proj_inception_3b/relu_pool_proj +Concat inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output 0=0 +Pooling pool3/3x3_s2 1 1 inception_3b/output pool3/3x3_s2 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_2 1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3 +Convolution inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=92160 8=2 +ReLU inception_4a/relu_1x1 1 1 inception_4a/1x1 inception_4a/1x1_inception_4a/relu_1x1 +Convolution inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=46080 8=2 +ReLU inception_4a/relu_3x3_reduce 1 1 inception_4a/3x3_reduce inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce +Convolution inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3 0=208 1=3 2=1 3=1 4=1 5=1 6=179712 8=2 +ReLU inception_4a/relu_3x3 1 1 inception_4a/3x3 inception_4a/3x3_inception_4a/relu_3x3 +Convolution inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=7680 8=2 +ReLU inception_4a/relu_5x5_reduce 1 1 inception_4a/5x5_reduce inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce +Convolution inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5 0=48 1=5 2=1 3=1 4=2 5=1 6=19200 8=2 +ReLU inception_4a/relu_5x5 1 1 inception_4a/5x5 inception_4a/5x5_inception_4a/relu_5x5 +Pooling inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=30720 8=2 +ReLU inception_4a/relu_pool_proj 1 1 inception_4a/pool_proj inception_4a/pool_proj_inception_4a/relu_pool_proj +Concat inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output 0=0 +Split splitncnn_3 1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3 +Convolution inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1 0=160 1=1 2=1 3=1 4=0 5=1 6=81920 8=2 +ReLU inception_4b/relu_1x1 1 1 inception_4b/1x1 inception_4b/1x1_inception_4b/relu_1x1 +Convolution inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2 +ReLU inception_4b/relu_3x3_reduce 1 1 inception_4b/3x3_reduce inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce +Convolution inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3 0=224 1=3 2=1 3=1 4=1 5=1 6=225792 8=2 +ReLU inception_4b/relu_3x3 1 1 inception_4b/3x3 inception_4b/3x3_inception_4b/relu_3x3 +Convolution inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +ReLU inception_4b/relu_5x5_reduce 1 1 inception_4b/5x5_reduce inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce +Convolution inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2 +ReLU inception_4b/relu_5x5 1 1 inception_4b/5x5 inception_4b/5x5_inception_4b/relu_5x5 +Pooling inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU inception_4b/relu_pool_proj 1 1 inception_4b/pool_proj inception_4b/pool_proj_inception_4b/relu_pool_proj +Concat inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output 0=0 +Split splitncnn_4 1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3 +Convolution inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 +ReLU inception_4c/relu_1x1 1 1 inception_4c/1x1 inception_4c/1x1_inception_4c/relu_1x1 +Convolution inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 +ReLU inception_4c/relu_3x3_reduce 1 1 inception_4c/3x3_reduce inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce +Convolution inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2 +ReLU inception_4c/relu_3x3 1 1 inception_4c/3x3 inception_4c/3x3_inception_4c/relu_3x3 +Convolution inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +ReLU inception_4c/relu_5x5_reduce 1 1 inception_4c/5x5_reduce inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce +Convolution inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2 +ReLU inception_4c/relu_5x5 1 1 inception_4c/5x5 inception_4c/5x5_inception_4c/relu_5x5 +Pooling inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU inception_4c/relu_pool_proj 1 1 inception_4c/pool_proj inception_4c/pool_proj_inception_4c/relu_pool_proj +Concat inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output 0=0 +Split splitncnn_5 1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3 +Convolution inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2 +ReLU inception_4d/relu_1x1 1 1 inception_4d/1x1 inception_4d/1x1_inception_4d/relu_1x1 +Convolution inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce 0=144 1=1 2=1 3=1 4=0 5=1 6=73728 8=2 +ReLU inception_4d/relu_3x3_reduce 1 1 inception_4d/3x3_reduce inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce +Convolution inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3 0=288 1=3 2=1 3=1 4=1 5=1 6=373248 8=2 +ReLU inception_4d/relu_3x3 1 1 inception_4d/3x3 inception_4d/3x3_inception_4d/relu_3x3 +Convolution inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU inception_4d/relu_5x5_reduce 1 1 inception_4d/5x5_reduce inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce +Convolution inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=51200 8=2 +ReLU inception_4d/relu_5x5 1 1 inception_4d/5x5 inception_4d/5x5_inception_4d/relu_5x5 +Pooling inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU inception_4d/relu_pool_proj 1 1 inception_4d/pool_proj inception_4d/pool_proj_inception_4d/relu_pool_proj +Concat inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output 0=0 +Split splitncnn_6 1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3 +Convolution inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=135168 8=2 +ReLU inception_4e/relu_1x1 1 1 inception_4e/1x1 inception_4e/1x1_inception_4e/relu_1x1 +Convolution inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=84480 8=2 +ReLU inception_4e/relu_3x3_reduce 1 1 inception_4e/3x3_reduce inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce +Convolution inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2 +ReLU inception_4e/relu_3x3 1 1 inception_4e/3x3 inception_4e/3x3_inception_4e/relu_3x3 +Convolution inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16896 8=2 +ReLU inception_4e/relu_5x5_reduce 1 1 inception_4e/5x5_reduce inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce +Convolution inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2 +ReLU inception_4e/relu_5x5 1 1 inception_4e/5x5 inception_4e/5x5_inception_4e/relu_5x5 +Pooling inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=67584 8=2 +ReLU inception_4e/relu_pool_proj 1 1 inception_4e/pool_proj inception_4e/pool_proj_inception_4e/relu_pool_proj +Concat inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output 0=0 +Pooling pool4/3x3_s2 1 1 inception_4e/output pool4/3x3_s2 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_7 1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3 +Convolution inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=212992 8=2 +ReLU inception_5a/relu_1x1 1 1 inception_5a/1x1 inception_5a/1x1_inception_5a/relu_1x1 +Convolution inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=133120 8=2 +ReLU inception_5a/relu_3x3_reduce 1 1 inception_5a/3x3_reduce inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce +Convolution inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2 +ReLU inception_5a/relu_3x3 1 1 inception_5a/3x3 inception_5a/3x3_inception_5a/relu_3x3 +Convolution inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=26624 8=2 +ReLU inception_5a/relu_5x5_reduce 1 1 inception_5a/5x5_reduce inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce +Convolution inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2 +ReLU inception_5a/relu_5x5 1 1 inception_5a/5x5 inception_5a/5x5_inception_5a/relu_5x5 +Pooling inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2 +ReLU inception_5a/relu_pool_proj 1 1 inception_5a/pool_proj inception_5a/pool_proj_inception_5a/relu_pool_proj +Concat inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output 0=0 +Split splitncnn_8 1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3 +Convolution inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=319488 8=2 +ReLU inception_5b/relu_1x1 1 1 inception_5b/1x1 inception_5b/1x1_inception_5b/relu_1x1 +Convolution inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce 0=192 1=1 2=1 3=1 4=0 5=1 6=159744 8=2 +ReLU inception_5b/relu_3x3_reduce 1 1 inception_5b/3x3_reduce inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce +Convolution inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=663552 8=2 +ReLU inception_5b/relu_3x3 1 1 inception_5b/3x3 inception_5b/3x3_inception_5b/relu_3x3 +Convolution inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce 0=48 1=1 2=1 3=1 4=0 5=1 6=39936 8=2 +ReLU inception_5b/relu_5x5_reduce 1 1 inception_5b/5x5_reduce inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce +Convolution inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=153600 8=2 +ReLU inception_5b/relu_5x5 1 1 inception_5b/5x5 inception_5b/5x5_inception_5b/relu_5x5 +Pooling inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 0=0 1=3 2=1 3=1 4=0 +Convolution inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2 +ReLU inception_5b/relu_pool_proj 1 1 inception_5b/pool_proj inception_5b/pool_proj_inception_5b/relu_pool_proj +Concat inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output 0=0 +Pooling pool5/7x7_s1 1 1 inception_5b/output pool5/7x7_s1 0=1 1=7 2=1 3=0 4=0 +Dropout pool5/drop_7x7_s1 1 1 pool5/7x7_s1 pool5/7x7_s1_pool5/drop_7x7_s1 +InnerProduct loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000 +Softmax prob 1 1 loss3/classifier prob 0=0 diff --git a/benchmark/mobilenet_int8.param b/benchmark/mobilenet_int8.param new file mode 100755 index 000000000..7994d3aeb --- /dev/null +++ b/benchmark/mobilenet_int8.param @@ -0,0 +1,114 @@ +7767517 +112 112 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1 0=32 1=3 2=1 3=2 4=1 5=0 6=864 8=2 +BatchNorm conv1/bn 1 1 conv1 conv1_conv1/bn 0=32 +Scale conv1/scale 1 1 conv1_conv1/bn conv1_conv1/scale 0=32 1=1 +ReLU relu1 1 1 conv1_conv1/scale conv1_relu1 +ConvolutionDepthWise conv2_1/dw 1 1 conv1_relu1 conv2_1/dw 0=32 1=3 2=1 3=1 4=1 5=0 6=288 7=32 8=1 +BatchNorm conv2_1/dw/bn 1 1 conv2_1/dw conv2_1/dw_conv2_1/dw/bn 0=32 +Scale conv2_1/dw/scale 1 1 conv2_1/dw_conv2_1/dw/bn conv2_1/dw_conv2_1/dw/scale 0=32 1=1 +ReLU relu2_1/dw 1 1 conv2_1/dw_conv2_1/dw/scale conv2_1/dw_relu2_1/dw +Convolution conv2_1/sep 1 1 conv2_1/dw_relu2_1/dw conv2_1/sep 0=64 1=1 2=1 3=1 4=0 5=0 6=2048 8=2 +BatchNorm conv2_1/sep/bn 1 1 conv2_1/sep conv2_1/sep_conv2_1/sep/bn 0=64 +Scale conv2_1/sep/scale 1 1 conv2_1/sep_conv2_1/sep/bn conv2_1/sep_conv2_1/sep/scale 0=64 1=1 +ReLU relu2_1/sep 1 1 conv2_1/sep_conv2_1/sep/scale conv2_1/sep_relu2_1/sep +ConvolutionDepthWise conv2_2/dw 1 1 conv2_1/sep_relu2_1/sep conv2_2/dw 0=64 1=3 2=1 3=2 4=1 5=0 6=576 7=64 8=1 +BatchNorm conv2_2/dw/bn 1 1 conv2_2/dw conv2_2/dw_conv2_2/dw/bn 0=64 +Scale conv2_2/dw/scale 1 1 conv2_2/dw_conv2_2/dw/bn conv2_2/dw_conv2_2/dw/scale 0=64 1=1 +ReLU relu2_2/dw 1 1 conv2_2/dw_conv2_2/dw/scale conv2_2/dw_relu2_2/dw +Convolution conv2_2/sep 1 1 conv2_2/dw_relu2_2/dw conv2_2/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=8192 8=2 +BatchNorm conv2_2/sep/bn 1 1 conv2_2/sep conv2_2/sep_conv2_2/sep/bn 0=128 +Scale conv2_2/sep/scale 1 1 conv2_2/sep_conv2_2/sep/bn conv2_2/sep_conv2_2/sep/scale 0=128 1=1 +ReLU relu2_2/sep 1 1 conv2_2/sep_conv2_2/sep/scale conv2_2/sep_relu2_2/sep +ConvolutionDepthWise conv3_1/dw 1 1 conv2_2/sep_relu2_2/sep conv3_1/dw 0=128 1=3 2=1 3=1 4=1 5=0 6=1152 7=128 8=1 +BatchNorm conv3_1/dw/bn 1 1 conv3_1/dw conv3_1/dw_conv3_1/dw/bn 0=128 +Scale conv3_1/dw/scale 1 1 conv3_1/dw_conv3_1/dw/bn conv3_1/dw_conv3_1/dw/scale 0=128 1=1 +ReLU relu3_1/dw 1 1 conv3_1/dw_conv3_1/dw/scale conv3_1/dw_relu3_1/dw +Convolution conv3_1/sep 1 1 conv3_1/dw_relu3_1/dw conv3_1/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm conv3_1/sep/bn 1 1 conv3_1/sep conv3_1/sep_conv3_1/sep/bn 0=128 +Scale conv3_1/sep/scale 1 1 conv3_1/sep_conv3_1/sep/bn conv3_1/sep_conv3_1/sep/scale 0=128 1=1 +ReLU relu3_1/sep 1 1 conv3_1/sep_conv3_1/sep/scale conv3_1/sep_relu3_1/sep +ConvolutionDepthWise conv3_2/dw 1 1 conv3_1/sep_relu3_1/sep conv3_2/dw 0=128 1=3 2=1 3=2 4=1 5=0 6=1152 7=128 8=1 +BatchNorm conv3_2/dw/bn 1 1 conv3_2/dw conv3_2/dw_conv3_2/dw/bn 0=128 +Scale conv3_2/dw/scale 1 1 conv3_2/dw_conv3_2/dw/bn conv3_2/dw_conv3_2/dw/scale 0=128 1=1 +ReLU relu3_2/dw 1 1 conv3_2/dw_conv3_2/dw/scale conv3_2/dw_relu3_2/dw +Convolution conv3_2/sep 1 1 conv3_2/dw_relu3_2/dw conv3_2/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=32768 8=2 +BatchNorm conv3_2/sep/bn 1 1 conv3_2/sep conv3_2/sep_conv3_2/sep/bn 0=256 +Scale conv3_2/sep/scale 1 1 conv3_2/sep_conv3_2/sep/bn conv3_2/sep_conv3_2/sep/scale 0=256 1=1 +ReLU relu3_2/sep 1 1 conv3_2/sep_conv3_2/sep/scale conv3_2/sep_relu3_2/sep +ConvolutionDepthWise conv4_1/dw 1 1 conv3_2/sep_relu3_2/sep conv4_1/dw 0=256 1=3 2=1 3=1 4=1 5=0 6=2304 7=256 8=1 +BatchNorm conv4_1/dw/bn 1 1 conv4_1/dw conv4_1/dw_conv4_1/dw/bn 0=256 +Scale conv4_1/dw/scale 1 1 conv4_1/dw_conv4_1/dw/bn conv4_1/dw_conv4_1/dw/scale 0=256 1=1 +ReLU relu4_1/dw 1 1 conv4_1/dw_conv4_1/dw/scale conv4_1/dw_relu4_1/dw +Convolution conv4_1/sep 1 1 conv4_1/dw_relu4_1/dw conv4_1/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm conv4_1/sep/bn 1 1 conv4_1/sep conv4_1/sep_conv4_1/sep/bn 0=256 +Scale conv4_1/sep/scale 1 1 conv4_1/sep_conv4_1/sep/bn conv4_1/sep_conv4_1/sep/scale 0=256 1=1 +ReLU relu4_1/sep 1 1 conv4_1/sep_conv4_1/sep/scale conv4_1/sep_relu4_1/sep +ConvolutionDepthWise conv4_2/dw 1 1 conv4_1/sep_relu4_1/sep conv4_2/dw 0=256 1=3 2=1 3=2 4=1 5=0 6=2304 7=256 8=1 +BatchNorm conv4_2/dw/bn 1 1 conv4_2/dw conv4_2/dw_conv4_2/dw/bn 0=256 +Scale conv4_2/dw/scale 1 1 conv4_2/dw_conv4_2/dw/bn conv4_2/dw_conv4_2/dw/scale 0=256 1=1 +ReLU relu4_2/dw 1 1 conv4_2/dw_conv4_2/dw/scale conv4_2/dw_relu4_2/dw +Convolution conv4_2/sep 1 1 conv4_2/dw_relu4_2/dw conv4_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=131072 8=2 +BatchNorm conv4_2/sep/bn 1 1 conv4_2/sep conv4_2/sep_conv4_2/sep/bn 0=512 +Scale conv4_2/sep/scale 1 1 conv4_2/sep_conv4_2/sep/bn conv4_2/sep_conv4_2/sep/scale 0=512 1=1 +ReLU relu4_2/sep 1 1 conv4_2/sep_conv4_2/sep/scale conv4_2/sep_relu4_2/sep +ConvolutionDepthWise conv5_1/dw 1 1 conv4_2/sep_relu4_2/sep conv5_1/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_1/dw/bn 1 1 conv5_1/dw conv5_1/dw_conv5_1/dw/bn 0=512 +Scale conv5_1/dw/scale 1 1 conv5_1/dw_conv5_1/dw/bn conv5_1/dw_conv5_1/dw/scale 0=512 1=1 +ReLU relu5_1/dw 1 1 conv5_1/dw_conv5_1/dw/scale conv5_1/dw_relu5_1/dw +Convolution conv5_1/sep 1 1 conv5_1/dw_relu5_1/dw conv5_1/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm conv5_1/sep/bn 1 1 conv5_1/sep conv5_1/sep_conv5_1/sep/bn 0=512 +Scale conv5_1/sep/scale 1 1 conv5_1/sep_conv5_1/sep/bn conv5_1/sep_conv5_1/sep/scale 0=512 1=1 +ReLU relu5_1/sep 1 1 conv5_1/sep_conv5_1/sep/scale conv5_1/sep_relu5_1/sep +ConvolutionDepthWise conv5_2/dw 1 1 conv5_1/sep_relu5_1/sep conv5_2/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_2/dw/bn 1 1 conv5_2/dw conv5_2/dw_conv5_2/dw/bn 0=512 +Scale conv5_2/dw/scale 1 1 conv5_2/dw_conv5_2/dw/bn conv5_2/dw_conv5_2/dw/scale 0=512 1=1 +ReLU relu5_2/dw 1 1 conv5_2/dw_conv5_2/dw/scale conv5_2/dw_relu5_2/dw +Convolution conv5_2/sep 1 1 conv5_2/dw_relu5_2/dw conv5_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm conv5_2/sep/bn 1 1 conv5_2/sep conv5_2/sep_conv5_2/sep/bn 0=512 +Scale conv5_2/sep/scale 1 1 conv5_2/sep_conv5_2/sep/bn conv5_2/sep_conv5_2/sep/scale 0=512 1=1 +ReLU relu5_2/sep 1 1 conv5_2/sep_conv5_2/sep/scale conv5_2/sep_relu5_2/sep +ConvolutionDepthWise conv5_3/dw 1 1 conv5_2/sep_relu5_2/sep conv5_3/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_3/dw/bn 1 1 conv5_3/dw conv5_3/dw_conv5_3/dw/bn 0=512 +Scale conv5_3/dw/scale 1 1 conv5_3/dw_conv5_3/dw/bn conv5_3/dw_conv5_3/dw/scale 0=512 1=1 +ReLU relu5_3/dw 1 1 conv5_3/dw_conv5_3/dw/scale conv5_3/dw_relu5_3/dw +Convolution conv5_3/sep 1 1 conv5_3/dw_relu5_3/dw conv5_3/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm conv5_3/sep/bn 1 1 conv5_3/sep conv5_3/sep_conv5_3/sep/bn 0=512 +Scale conv5_3/sep/scale 1 1 conv5_3/sep_conv5_3/sep/bn conv5_3/sep_conv5_3/sep/scale 0=512 1=1 +ReLU relu5_3/sep 1 1 conv5_3/sep_conv5_3/sep/scale conv5_3/sep_relu5_3/sep +ConvolutionDepthWise conv5_4/dw 1 1 conv5_3/sep_relu5_3/sep conv5_4/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_4/dw/bn 1 1 conv5_4/dw conv5_4/dw_conv5_4/dw/bn 0=512 +Scale conv5_4/dw/scale 1 1 conv5_4/dw_conv5_4/dw/bn conv5_4/dw_conv5_4/dw/scale 0=512 1=1 +ReLU relu5_4/dw 1 1 conv5_4/dw_conv5_4/dw/scale conv5_4/dw_relu5_4/dw +Convolution conv5_4/sep 1 1 conv5_4/dw_relu5_4/dw conv5_4/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm conv5_4/sep/bn 1 1 conv5_4/sep conv5_4/sep_conv5_4/sep/bn 0=512 +Scale conv5_4/sep/scale 1 1 conv5_4/sep_conv5_4/sep/bn conv5_4/sep_conv5_4/sep/scale 0=512 1=1 +ReLU relu5_4/sep 1 1 conv5_4/sep_conv5_4/sep/scale conv5_4/sep_relu5_4/sep +ConvolutionDepthWise conv5_5/dw 1 1 conv5_4/sep_relu5_4/sep conv5_5/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_5/dw/bn 1 1 conv5_5/dw conv5_5/dw_conv5_5/dw/bn 0=512 +Scale conv5_5/dw/scale 1 1 conv5_5/dw_conv5_5/dw/bn conv5_5/dw_conv5_5/dw/scale 0=512 1=1 +ReLU relu5_5/dw 1 1 conv5_5/dw_conv5_5/dw/scale conv5_5/dw_relu5_5/dw +Convolution conv5_5/sep 1 1 conv5_5/dw_relu5_5/dw conv5_5/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm conv5_5/sep/bn 1 1 conv5_5/sep conv5_5/sep_conv5_5/sep/bn 0=512 +Scale conv5_5/sep/scale 1 1 conv5_5/sep_conv5_5/sep/bn conv5_5/sep_conv5_5/sep/scale 0=512 1=1 +ReLU relu5_5/sep 1 1 conv5_5/sep_conv5_5/sep/scale conv5_5/sep_relu5_5/sep +ConvolutionDepthWise conv5_6/dw 1 1 conv5_5/sep_relu5_5/sep conv5_6/dw 0=512 1=3 2=1 3=2 4=1 5=0 6=4608 7=512 8=1 +BatchNorm conv5_6/dw/bn 1 1 conv5_6/dw conv5_6/dw_conv5_6/dw/bn 0=512 +Scale conv5_6/dw/scale 1 1 conv5_6/dw_conv5_6/dw/bn conv5_6/dw_conv5_6/dw/scale 0=512 1=1 +ReLU relu5_6/dw 1 1 conv5_6/dw_conv5_6/dw/scale conv5_6/dw_relu5_6/dw +Convolution conv5_6/sep 1 1 conv5_6/dw_relu5_6/dw conv5_6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=524288 8=2 +BatchNorm conv5_6/sep/bn 1 1 conv5_6/sep conv5_6/sep_conv5_6/sep/bn 0=1024 +Scale conv5_6/sep/scale 1 1 conv5_6/sep_conv5_6/sep/bn conv5_6/sep_conv5_6/sep/scale 0=1024 1=1 +ReLU relu5_6/sep 1 1 conv5_6/sep_conv5_6/sep/scale conv5_6/sep_relu5_6/sep +ConvolutionDepthWise conv6/dw 1 1 conv5_6/sep_relu5_6/sep conv6/dw 0=1024 1=3 2=1 3=1 4=1 5=0 6=9216 7=1024 8=1 +BatchNorm conv6/dw/bn 1 1 conv6/dw conv6/dw_conv6/dw/bn 0=1024 +Scale conv6/dw/scale 1 1 conv6/dw_conv6/dw/bn conv6/dw_conv6/dw/scale 0=1024 1=1 +ReLU relu6/dw 1 1 conv6/dw_conv6/dw/scale conv6/dw_relu6/dw +Convolution conv6/sep 1 1 conv6/dw_relu6/dw conv6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm conv6/sep/bn 1 1 conv6/sep conv6/sep_conv6/sep/bn 0=1024 +Scale conv6/sep/scale 1 1 conv6/sep_conv6/sep/bn conv6/sep_conv6/sep/scale 0=1024 1=1 +ReLU relu6/sep 1 1 conv6/sep_conv6/sep/scale conv6/sep_relu6/sep +Pooling pool6 1 1 conv6/sep_relu6/sep pool6 0=1 1=0 2=1 3=0 4=1 +Convolution fc7 1 1 pool6 fc7 0=1000 1=1 2=1 3=1 4=0 5=1 6=1024000 8=2 +Softmax prob 1 1 fc7 prob 0=0 diff --git a/benchmark/mobilenet_ssd_int8.param b/benchmark/mobilenet_ssd_int8.param new file mode 100755 index 000000000..287c49e26 --- /dev/null +++ b/benchmark/mobilenet_ssd_int8.param @@ -0,0 +1,129 @@ +7767517 +127 150 +Input data 0 1 data 0=300 1=300 2=3 +Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 +Convolution conv0 1 1 data_splitncnn_6 conv0 0=32 1=3 2=1 3=2 4=1 5=1 6=864 8=2 +ReLU conv0/relu 1 1 conv0 conv0_conv0/relu +ConvolutionDepthWise conv1/dw 1 1 conv0_conv0/relu conv1/dw 0=32 1=3 2=1 3=1 4=1 5=1 6=288 7=32 8=1 +ReLU conv1/dw/relu 1 1 conv1/dw conv1/dw_conv1/dw/relu +Convolution conv1 1 1 conv1/dw_conv1/dw/relu conv1 0=64 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 +ReLU conv1/relu 1 1 conv1 conv1_conv1/relu +ConvolutionDepthWise conv2/dw 1 1 conv1_conv1/relu conv2/dw 0=64 1=3 2=1 3=2 4=1 5=1 6=576 7=64 8=1 +ReLU conv2/dw/relu 1 1 conv2/dw conv2/dw_conv2/dw/relu +Convolution conv2 1 1 conv2/dw_conv2/dw/relu conv2 0=128 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 +ReLU conv2/relu 1 1 conv2 conv2_conv2/relu +ConvolutionDepthWise conv3/dw 1 1 conv2_conv2/relu conv3/dw 0=128 1=3 2=1 3=1 4=1 5=1 6=1152 7=128 8=1 +ReLU conv3/dw/relu 1 1 conv3/dw conv3/dw_conv3/dw/relu +Convolution conv3 1 1 conv3/dw_conv3/dw/relu conv3 0=128 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU conv3/relu 1 1 conv3 conv3_conv3/relu +ConvolutionDepthWise conv4/dw 1 1 conv3_conv3/relu conv4/dw 0=128 1=3 2=1 3=2 4=1 5=1 6=1152 7=128 8=1 +ReLU conv4/dw/relu 1 1 conv4/dw conv4/dw_conv4/dw/relu +Convolution conv4 1 1 conv4/dw_conv4/dw/relu conv4 0=256 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU conv4/relu 1 1 conv4 conv4_conv4/relu +ConvolutionDepthWise conv5/dw 1 1 conv4_conv4/relu conv5/dw 0=256 1=3 2=1 3=1 4=1 5=1 6=2304 7=256 8=1 +ReLU conv5/dw/relu 1 1 conv5/dw conv5/dw_conv5/dw/relu +Convolution conv5 1 1 conv5/dw_conv5/dw/relu conv5 0=256 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 +ReLU conv5/relu 1 1 conv5 conv5_conv5/relu +ConvolutionDepthWise conv6/dw 1 1 conv5_conv5/relu conv6/dw 0=256 1=3 2=1 3=2 4=1 5=1 6=2304 7=256 8=1 +ReLU conv6/dw/relu 1 1 conv6/dw conv6/dw_conv6/dw/relu +Convolution conv6 1 1 conv6/dw_conv6/dw/relu conv6 0=512 1=1 2=1 3=1 4=0 5=1 6=131072 8=2 +ReLU conv6/relu 1 1 conv6 conv6_conv6/relu +ConvolutionDepthWise conv7/dw 1 1 conv6_conv6/relu conv7/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 +ReLU conv7/dw/relu 1 1 conv7/dw conv7/dw_conv7/dw/relu +Convolution conv7 1 1 conv7/dw_conv7/dw/relu conv7 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv7/relu 1 1 conv7 conv7_conv7/relu +ConvolutionDepthWise conv8/dw 1 1 conv7_conv7/relu conv8/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 +ReLU conv8/dw/relu 1 1 conv8/dw conv8/dw_conv8/dw/relu +Convolution conv8 1 1 conv8/dw_conv8/dw/relu conv8 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv8/relu 1 1 conv8 conv8_conv8/relu +ConvolutionDepthWise conv9/dw 1 1 conv8_conv8/relu conv9/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 +ReLU conv9/dw/relu 1 1 conv9/dw conv9/dw_conv9/dw/relu +Convolution conv9 1 1 conv9/dw_conv9/dw/relu conv9 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv9/relu 1 1 conv9 conv9_conv9/relu +ConvolutionDepthWise conv10/dw 1 1 conv9_conv9/relu conv10/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 +ReLU conv10/dw/relu 1 1 conv10/dw conv10/dw_conv10/dw/relu +Convolution conv10 1 1 conv10/dw_conv10/dw/relu conv10 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv10/relu 1 1 conv10 conv10_conv10/relu +ConvolutionDepthWise conv11/dw 1 1 conv10_conv10/relu conv11/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1 +ReLU conv11/dw/relu 1 1 conv11/dw conv11/dw_conv11/dw/relu +Convolution conv11 1 1 conv11/dw_conv11/dw/relu conv11 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv11/relu 1 1 conv11 conv11_conv11/relu +Split splitncnn_1 1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3 +ConvolutionDepthWise conv12/dw 1 1 conv11_conv11/relu_splitncnn_3 conv12/dw 0=512 1=3 2=1 3=2 4=1 5=1 6=4608 7=512 8=1 +ReLU conv12/dw/relu 1 1 conv12/dw conv12/dw_conv12/dw/relu +Convolution conv12 1 1 conv12/dw_conv12/dw/relu conv12 0=1024 1=1 2=1 3=1 4=0 5=1 6=524288 8=2 +ReLU conv12/relu 1 1 conv12 conv12_conv12/relu +ConvolutionDepthWise conv13/dw 1 1 conv12_conv12/relu conv13/dw 0=1024 1=3 2=1 3=1 4=1 5=1 6=9216 7=1024 8=1 +ReLU conv13/dw/relu 1 1 conv13/dw conv13/dw_conv13/dw/relu +Convolution conv13 1 1 conv13/dw_conv13/dw/relu conv13 0=1024 1=1 2=1 3=1 4=0 5=1 6=1048576 8=2 +ReLU conv13/relu 1 1 conv13 conv13_conv13/relu +Split splitncnn_2 1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3 +Convolution conv14_1 1 1 conv13_conv13/relu_splitncnn_3 conv14_1 0=256 1=1 2=1 3=1 4=0 5=1 6=262144 8=2 +ReLU conv14_1/relu 1 1 conv14_1 conv14_1_conv14_1/relu +Convolution conv14_2 1 1 conv14_1_conv14_1/relu conv14_2 0=512 1=3 2=1 3=2 4=1 5=1 6=1179648 8=2 +ReLU conv14_2/relu 1 1 conv14_2 conv14_2_conv14_2/relu +Split splitncnn_3 1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3 +Convolution conv15_1 1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2 +ReLU conv15_1/relu 1 1 conv15_1 conv15_1_conv15_1/relu +Convolution conv15_2 1 1 conv15_1_conv15_1/relu conv15_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2 +ReLU conv15_2/relu 1 1 conv15_2 conv15_2_conv15_2/relu +Split splitncnn_4 1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3 +Convolution conv16_1 1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU conv16_1/relu 1 1 conv16_1 conv16_1_conv16_1/relu +Convolution conv16_2 1 1 conv16_1_conv16_1/relu conv16_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2 +ReLU conv16_2/relu 1 1 conv16_2 conv16_2_conv16_2/relu +Split splitncnn_5 1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3 +Convolution conv17_1 1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU conv17_1/relu 1 1 conv17_1 conv17_1_conv17_1/relu +Convolution conv17_2 1 1 conv17_1_conv17_1/relu conv17_2 0=128 1=3 2=1 3=2 4=1 5=1 6=73728 8=2 +ReLU conv17_2/relu 1 1 conv17_2 conv17_2_conv17_2/relu +Split splitncnn_6 1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2 +Convolution conv11_mbox_loc 1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 +Permute conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3 +Flatten conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat +Convolution conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 +Permute conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3 +Flatten conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat +PriorBox conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23301=0 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Convolution conv13_mbox_loc 1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 +Permute conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3 +Flatten conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat +Convolution conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=129024 8=2 +Permute conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3 +Flatten conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat +PriorBox conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Convolution conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +Permute conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3 +Flatten conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat +Convolution conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=64512 8=2 +Permute conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3 +Flatten conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat +PriorBox conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Convolution conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 +Permute conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3 +Flatten conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat +Convolution conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 +Permute conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3 +Flatten conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat +PriorBox conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Convolution conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2 +Permute conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3 +Flatten conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat +Convolution conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2 +Permute conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3 +Flatten conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat +PriorBox conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Convolution conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=3072 8=2 +Permute conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3 +Flatten conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat +Convolution conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=16128 8=2 +Permute conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3 +Flatten conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat +PriorBox conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000 +Concat mbox_loc 6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc 0=0 +Concat mbox_conf 6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf 0=0 +Concat mbox_priorbox 6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1 +Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0 +Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 +Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten +DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.250000 diff --git a/benchmark/resnet18_int8.param b/benchmark/resnet18_int8.param new file mode 100755 index 000000000..cd2be6233 --- /dev/null +++ b/benchmark/resnet18_int8.param @@ -0,0 +1,103 @@ +7767517 +101 109 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=0 6=9408 8=2 +BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 +Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 +ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu +Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 +Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2 +BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=64 +Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=64 1=1 +Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 +Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 +ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu +Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 +Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 +Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -23301=0 +ReLU res2a_relu 1 1 res2a res2a_res2a_relu +Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 +Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 +Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 +ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu +Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 +Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 +Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -23301=0 +ReLU res2b_relu 1 1 res2b res2b_res2b_relu +Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 +Convolution res3a_branch1 1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1 0=128 1=1 2=1 3=2 4=0 5=0 6=8192 8=2 +BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=128 +Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=128 1=1 +Convolution res3a_branch2a 1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2 +BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 +Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 +ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu +Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 +Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 +Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -23301=0 +ReLU res3a_relu 1 1 res3a res3a_res3a_relu +Split splitncnn_3 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 +Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 +Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 +ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu +Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 +Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 +Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -23301=0 +ReLU res3b_relu 1 1 res3b res3b_res3b_relu +Split splitncnn_4 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 +Convolution res4a_branch1 1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1 0=256 1=1 2=1 3=2 4=0 5=0 6=32768 8=2 +BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=256 +Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=256 1=1 +Convolution res4a_branch2a 1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2 +BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 +Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 +ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu +Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 +Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 +Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -23301=0 +ReLU res4a_relu 1 1 res4a res4a_res4a_relu +Split splitncnn_5 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 +Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 +Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 +ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu +Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 +Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 +Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -23301=0 +ReLU res4b_relu 1 1 res4b res4b_res4b_relu +Split splitncnn_6 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 +Convolution res5a_branch1 1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 +BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=512 +Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=512 1=1 +Convolution res5a_branch2a 1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a 0=512 1=3 2=1 3=2 4=1 5=0 6=1179648 8=2 +BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 +Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 +ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu +Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 +Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 +Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -23301=0 +ReLU res5a_relu 1 1 res5a res5a_res5a_relu +Split splitncnn_7 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 +Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 +Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 +ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu +Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 +Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 +Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -23301=0 +ReLU res5b_relu 1 1 res5b res5b_res5b_relu +Pooling pool5 1 1 res5b_res5b_relu pool5 0=1 1=7 2=1 3=0 4=0 +InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=512000 +Softmax prob 1 1 fc1000 prob 0=0 diff --git a/benchmark/resnet50.param b/benchmark/resnet50.param new file mode 100755 index 000000000..f9df9a81c --- /dev/null +++ b/benchmark/resnet50.param @@ -0,0 +1,247 @@ +7767517 +245 261 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 +BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 +Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 +ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu +Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 +Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256 +Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1 +Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 +BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 +Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 +ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu +Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 +BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 +Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 +ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu +Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256 +Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1 +Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0 +ReLU res2a_relu 1 1 res2a res2a_res2a_relu +Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 +Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 +Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 +ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu +Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 +BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 +Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 +ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu +Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256 +Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1 +Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0 +ReLU res2b_relu 1 1 res2b res2b_res2b_relu +Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 +Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64 +Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1 +ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu +Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 +BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64 +Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1 +ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu +Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 +BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256 +Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1 +Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0 +ReLU res2c_relu 1 1 res2c res2c_res2c_relu +Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 +Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 +BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512 +Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1 +Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 +BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 +Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 +ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu +Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 +BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 +Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 +ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu +Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512 +Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1 +Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0 +ReLU res3a_relu 1 1 res3a res3a_res3a_relu +Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 +Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 +Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 +ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu +Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 +BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 +Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 +ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu +Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512 +Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1 +Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0 +ReLU res3b_relu 1 1 res3b res3b_res3b_relu +Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 +Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128 +Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1 +ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu +Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 +BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128 +Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1 +ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu +Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512 +Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1 +Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0 +ReLU res3c_relu 1 1 res3c res3c_res3c_relu +Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 +Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128 +Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1 +ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu +Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 +BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128 +Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1 +ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu +Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 +BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512 +Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1 +Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0 +ReLU res3d_relu 1 1 res3d res3d_res3d_relu +Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 +Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 +BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024 +Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1 +Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 +BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 +Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 +ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu +Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 +Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 +ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu +Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024 +Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1 +Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0 +ReLU res4a_relu 1 1 res4a res4a_res4a_relu +Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 +Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 +Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 +ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu +Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 +Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 +ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu +Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024 +Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1 +Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0 +ReLU res4b_relu 1 1 res4b res4b_res4b_relu +Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 +Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256 +Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1 +ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu +Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256 +Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1 +ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu +Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024 +Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1 +Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0 +ReLU res4c_relu 1 1 res4c res4c_res4c_relu +Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 +Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256 +Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1 +ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu +Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256 +Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1 +ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu +Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024 +Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1 +Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0 +ReLU res4d_relu 1 1 res4d res4d_res4d_relu +Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 +Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256 +Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1 +ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu +Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256 +Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1 +ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu +Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024 +Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1 +Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0 +ReLU res4e_relu 1 1 res4e res4e_res4e_relu +Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 +Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256 +Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1 +ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu +Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 +BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256 +Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1 +ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu +Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 +BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024 +Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1 +Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0 +ReLU res4f_relu 1 1 res4f res4f_res4f_relu +Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 +Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 +BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048 +Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1 +Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 +BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 +Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 +ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu +Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 +BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 +Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 +ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu +Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 +BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048 +Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1 +Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0 +ReLU res5a_relu 1 1 res5a res5a_res5a_relu +Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 +Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 +BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 +Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 +ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu +Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 +BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 +Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 +ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu +Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 +BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048 +Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1 +Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0 +ReLU res5b_relu 1 1 res5b res5b_res5b_relu +Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 +Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 +BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512 +Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1 +ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu +Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 +BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512 +Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1 +ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu +Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 +BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048 +Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1 +Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0 +ReLU res5c_relu 1 1 res5c res5c_res5c_relu +Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0 +InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 +Softmax prob 1 1 fc1000 prob 0=0 diff --git a/benchmark/resnet50_int8.param b/benchmark/resnet50_int8.param new file mode 100755 index 000000000..c8e6c00e5 --- /dev/null +++ b/benchmark/resnet50_int8.param @@ -0,0 +1,247 @@ +7767517 +245 261 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1 1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2 +BatchNorm bn_conv1 1 1 conv1 conv1_bn_conv1 0=64 +Scale scale_conv1 1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1 +ReLU conv1_relu 1 1 conv1_scale_conv1 conv1_conv1_relu +Pooling pool1 1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0 +Split splitncnn_0 1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1 +Convolution res2a_branch1 1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2a_branch1 1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256 +Scale scale2a_branch1 1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1 +Convolution res2a_branch2a 1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2 +BatchNorm bn2a_branch2a 1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64 +Scale scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1 +ReLU res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu +Convolution res2a_branch2b 1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2a_branch2b 1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64 +Scale scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1 +ReLU res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu +Convolution res2a_branch2c 1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2a_branch2c 1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256 +Scale scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1 +Eltwise res2a 2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0 +ReLU res2a_relu 1 1 res2a res2a_res2a_relu +Split splitncnn_1 1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1 +Convolution res2b_branch2a 1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2b_branch2a 1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64 +Scale scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1 +ReLU res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu +Convolution res2b_branch2b 1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2b_branch2b 1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64 +Scale scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1 +ReLU res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu +Convolution res2b_branch2c 1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2b_branch2c 1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256 +Scale scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1 +Eltwise res2b 2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0 +ReLU res2b_relu 1 1 res2b res2b_res2b_relu +Split splitncnn_2 1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1 +Convolution res2c_branch2a 1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2c_branch2a 1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64 +Scale scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1 +ReLU res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu +Convolution res2c_branch2b 1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2 +BatchNorm bn2c_branch2b 1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64 +Scale scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1 +ReLU res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu +Convolution res2c_branch2c 1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm bn2c_branch2c 1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256 +Scale scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1 +Eltwise res2c 2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0 +ReLU res2c_relu 1 1 res2c res2c_res2c_relu +Split splitncnn_3 1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1 +Convolution res3a_branch1 1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 +BatchNorm bn3a_branch1 1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512 +Scale scale3a_branch1 1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1 +Convolution res3a_branch2a 1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 8=2 +BatchNorm bn3a_branch2a 1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128 +Scale scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1 +ReLU res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu +Convolution res3a_branch2b 1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3a_branch2b 1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128 +Scale scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1 +ReLU res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu +Convolution res3a_branch2c 1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3a_branch2c 1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512 +Scale scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1 +Eltwise res3a 2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0 +ReLU res3a_relu 1 1 res3a res3a_res3a_relu +Split splitncnn_4 1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1 +Convolution res3b_branch2a 1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3b_branch2a 1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128 +Scale scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1 +ReLU res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu +Convolution res3b_branch2b 1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3b_branch2b 1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128 +Scale scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1 +ReLU res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu +Convolution res3b_branch2c 1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3b_branch2c 1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512 +Scale scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1 +Eltwise res3b 2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0 +ReLU res3b_relu 1 1 res3b res3b_res3b_relu +Split splitncnn_5 1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1 +Convolution res3c_branch2a 1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3c_branch2a 1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128 +Scale scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1 +ReLU res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu +Convolution res3c_branch2b 1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3c_branch2b 1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128 +Scale scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1 +ReLU res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu +Convolution res3c_branch2c 1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3c_branch2c 1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512 +Scale scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1 +Eltwise res3c 2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0 +ReLU res3c_relu 1 1 res3c res3c_res3c_relu +Split splitncnn_6 1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1 +Convolution res3d_branch2a 1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3d_branch2a 1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128 +Scale scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1 +ReLU res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu +Convolution res3d_branch2b 1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2 +BatchNorm bn3d_branch2b 1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128 +Scale scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1 +ReLU res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu +Convolution res3d_branch2c 1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2 +BatchNorm bn3d_branch2c 1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512 +Scale scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1 +Eltwise res3d 2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0 +ReLU res3d_relu 1 1 res3d res3d_res3d_relu +Split splitncnn_7 1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1 +Convolution res4a_branch1 1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 8=2 +BatchNorm bn4a_branch1 1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024 +Scale scale4a_branch1 1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1 +Convolution res4a_branch2a 1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 8=2 +BatchNorm bn4a_branch2a 1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256 +Scale scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1 +ReLU res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu +Convolution res4a_branch2b 1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4a_branch2b 1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256 +Scale scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1 +ReLU res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu +Convolution res4a_branch2c 1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4a_branch2c 1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024 +Scale scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1 +Eltwise res4a 2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0 +ReLU res4a_relu 1 1 res4a res4a_res4a_relu +Split splitncnn_8 1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1 +Convolution res4b_branch2a 1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4b_branch2a 1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256 +Scale scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1 +ReLU res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu +Convolution res4b_branch2b 1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4b_branch2b 1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256 +Scale scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1 +ReLU res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu +Convolution res4b_branch2c 1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4b_branch2c 1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024 +Scale scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1 +Eltwise res4b 2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0 +ReLU res4b_relu 1 1 res4b res4b_res4b_relu +Split splitncnn_9 1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1 +Convolution res4c_branch2a 1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4c_branch2a 1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256 +Scale scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1 +ReLU res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu +Convolution res4c_branch2b 1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4c_branch2b 1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256 +Scale scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1 +ReLU res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu +Convolution res4c_branch2c 1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4c_branch2c 1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024 +Scale scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1 +Eltwise res4c 2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0 +ReLU res4c_relu 1 1 res4c res4c_res4c_relu +Split splitncnn_10 1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1 +Convolution res4d_branch2a 1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4d_branch2a 1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256 +Scale scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1 +ReLU res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu +Convolution res4d_branch2b 1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4d_branch2b 1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256 +Scale scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1 +ReLU res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu +Convolution res4d_branch2c 1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4d_branch2c 1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024 +Scale scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1 +Eltwise res4d 2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0 +ReLU res4d_relu 1 1 res4d res4d_res4d_relu +Split splitncnn_11 1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1 +Convolution res4e_branch2a 1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4e_branch2a 1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256 +Scale scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1 +ReLU res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu +Convolution res4e_branch2b 1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4e_branch2b 1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256 +Scale scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1 +ReLU res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu +Convolution res4e_branch2c 1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4e_branch2c 1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024 +Scale scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1 +Eltwise res4e 2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0 +ReLU res4e_relu 1 1 res4e res4e_res4e_relu +Split splitncnn_12 1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1 +Convolution res4f_branch2a 1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4f_branch2a 1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256 +Scale scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1 +ReLU res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu +Convolution res4f_branch2b 1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2 +BatchNorm bn4f_branch2b 1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256 +Scale scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1 +ReLU res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu +Convolution res4f_branch2c 1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2 +BatchNorm bn4f_branch2c 1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024 +Scale scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1 +Eltwise res4f 2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0 +ReLU res4f_relu 1 1 res4f res4f_res4f_relu +Split splitncnn_13 1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1 +Convolution res5a_branch1 1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 8=2 +BatchNorm bn5a_branch1 1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048 +Scale scale5a_branch1 1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1 +Convolution res5a_branch2a 1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 8=2 +BatchNorm bn5a_branch2a 1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512 +Scale scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1 +ReLU res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu +Convolution res5a_branch2b 1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5a_branch2b 1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512 +Scale scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1 +ReLU res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu +Convolution res5a_branch2c 1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm bn5a_branch2c 1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048 +Scale scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1 +Eltwise res5a 2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0 +ReLU res5a_relu 1 1 res5a res5a_res5a_relu +Split splitncnn_14 1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1 +Convolution res5b_branch2a 1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm bn5b_branch2a 1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512 +Scale scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1 +ReLU res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu +Convolution res5b_branch2b 1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5b_branch2b 1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512 +Scale scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1 +ReLU res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu +Convolution res5b_branch2c 1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm bn5b_branch2c 1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048 +Scale scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1 +Eltwise res5b 2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0 +ReLU res5b_relu 1 1 res5b res5b_res5b_relu +Split splitncnn_15 1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1 +Convolution res5c_branch2a 1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm bn5c_branch2a 1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512 +Scale scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1 +ReLU res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu +Convolution res5c_branch2b 1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2 +BatchNorm bn5c_branch2b 1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512 +Scale scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1 +ReLU res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu +Convolution res5c_branch2c 1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2 +BatchNorm bn5c_branch2c 1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048 +Scale scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1 +Eltwise res5c 2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0 +ReLU res5c_relu 1 1 res5c res5c_res5c_relu +Pooling pool5 1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0 +InnerProduct fc1000 1 1 pool5 fc1000 0=1000 1=1 2=2048000 +Softmax prob 1 1 fc1000 prob 0=0 diff --git a/benchmark/squeezenet_int8.param b/benchmark/squeezenet_int8.param new file mode 100755 index 000000000..5ce2ae78a --- /dev/null +++ b/benchmark/squeezenet_int8.param @@ -0,0 +1,77 @@ +7767517 +75 83 +Input data 0 1 data 0=227 1=227 2=3 +Convolution conv1 1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2 +ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 +Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0 +Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 +Split splitncnn_0 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 +Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 +Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 +ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 +Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0 +Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 +ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 +Split splitncnn_1 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 +Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 +Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 +ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 +Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0 +Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0 +Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 +Split splitncnn_2 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 +Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 +Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 +Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0 +Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 +ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 +Split splitncnn_3 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 +Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 +Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 +Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0 +Pooling pool5 1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0 +Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 +Split splitncnn_4 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 +Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 +ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 +Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 +ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 +Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0 +Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 +ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 +Split splitncnn_5 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 +Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 +ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 +Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 +ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 +Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0 +Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 +ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 +Split splitncnn_6 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 +Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 +Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 +ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 +Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0 +Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 +Split splitncnn_7 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 +Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1 +Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 +ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3 +Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0 +Dropout drop9 1 1 fire9/concat fire9/concat_drop9 +Convolution conv10 1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 8=2 +ReLU relu_conv10 1 1 conv10 conv10_relu_conv10 +Pooling pool10 1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1 +Softmax prob 1 1 pool10 prob 0=0 diff --git a/benchmark/squeezenet_ssd_int8.param b/benchmark/squeezenet_ssd_int8.param new file mode 100755 index 000000000..19fb43c9e --- /dev/null +++ b/benchmark/squeezenet_ssd_int8.param @@ -0,0 +1,181 @@ +7767517 +179 212 +Input data 0 1 data 0=300 1=300 2=3 +Split splitncnn_0 1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6 +Convolution conv1 1 1 data_splitncnn_6 conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2 +ReLU relu_conv1 1 1 conv1 conv1_relu_conv1 +Pooling pool1 1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0 +Convolution fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1 +Split splitncnn_1 1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 +Convolution fire2/expand1x1 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1 +Convolution fire2/expand3x3 1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 +ReLU fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3 +Concat fire2/concat 2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0 +Convolution fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2 +ReLU fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1 +Split splitncnn_2 1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 +Convolution fire3/expand1x1 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2 +ReLU fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1 +Convolution fire3/expand3x3 1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2 +ReLU fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3 +Concat fire3/concat 2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0 +Pooling pool3 1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0 +Convolution fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1 +Split splitncnn_3 1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 +Convolution fire4/expand1x1 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1 +Convolution fire4/expand3x3 1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +ReLU fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3 +Concat fire4/concat 2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0 +Convolution fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2 +ReLU fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1 +Split splitncnn_4 1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 +Convolution fire5/expand1x1 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2 +ReLU fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1 +Convolution fire5/expand3x3 1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +ReLU fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3 +Concat fire5/concat 2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0 +Split splitncnn_5 1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1 +Pooling pool5 1 1 fire5/concat_splitncnn_1 pool5 0=0 1=3 2=2 3=0 4=0 +Convolution fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2 +ReLU fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1 +Split splitncnn_6 1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 +Convolution fire6/expand1x1 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 +ReLU fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1 +Convolution fire6/expand3x3 1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 +ReLU fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3 +Concat fire6/concat 2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0 +Convolution fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2 +ReLU fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1 +Split splitncnn_7 1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 +Convolution fire7/expand1x1 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2 +ReLU fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1 +Convolution fire7/expand3x3 1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2 +ReLU fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3 +Concat fire7/concat 2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0 +Convolution fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2 +ReLU fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1 +Split splitncnn_8 1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 +Convolution fire8/expand1x1 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +ReLU fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1 +Convolution fire8/expand3x3 1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 +ReLU fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3 +Concat fire8/concat 2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0 +Convolution fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2 +ReLU fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1 +Split splitncnn_9 1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 +Convolution fire9/expand1x1 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2 +BatchNorm fire9/expand1x1/bn 1 1 fire9/expand1x1 fire9/expand1x1_fire9/expand1x1/bn 0=256 +Scale fire9/expand1x1/scale 1 1 fire9/expand1x1_fire9/expand1x1/bn fire9/expand1x1_fire9/expand1x1/scale 0=256 1=1 +ReLU fire9/relu_expand1x1 1 1 fire9/expand1x1_fire9/expand1x1/scale fire9/expand1x1_fire9/relu_expand1x1 +Convolution fire9/expand3x3 1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 +BatchNorm fire9/expand3x3/bn 1 1 fire9/expand3x3 fire9/expand3x3_fire9/expand3x3/bn 0=256 +Scale fire9/expand3x3/scale 1 1 fire9/expand3x3_fire9/expand3x3/bn fire9/expand3x3_fire9/expand3x3/scale 0=256 1=1 +ReLU fire9/relu_expand3x3 1 1 fire9/expand3x3_fire9/expand3x3/scale fire9/expand3x3_fire9/relu_expand3x3 +Concat fire9/concat 2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0 +Split splitncnn_10 1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3 +Pooling pool9 1 1 fire9/concat_splitncnn_3 pool9 0=0 1=3 2=2 3=0 4=0 +Convolution fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=49152 8=2 +BatchNorm fire10/squeeze1x1/bn 1 1 fire10/squeeze1x1 fire10/squeeze1x1_fire10/squeeze1x1/bn 0=96 +Scale fire10/squeeze1x1/scale 1 1 fire10/squeeze1x1_fire10/squeeze1x1/bn fire10/squeeze1x1_fire10/squeeze1x1/scale 0=96 1=1 +ReLU fire10/relu_squeeze1x1 1 1 fire10/squeeze1x1_fire10/squeeze1x1/scale fire10/squeeze1x1_fire10/relu_squeeze1x1 +Split splitncnn_11 1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 +Convolution fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2 +BatchNorm fire10/expand1x1/bn 1 1 fire10/expand1x1 fire10/expand1x1_fire10/expand1x1/bn 0=384 +Scale fire10/expand1x1/scale 1 1 fire10/expand1x1_fire10/expand1x1/bn fire10/expand1x1_fire10/expand1x1/scale 0=384 1=1 +ReLU fire10/relu_expand1x1 1 1 fire10/expand1x1_fire10/expand1x1/scale fire10/expand1x1_fire10/relu_expand1x1 +Convolution fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2 +BatchNorm fire10/expand3x3/bn 1 1 fire10/expand3x3 fire10/expand3x3_fire10/expand3x3/bn 0=384 +Scale fire10/expand3x3/scale 1 1 fire10/expand3x3_fire10/expand3x3/bn fire10/expand3x3_fire10/expand3x3/scale 0=384 1=1 +ReLU fire10/relu_expand3x3 1 1 fire10/expand3x3_fire10/expand3x3/scale fire10/expand3x3_fire10/relu_expand3x3 +Concat fire10/concat 2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat 0=0 +Split splitncnn_12 1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3 +Pooling pool10 1 1 fire10/concat_splitncnn_3 pool10 0=0 1=3 2=2 3=0 4=0 +Convolution fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=73728 8=2 +BatchNorm fire11/squeeze1x1/bn 1 1 fire11/squeeze1x1 fire11/squeeze1x1_fire11/squeeze1x1/bn 0=96 +Scale fire11/squeeze1x1/scale 1 1 fire11/squeeze1x1_fire11/squeeze1x1/bn fire11/squeeze1x1_fire11/squeeze1x1/scale 0=96 1=1 +ReLU fire11/relu_squeeze1x1 1 1 fire11/squeeze1x1_fire11/squeeze1x1/scale fire11/squeeze1x1_fire11/relu_squeeze1x1 +Split splitncnn_13 1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 +Convolution fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2 +BatchNorm fire11/expand1x1/bn 1 1 fire11/expand1x1 fire11/expand1x1_fire11/expand1x1/bn 0=384 +Scale fire11/expand1x1/scale 1 1 fire11/expand1x1_fire11/expand1x1/bn fire11/expand1x1_fire11/expand1x1/scale 0=384 1=1 +ReLU fire11/relu_expand1x1 1 1 fire11/expand1x1_fire11/expand1x1/scale fire11/expand1x1_fire11/relu_expand1x1 +Convolution fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2 +BatchNorm fire11/expand3x3/bn 1 1 fire11/expand3x3 fire11/expand3x3_fire11/expand3x3/bn 0=384 +Scale fire11/expand3x3/scale 1 1 fire11/expand3x3_fire11/expand3x3/bn fire11/expand3x3_fire11/expand3x3/scale 0=384 1=1 +ReLU fire11/relu_expand3x3 1 1 fire11/expand3x3_fire11/expand3x3/scale fire11/expand3x3_fire11/relu_expand3x3 +Concat fire11/concat 2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat 0=0 +Split splitncnn_14 1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3 +Convolution conv12_1 1 1 fire11/concat_splitncnn_3 conv12_1 0=128 1=1 2=1 3=1 4=0 5=0 6=98304 8=2 +BatchNorm conv12_1/bn 1 1 conv12_1 conv12_1_conv12_1/bn 0=128 +Scale conv12_1/scale 1 1 conv12_1_conv12_1/bn conv12_1_conv12_1/scale 0=128 1=1 +ReLU conv12_1/relu 1 1 conv12_1_conv12_1/scale conv12_1_conv12_1/relu +Convolution conv12_2 1 1 conv12_1_conv12_1/relu conv12_2 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2 +BatchNorm conv12_2/bn 1 1 conv12_2 conv12_2_conv12_2/bn 0=256 +Scale conv12_2/scale 1 1 conv12_2_conv12_2/bn conv12_2_conv12_2/scale 0=256 1=1 +ReLU conv12_2/relu 1 1 conv12_2_conv12_2/scale conv12_2_conv12_2/relu +Split splitncnn_15 1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3 +Convolution conv13_1 1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2 +BatchNorm conv13_1/bn 1 1 conv13_1 conv13_1_conv13_1/bn 0=64 +Scale conv13_1/scale 1 1 conv13_1_conv13_1/bn conv13_1_conv13_1/scale 0=64 1=1 +ReLU conv13_1/relu 1 1 conv13_1_conv13_1/scale conv13_1_conv13_1/relu +Convolution conv13_2 1 1 conv13_1_conv13_1/relu conv13_2 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2 +BatchNorm conv13_2/bn 1 1 conv13_2 conv13_2_conv13_2/bn 0=128 +Scale conv13_2/scale 1 1 conv13_2_conv13_2/bn conv13_2_conv13_2/scale 0=128 1=1 +ReLU conv13_2/relu 1 1 conv13_2_conv13_2/scale conv13_2_conv13_2/relu +Split splitncnn_16 1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2 +BatchNorm fire5/bn 1 1 fire5/concat_splitncnn_0 fire5/normal 0=256 +Scale fire5/scale 1 1 fire5/normal fire5/normal_fire5/scale 0=256 1=1 +Split splitncnn_17 1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2 +Convolution fire5_mbox_loc 1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +Permute fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3 +Flatten fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat +Convolution fire5_mbox_conf 1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=193536 8=2 +Permute fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3 +Flatten fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat +PriorBox fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000 +Convolution fire9_mbox_loc 1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=110592 8=2 +Permute fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3 +Flatten fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat +Convolution fire9_mbox_conf 1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=580608 8=2 +Permute fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3 +Flatten fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat +PriorBox fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000 +Convolution fire10_mbox_loc 1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2 +Permute fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3 +Flatten fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat +Convolution fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2 +Permute fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3 +Flatten fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat +PriorBox fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000 +Convolution fire11_mbox_loc 1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2 +Permute fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3 +Flatten fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat +Convolution fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2 +Permute fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3 +Flatten fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat +PriorBox fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000 +Convolution conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=55296 8=2 +Permute conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3 +Flatten conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat +Convolution conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=290304 8=2 +Permute conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3 +Flatten conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat +PriorBox conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000 +Convolution conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=18432 8=2 +Permute conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3 +Flatten conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat +Convolution conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=96768 8=2 +Permute conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3 +Flatten conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat +PriorBox conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000 +Concat mbox_loc 6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc 0=0 +Concat mbox_conf 6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf 0=0 +Concat mbox_priorbox 6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1 +Reshape mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0 +Softmax mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1 +Flatten mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten +DetectionOutput detection_out 3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.050000 diff --git a/benchmark/vgg16_int8.param b/benchmark/vgg16_int8.param new file mode 100755 index 000000000..110818999 --- /dev/null +++ b/benchmark/vgg16_int8.param @@ -0,0 +1,42 @@ +7767517 +40 40 +Input data 0 1 data 0=224 1=224 2=3 +Convolution conv1_1 1 1 data conv1_1 0=64 1=3 2=1 3=1 4=1 5=1 6=1728 8=2 +ReLU relu1_1 1 1 conv1_1 conv1_1_relu1_1 +Convolution conv1_2 1 1 conv1_1_relu1_1 conv1_2 0=64 1=3 2=1 3=1 4=1 5=1 6=36864 8=2 +ReLU relu1_2 1 1 conv1_2 conv1_2_relu1_2 +Pooling pool1 1 1 conv1_2_relu1_2 pool1 0=0 1=2 2=2 3=0 4=0 +Convolution conv2_1 1 1 pool1 conv2_1 0=128 1=3 2=1 3=1 4=1 5=1 6=73728 8=2 +ReLU relu2_1 1 1 conv2_1 conv2_1_relu2_1 +Convolution conv2_2 1 1 conv2_1_relu2_1 conv2_2 0=128 1=3 2=1 3=1 4=1 5=1 6=147456 8=2 +ReLU relu2_2 1 1 conv2_2 conv2_2_relu2_2 +Pooling pool2 1 1 conv2_2_relu2_2 pool2 0=0 1=2 2=2 3=0 4=0 +Convolution conv3_1 1 1 pool2 conv3_1 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2 +ReLU relu3_1 1 1 conv3_1 conv3_1_relu3_1 +Convolution conv3_2 1 1 conv3_1_relu3_1 conv3_2 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2 +ReLU relu3_2 1 1 conv3_2 conv3_2_relu3_2 +Convolution conv3_3 1 1 conv3_2_relu3_2 conv3_3 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2 +ReLU relu3_3 1 1 conv3_3 conv3_3_relu3_3 +Pooling pool3 1 1 conv3_3_relu3_3 pool3 0=0 1=2 2=2 3=0 4=0 +Convolution conv4_1 1 1 pool3 conv4_1 0=512 1=3 2=1 3=1 4=1 5=1 6=1179648 8=2 +ReLU relu4_1 1 1 conv4_1 conv4_1_relu4_1 +Convolution conv4_2 1 1 conv4_1_relu4_1 conv4_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 +ReLU relu4_2 1 1 conv4_2 conv4_2_relu4_2 +Convolution conv4_3 1 1 conv4_2_relu4_2 conv4_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 +ReLU relu4_3 1 1 conv4_3 conv4_3_relu4_3 +Pooling pool4 1 1 conv4_3_relu4_3 pool4 0=0 1=2 2=2 3=0 4=0 +Convolution conv5_1 1 1 pool4 conv5_1 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 +ReLU relu5_1 1 1 conv5_1 conv5_1_relu5_1 +Convolution conv5_2 1 1 conv5_1_relu5_1 conv5_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 +ReLU relu5_2 1 1 conv5_2 conv5_2_relu5_2 +Convolution conv5_3 1 1 conv5_2_relu5_2 conv5_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2 +ReLU relu5_3 1 1 conv5_3 conv5_3_relu5_3 +Pooling pool5 1 1 conv5_3_relu5_3 pool5 0=0 1=2 2=2 3=0 4=0 +InnerProduct fc6 1 1 pool5 fc6 0=4096 1=1 2=102760448 +ReLU relu6 1 1 fc6 fc6_relu6 +Dropout drop6 1 1 fc6_relu6 fc6_drop6 +InnerProduct fc7 1 1 fc6_drop6 fc7 0=4096 1=1 2=16777216 +ReLU relu7 1 1 fc7 fc7_relu7 +Dropout drop7 1 1 fc7_relu7 fc7_drop7 +InnerProduct fc8 1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000 +Softmax prob 1 1 fc8 prob 0=0 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5c2d395b0..092ef292d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -183,6 +183,7 @@ ncnn_add_layer(Yolov3DetectionOutput) ncnn_add_layer(PSROIPooling) ncnn_add_layer(ROIAlign OFF) ncnn_add_layer(Packing) +ncnn_add_layer(Requantize) # message("SHADER_SPV_HEX_FILES = ${SHADER_SPV_HEX_FILES}") add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES}) diff --git a/src/benchmark.cpp b/src/benchmark.cpp index 2898228ad..2e0bb0afa 100644 --- a/src/benchmark.cpp +++ b/src/benchmark.cpp @@ -55,14 +55,14 @@ double get_current_time() void benchmark(const Layer* layer, double start, double end) { - fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); + fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); fprintf(stderr, " |"); fprintf(stderr, "\n"); } void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end) { - fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); + fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start); fprintf(stderr, " | feature_map: %4d x %-4d inch: %4d outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c); if (layer->type == "Convolution") { diff --git a/src/layer/arm/convolution_1x1_int8.h b/src/layer/arm/convolution_1x1_int8.h index 70d8f6b25..5af98e42c 100644 --- a/src/layer/arm/convolution_1x1_int8.h +++ b/src/layer/arm/convolution_1x1_int8.h @@ -65,4097 +65,896 @@ static void conv1x1s1_sgemm_transform_kernel_int8_neon(const Mat& _kernel, Mat& #if __aarch64__ /* - * Convolution 1x1 quantized with int8,unroll 16 x 8 + * Convolution 1x1 quantized with sgemm int8 */ -static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt) { + int w = bottom_blob.w; + int h = bottom_blob.h; int inch = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; int outch = top_blob.c; - const signed char* kernel = _kernel; - - int nn_outch = 0; - int remain_outch_start = 0; - - nn_outch = outch >> 3; - remain_outch_start = nn_outch << 3; + const int size = w * h; - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp> 3; + int remain_size_start = nn_size << 3; - Mat out0 = top_blob.channel(p); - Mat out1 = top_blob.channel(p+1); - Mat out2 = top_blob.channel(p+2); - Mat out3 = top_blob.channel(p+3); - Mat out4 = top_blob.channel(p+4); - Mat out5 = top_blob.channel(p+5); - Mat out6 = top_blob.channel(p+6); - Mat out7 = top_blob.channel(p+7); - - out0.fill(0); - out1.fill(0); - out2.fill(0); - out3.fill(0); - out4.fill(0); - out5.fill(0); - out6.fill(0); - out7.fill(0); - - int q = 0; - -#ifdef __clang__ - for (; q+15> 4; - int remain = size & 15; - - int8x16_t _k0 = vld1q_s8(kernel0); - int8x16_t _k1 = vld1q_s8(kernel1); - int8x16_t _k2 = vld1q_s8(kernel2); - int8x16_t _k3 = vld1q_s8(kernel3); - int8x16_t _k4 = vld1q_s8(kernel4); - int8x16_t _k5 = vld1q_s8(kernel5); - int8x16_t _k6 = vld1q_s8(kernel6); - int8x16_t _k7 = vld1q_s8(kernel7); - - if (nn > 0) - { - asm volatile( - "prfm pldl1keep, [%9, #128] \n" - "prfm pldl1keep, [%10, #128] \n" - "prfm pldl1keep, [%11, #128] \n" - "prfm pldl1keep, [%12, #128] \n" - "ld1 {v8.16b}, [%9], #16 \n" // r0" - "ld1 {v9.16b}, [%10], #16 \n" // r1" - "ld1 {v10.16b}, [%11], #16 \n" // r2" - "ld1 {v11.16b}, [%12], #16 \n" // r3" - - "dup v24.16b, %50.b[0] \n" // k00 - "dup v25.16b, %50.b[1] \n" // k01 - "dup v26.16b, %50.b[2] \n" // k02 - "dup v27.16b, %50.b[3] \n" // k03 - - "0: \n" - "smull v28.8h, v8.8b, v24.8b \n" // r0 * k0 - "smull2 v31.8h, v8.16b, v24.16b \n" // r0n * k0 - "prfm pldl1keep, [%13, #128] \n" - "prfm pldl1keep, [%14, #128] \n" - "prfm pldl1keep, [%15, #128] \n" - - "smlal v28.8h, v9.8b, v25.8b \n" // r0 * k1 - "smlal2 v31.8h, v9.16b, v25.16b \n" // r0n * k1 - "prfm pldl1keep, [%16, #128] \n" - "ld1 {v12.16b}, [%13], #16 \n" // r4" - "ld1 {v13.16b}, [%14], #16 \n" // r5" - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "ld1 {v14.16b}, [%15], #16 \n" // r6" - "ld1 {v15.16b}, [%16], #16 \n" // r7" - "dup v24.16b, %50.b[4] \n" // k04 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v25.16b, %50.b[5] \n" // k05 - "dup v26.16b, %50.b[6] \n" // k06 - "dup v27.16b, %50.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" // r4 - "smlal2 v31.8h, v12.16b, v24.16b \n" // r4 - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v29.4s, v30.4s}, [%1] \n" // sum0 - "prfm pldl1keep, [%17, #128] \n" - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - "prfm pldl1keep, [%20, #128] \n" - "ld1 {v16.16b}, [%17], #16 \n" // r8" - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "ld1 {v17.16b}, [%18], #16 \n" // r9" - "ld1 {v18.16b}, [%19], #16 \n" // r10" - "ld1 {v19.16b}, [%20], #16 \n" // r11" - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v24.16b, %50.b[8] \n" // k08 - "dup v25.16b, %50.b[9] \n" // k09 - "dup v26.16b, %50.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" // r8 - "smlal2 v31.8h, v16.16b, v24.16b \n" // r8 - "dup v27.16b, %50.b[11] \n" // k11 - "prfm pldl1keep, [%21, #128] \n" - "prfm pldl1keep, [%22, #128] \n" - - "smlal v28.8h, v17.8b, v25.8b \n" - "smlal2 v31.8h, v17.16b, v25.16b \n" - "prfm pldl1keep, [%23, #128] \n" - "prfm pldl1keep, [%24, #128] \n" - "ld1 {v20.16b}, [%21], #16 \n" // r12" - - "smlal v28.8h, v18.8b, v26.8b \n" - "smlal2 v31.8h, v18.16b, v26.16b \n" - "ld1 {v21.16b}, [%22], #16 \n" // r13" - "ld1 {v22.16b}, [%23], #16 \n" // r14" - "ld1 {v23.16b}, [%24], #16 \n" // r15" - - "smlal v28.8h, v19.8b, v27.8b \n" - "smlal2 v31.8h, v19.16b, v27.16b \n" - "dup v24.16b, %50.b[12] \n" // k12 - "dup v25.16b, %50.b[13] \n" // k13 - "dup v26.16b, %50.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" // r12 - "smlal2 v31.8h, v20.16b, v24.16b \n" // r12 - "dup v27.16b, %50.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "smlal2 v31.8h, v21.16b, v25.16b \n" - "dup v24.16b, %51.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "smlal2 v31.8h, v22.16b, v26.16b \n" - "dup v25.16b, %51.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "smlal2 v31.8h, v23.16b, v27.16b \n" - "dup v26.16b, %51.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "saddw2 v30.4s, v30.4s, v28.8h \n" - - "dup v27.16b, %51.b[3] \n" // k03 - - "st1 {v29.4s, v30.4s}, [%1], #32 \n" // sum0 - - "ld1 {v29.4s, v30.4s}, [%1] \n" // sum0 - "saddw v29.4s, v29.4s, v31.4h \n" - "saddw2 v30.4s, v30.4s, v31.8h \n" - "st1 {v29.4s, v30.4s}, [%1], #32 \n" // sum0 - //########################################### - "smull v28.8h, v8.8b, v24.8b \n" - "smull2 v31.8h, v8.16b, v24.16b \n" - "dup v24.16b, %51.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "smlal2 v31.8h, v9.16b, v25.16b \n" - "dup v25.16b, %51.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "dup v26.16b, %51.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v27.16b, %51.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "smlal2 v31.8h, v12.16b, v24.16b \n" - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v29.4s, v30.4s}, [%2] \n" // sum1 - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "dup v24.16b, %51.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "dup v25.16b, %51.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v26.16b, %51.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "smlal2 v31.8h, v16.16b, v24.16b \n" - "dup v27.16b, %51.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "smlal2 v31.8h, v17.16b, v25.16b \n" - "dup v24.16b, %51.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "smlal2 v31.8h, v18.16b, v26.16b \n" - "dup v25.16b, %51.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "smlal2 v31.8h, v19.16b, v27.16b \n" - "dup v26.16b, %51.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "smlal2 v31.8h, v20.16b, v24.16b \n" - "dup v27.16b, %51.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "smlal2 v31.8h, v21.16b, v25.16b \n" - "dup v24.16b, %52.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "smlal2 v31.8h, v22.16b, v26.16b \n" - "dup v25.16b, %52.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "smlal2 v31.8h, v23.16b, v27.16b \n" - - "saddw v29.4s, v29.4s, v28.4h \n" - "saddw2 v30.4s, v30.4s, v28.8h \n" - - "dup v26.16b, %52.b[2] \n" // k02 - "dup v27.16b, %52.b[3] \n" // k03 - - "st1 {v29.4s, v30.4s}, [%2], #32 \n" - - "ld1 {v29.4s, v30.4s}, [%2] \n" // sum1 - "saddw v29.4s, v29.4s, v31.4h \n" - "saddw2 v30.4s, v30.4s, v31.8h \n" - "st1 {v29.4s, v30.4s}, [%2], #32 \n" - //########################################### // sum1 - - "smull v28.8h, v8.8b, v24.8b \n" - "smull2 v31.8h, v8.16b, v24.16b \n" - "dup v24.16b, %52.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "smlal2 v31.8h, v9.16b, v25.16b \n" - "dup v25.16b, %52.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "dup v26.16b, %52.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v27.16b, %52.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "smlal2 v31.8h, v12.16b, v24.16b \n" - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v29.4s, v30.4s}, [%3] \n" // sum2 - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "dup v24.16b, %52.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "dup v25.16b, %52.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v26.16b, %52.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "smlal2 v31.8h, v16.16b, v24.16b \n" - "dup v27.16b, %52.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "smlal2 v31.8h, v17.16b, v25.16b \n" - "dup v24.16b, %52.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "smlal2 v31.8h, v18.16b, v26.16b \n" - "dup v25.16b, %52.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "smlal2 v31.8h, v19.16b, v27.16b \n" - "dup v26.16b, %52.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "smlal2 v31.8h, v20.16b, v24.16b \n" - "dup v27.16b, %52.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "smlal2 v31.8h, v21.16b, v25.16b \n" - "dup v24.16b, %53.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "smlal2 v31.8h, v22.16b, v26.16b \n" - "dup v25.16b, %53.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "smlal2 v31.8h, v23.16b, v27.16b \n" - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v26.16b, %53.b[2] \n" // k02 - - "saddw2 v30.4s, v30.4s, v28.8h \n" - "dup v27.16b, %53.b[3] \n" // k03 - - "st1 {v29.4s, v30.4s}, [%3], #32 \n" - - "ld1 {v29.4s, v30.4s}, [%3] \n" // sum2 - "saddw v29.4s, v29.4s, v31.4h \n" - "saddw2 v30.4s, v30.4s, v31.8h \n" - "st1 {v29.4s, v30.4s}, [%3], #32 \n" - //########################################### //sum 2 - - "smull v28.8h, v8.8b, v24.8b \n" - "smull2 v31.8h, v8.16b, v24.16b \n" - "dup v24.16b, %53.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "smlal2 v31.8h, v9.16b, v25.16b \n" - "dup v25.16b, %53.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "dup v26.16b, %53.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v27.16b, %53.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "smlal2 v31.8h, v12.16b, v24.16b \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v29.4s, v30.4s}, [%4] \n" // sum3 - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "dup v24.16b, %53.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "dup v25.16b, %53.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v26.16b, %53.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "smlal2 v31.8h, v16.16b, v24.16b \n" - "dup v27.16b, %53.b[11] \n" // k11 + int i = ii * 8; - "smlal v28.8h, v17.8b, v25.8b \n" - "smlal2 v31.8h, v17.16b, v25.16b \n" - "dup v24.16b, %53.b[12] \n" // k12 + const signed char* img0 = bottom_blob.channel(0); + img0 += i; - "smlal v28.8h, v18.8b, v26.8b \n" - "smlal2 v31.8h, v18.16b, v26.16b \n" - "dup v25.16b, %53.b[13] \n" // k13 + signed char* tmpptr = tmp.channel(i/8); - "smlal v28.8h, v19.8b, v27.8b \n" - "smlal2 v31.8h, v19.16b, v27.16b \n" - "dup v26.16b, %53.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "smlal2 v31.8h, v20.16b, v24.16b \n" - "dup v27.16b, %53.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "smlal2 v31.8h, v21.16b, v25.16b \n" - "dup v24.16b, %54.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "smlal2 v31.8h, v22.16b, v26.16b \n" - "dup v25.16b, %54.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "smlal2 v31.8h, v23.16b, v27.16b \n" - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v26.16b, %54.b[2] \n" // k02 - - "saddw2 v30.4s, v30.4s, v28.8h \n" - - "dup v27.16b, %54.b[3] \n" // k03 - - "st1 {v29.4s, v30.4s}, [%4], #32 \n" - - "ld1 {v29.4s, v30.4s}, [%4] \n" // sum3 - "saddw v29.4s, v29.4s, v31.4h \n" - "saddw2 v30.4s, v30.4s, v31.8h \n" - "st1 {v29.4s, v30.4s}, [%4], #32 \n" - //########################################### // sum3 - "smull v28.8h, v8.8b, v24.8b \n" - "smull2 v31.8h, v8.16b, v24.16b \n" - "dup v24.16b, %54.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "smlal2 v31.8h, v9.16b, v25.16b \n" - "dup v25.16b, %54.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "dup v26.16b, %54.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v27.16b, %54.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "smlal2 v31.8h, v12.16b, v24.16b \n" - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v29.4s, v30.4s}, [%5] \n" // sum4 - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "dup v24.16b, %54.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "dup v25.16b, %54.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v26.16b, %54.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "smlal2 v31.8h, v16.16b, v24.16b \n" - "dup v27.16b, %54.b[11] \n" // k11 + for (int q=0; q> 2; - "smlal v28.8h, v19.8b, v27.8b \n" - "smlal2 v31.8h, v19.16b, v27.16b \n" - "dup v26.16b, %54.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "smlal2 v31.8h, v20.16b, v24.16b \n" - "dup v27.16b, %54.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "smlal2 v31.8h, v21.16b, v25.16b \n" - "dup v24.16b, %55.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "smlal2 v31.8h, v22.16b, v26.16b \n" - "dup v25.16b, %55.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "smlal2 v31.8h, v23.16b, v27.16b \n" - "dup v26.16b, %55.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.16b, %55.b[3] \n" // k03 - - "saddw2 v30.4s, v30.4s, v28.8h \n" - - "st1 {v29.4s, v30.4s}, [%5], #32 \n" - - "ld1 {v29.4s, v30.4s}, [%5] \n" // sum4 - "saddw v29.4s, v29.4s, v31.4h \n" - "saddw2 v30.4s, v30.4s, v31.8h \n" - "st1 {v29.4s, v30.4s}, [%5], #32 \n" - //########################################### // sum4 - "smull v28.8h, v8.8b, v24.8b \n" - "smull2 v31.8h, v8.16b, v24.16b \n" - "dup v24.16b, %55.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "smlal2 v31.8h, v9.16b, v25.16b \n" - "dup v25.16b, %55.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "smlal2 v31.8h, v10.16b, v26.16b \n" - "dup v26.16b, %55.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "smlal2 v31.8h, v11.16b, v27.16b \n" - "dup v27.16b, %55.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "smlal2 v31.8h, v12.16b, v24.16b \n" - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v29.4s, v30.4s}, [%6] \n" // sum5 - - "smlal v28.8h, v13.8b, v25.8b \n" - "smlal2 v31.8h, v13.16b, v25.16b \n" - "dup v24.16b, %55.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "smlal2 v31.8h, v14.16b, v26.16b \n" - "dup v25.16b, %55.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "smlal2 v31.8h, v15.16b, v27.16b \n" - "dup v26.16b, %55.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "smlal2 v31.8h, v16.16b, v24.16b \n" - "dup v27.16b, %55.b[11] \n" // k11 + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii=0; ii= 8) - { - remain -= 8; + // sgemm process + int nn_outch = 0; + int remain_outch_start = 0; - asm volatile( - "prfm pldl1keep, [%9, #128] \n" - "prfm pldl1keep, [%10, #128] \n" - "prfm pldl1keep, [%11, #128] \n" - "prfm pldl1keep, [%12, #128] \n" - "ld1 {v8.8b}, [%9], #8 \n" // r0" - "ld1 {v9.8b}, [%10], #8 \n" // r1" - "ld1 {v10.8b}, [%11], #8 \n" // r2" - "ld1 {v11.8b}, [%12], #8 \n" // r3" - - "dup v24.8b, %50.b[0] \n" // k00 - "dup v25.8b, %50.b[1] \n" // k01 - "dup v26.8b, %50.b[2] \n" // k02 - "dup v27.8b, %50.b[3] \n" // k03 - - "smull v28.8h, v8.8b, v24.8b \n" // r0 - "prfm pldl1keep, [%13, #128] \n" - "prfm pldl1keep, [%14, #128] \n" - "prfm pldl1keep, [%15, #128] \n" - - "smlal v28.8h, v9.8b, v25.8b \n" - "prfm pldl1keep, [%16, #128] \n" - "ld1 {v12.8b}, [%13], #8 \n" // r4" - "ld1 {v13.8b}, [%14], #8 \n" // r5" - - "smlal v28.8h, v10.8b, v26.8b \n" - "ld1 {v14.8b}, [%15], #8 \n" // r6" - "ld1 {v15.8b}, [%16], #8 \n" // r7" - "dup v24.8b, %50.b[4] \n" // k04 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v25.8b, %50.b[5] \n" // k05 - "dup v26.8b, %50.b[6] \n" // k06 - "dup v27.8b, %50.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" // r4 - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v29.4s, v30.4s}, [%1] \n" // sum0 - "prfm pldl1keep, [%17, #128] \n" - - "smlal v28.8h, v13.8b, v25.8b \n" - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - "prfm pldl1keep, [%20, #128] \n" - "ld1 {v16.8b}, [%17], #8 \n" // r8" - - "smlal v28.8h, v14.8b, v26.8b \n" - "ld1 {v17.8b}, [%18], #8 \n" // r9" - "ld1 {v18.8b}, [%19], #8 \n" // r10" - "ld1 {v19.8b}, [%20], #8 \n" // r11" - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v24.8b, %50.b[8] \n" // k08 - "dup v25.8b, %50.b[9] \n" // k09 - "dup v26.8b, %50.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" // r8 - "dup v27.8b, %50.b[11] \n" // k11 - "prfm pldl1keep, [%21, #128] \n" - "prfm pldl1keep, [%22, #128] \n" + nn_outch = (outch - remain_outch_start) >> 2; - "smlal v28.8h, v17.8b, v25.8b \n" - "prfm pldl1keep, [%23, #128] \n" - "prfm pldl1keep, [%24, #128] \n" - "ld1 {v20.8b}, [%21], #8 \n" // r12" + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%4, #128] \n" + "vld1.s8 {d4-d7}, [%4]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q5, d7 \n"// a30-a37 + "vmovl.s8 q4, d6 \n"// a20-a27 + "vmovl.s8 q3, d5 \n"// a10-a17 + "vmovl.s8 q2, d4 \n"// a00-a07 - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %51.b[1] \n" // k01 + "vld1.s8 {d0-d1}, [%5]! \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33 k(outch)(inch) + "vmovl.s8 q1, d1 \n"// k02-k32,k03-k33 + "vmovl.s8 q0, d0 \n"// k00-k30,k01-k31 - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %51.b[2] \n" // k02 + "vmlal.s16 q6, d4, d0[0] \n"// sum0 = (a00-a07) * k00 + "vmlal.s16 q7, d5, d0[0] \n" + "vmlal.s16 q8, d4, d0[1] \n"// sum1 = (a00-a07) * k10 + "vmlal.s16 q9, d5, d0[1] \n" + "vmlal.s16 q10, d4, d0[2] \n"// sum2 = (a00-a07) * k20 + "vmlal.s16 q11, d5, d0[2] \n" + "vmlal.s16 q12, d4, d0[3] \n"// sum3 = (a00-a07) * k30 + "vmlal.s16 q13, d5, d0[3] \n" - "saddw v29.4s, v29.4s, v28.4h \n" - "saddw2 v30.4s, v30.4s, v28.8h \n" + "vmlal.s16 q6, d6, d1[0] \n"// sum0 += (a10-a17) * k01 + "vmlal.s16 q7, d7, d1[0] \n" + "vmlal.s16 q8, d6, d1[1] \n"// sum1 += (a10-a17) * k11 + "vmlal.s16 q9, d7, d1[1] \n" + "vmlal.s16 q10, d6, d1[2] \n"// sum2 += (a10-a17) * k21 + "vmlal.s16 q11, d7, d1[2] \n" + "vmlal.s16 q12, d6, d1[3] \n"// sum3 += (a10-a17) * k31 + "vmlal.s16 q13, d7, d1[3] \n" - "dup v27.8b, %51.b[3] \n" // k03 + "vmlal.s16 q6, d8, d2[0] \n"// sum0 += (a20-a27) * k02 + "vmlal.s16 q7, d9, d2[0] \n" + "vmlal.s16 q8, d8, d2[1] \n"// sum1 += (a20-a27) * k12 + "vmlal.s16 q9, d9, d2[1] \n" + "vmlal.s16 q10, d8, d2[2] \n"// sum2 += (a20-a27) * k22 + "vmlal.s16 q11, d9, d2[2] \n" + "vmlal.s16 q12, d8, d2[3] \n"// sum3 += (a20-a27) * k32 + "vmlal.s16 q13, d9, d2[3] \n" - "st1 {v29.4s, v30.4s}, [%1], #32 \n" // sum0 - //########################################### - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %51.b[4] \n" // k04 + "vmlal.s16 q6, d10, d3[0] \n"// sum0 += (a30-a37) * k03 + "vmlal.s16 q7, d11, d3[0] \n" + "vmlal.s16 q8, d10, d3[1] \n"// sum1 += (a30-a37) * k13 + "vmlal.s16 q9, d11, d3[1] \n" + "vmlal.s16 q10, d10, d3[2] \n"// sum2 += (a30-a37) * k23 + "vmlal.s16 q11, d11, d3[2] \n" + "vmlal.s16 q12, d10, d3[3] \n"// sum3 += (a30-a37) * k33 + "vmlal.s16 q13, d11, d3[3] \n" - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %51.b[5] \n" // k05 + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %12, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %51.b[6] \n" // k06 + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%4]! \n"// tmpr a00-a07 a(inch)(data) + "vld1.s8 {d0}, [%5] \n"// kptr k00-k30 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %5, #4 \n" - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %51.b[7] \n" // k07 + "vmlal.s16 q6, d2, d0[0] \n"// sum0 += (a00-a07) * k00 + "vmlal.s16 q7, d3, d0[0] \n" + "vmlal.s16 q8, d2, d0[1] \n"// sum1 += (a00-a07) * k10 + "vmlal.s16 q9, d3, d0[1] \n" + "vmlal.s16 q10, d2, d0[2] \n"// sum2 += (a00-a07) * k20 + "vmlal.s16 q11, d3, d0[2] \n" + "vmlal.s16 q12, d2, d0[3] \n"// sum3 += (a00-a07) * k30 + "vmlal.s16 q13, d3, d0[3] \n" - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v29.4s, v30.4s}, [%2] \n" // sum1 + "subs r4, r4, #1 \n" + "bne 2b \n" - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %51.b[8] \n" // k08 + "3: \n"// store the result to memory + "vst1.s32 {d12-d15}, [%0]! \n" + "vst1.s32 {d16-d19}, [%1]! \n" + "vst1.s32 {d20-d23}, [%2]! \n" + "vst1.s32 {d24-d27}, [%3]! \n" - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %51.b[9] \n" // k09 + : "=r"(outptr0), // %0 + "=r"(outptr1), // %1 + "=r"(outptr2), // %2 + "=r"(outptr3), // %3 + "=r"(tmpptr), // %4 + "=r"(kptr) // %5 + : "0"(outptr0), + "1"(outptr1), + "2"(outptr2), + "3"(outptr3), + "4"(tmpptr), + "5"(kptr), + "r"(inch) // %12 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#else + int sum0_0 = 0; + int sum0_1 = 0; + int sum0_2 = 0; + int sum0_3 = 0; + int sum0_4 = 0; + int sum0_5 = 0; + int sum0_6 = 0; + int sum0_7 = 0; + + int sum1_0 = 0; + int sum1_1 = 0; + int sum1_2 = 0; + int sum1_3 = 0; + int sum1_4 = 0; + int sum1_5 = 0; + int sum1_6 = 0; + int sum1_7 = 0; + + int sum2_0 = 0; + int sum2_1 = 0; + int sum2_2 = 0; + int sum2_3 = 0; + int sum2_4 = 0; + int sum2_5 = 0; + int sum2_6 = 0; + int sum2_7 = 0; + + int sum3_0 = 0; + int sum3_1 = 0; + int sum3_2 = 0; + int sum3_3 = 0; + int sum3_4 = 0; + int sum3_5 = 0; + int sum3_6 = 0; + int sum3_7 = 0; - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %51.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %51.b[11] \n" // k11 + for (int q=0; q> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%4, #128] \n" + "vld1.s8 {d4-d5}, [%4]! \n"// tmpr a00-a03,a10-a13,a20-a23,a30-a33 a(inch)(data) + "vmovl.s8 q3, d5 \n"// a20-a23,a30-a33 + "vmovl.s8 q2, d4 \n"// a00-a04,a10-a14 - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %52.b[6] \n" // k06 + "vld1.s8 {d0-d1}, [%5]! \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33 k(outch)(inch) + "vmovl.s8 q1, d1 \n"// k02-k32,k03-k33 + "vmovl.s8 q0, d0 \n"// k00-k30,k01-k31 - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %52.b[7] \n" // k07 + "vmlal.s16 q6, d4, d0[0] \n"// sum0 = (a00-a03) * k00 + "vmlal.s16 q7, d4, d0[1] \n"// sum1 = (a00-a03) * k10 + "vmlal.s16 q8, d4, d0[2] \n"// sum2 = (a00-a03) * k20 + "vmlal.s16 q9, d4, d0[3] \n"// sum3 = (a00-a03) * k30 - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v29.4s, v30.4s}, [%3] \n" // sum2 + "vmlal.s16 q6, d5, d1[0] \n"// sum0 += (a10-a13) * k01 + "vmlal.s16 q7, d5, d1[1] \n"// sum1 += (a10-a13) * k11 + "vmlal.s16 q8, d5, d1[2] \n"// sum2 += (a10-a13) * k21 + "vmlal.s16 q9, d5, d1[3] \n"// sum3 += (a10-a13) * k31 - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %52.b[8] \n" // k08 + "vmlal.s16 q6, d6, d2[0] \n"// sum0 += (a20-a23) * k02 + "vmlal.s16 q7, d6, d2[1] \n"// sum1 += (a20-a23) * k12 + "vmlal.s16 q8, d6, d2[2] \n"// sum2 += (a20-a23) * k22 + "vmlal.s16 q9, d6, d2[3] \n"// sum3 += (a20-a23) * k32 - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %52.b[9] \n" // k09 + "vmlal.s16 q6, d7, d3[0] \n"// sum0 += (a30-a33) * k03 + "vmlal.s16 q7, d7, d3[1] \n"// sum1 += (a30-a33) * k13 + "vmlal.s16 q8, d7, d3[2] \n"// sum2 += (a30-a33) * k23 + "vmlal.s16 q9, d7, d3[3] \n"// sum3 += (a30-a33) * k33 - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %52.b[10] \n" // k10 + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %12, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %52.b[11] \n" // k11 + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%4] \n"// tmpr a00-a03 a(inch)(data) + "vld1.s8 {d0}, [%5] \n"// kptr k00-k30 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %4, #4 \n" + "add %5, #4 \n" - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %52.b[12] \n" // k12 + "vmlal.s16 q6, d2, d0[0] \n"// sum0 += (a00-a03) * k00 + "vmlal.s16 q7, d2, d0[1] \n"// sum1 += (a00-a03) * k10 + "vmlal.s16 q8, d2, d0[2] \n"// sum2 += (a00-a03) * k20 + "vmlal.s16 q9, d2, d0[3] \n"// sum3 += (a00-a03) * k30 - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %52.b[13] \n" // k13 + "subs r4, r4, #1 \n" + "bne 2b \n" - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %52.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %52.b[15] \n" // k15 + "3: \n"// store the result to memory + "vst1.s32 {d12-d13}, [%0]! \n" + "vst1.s32 {d14-d15}, [%1]! \n" + "vst1.s32 {d16-d17}, [%2]! \n" + "vst1.s32 {d18-d19}, [%3]! \n" - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %53.b[0] \n" // k00 + : "=r"(outptr0), // %0 + "=r"(outptr1), // %1 + "=r"(outptr2), // %2 + "=r"(outptr3), // %3 + "=r"(tmpptr), // %4 + "=r"(kptr) // %5 + : "0"(outptr0), + "1"(outptr1), + "2"(outptr2), + "3"(outptr3), + "4"(tmpptr), + "5"(kptr), + "r"(inch) // %12 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#else + int sum0_0 = 0; + int sum0_1 = 0; + int sum0_2 = 0; + int sum0_3 = 0; + + int sum1_0 = 0; + int sum1_1 = 0; + int sum1_2 = 0; + int sum1_3 = 0; + + int sum2_0 = 0; + int sum2_1 = 0; + int sum2_2 = 0; + int sum2_3 = 0; + + int sum3_0 = 0; + int sum3_1 = 0; + int sum3_2 = 0; + int sum3_3 = 0; - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %53.b[1] \n" // k01 + for (int q=0; q> 2 + "cmp r4, #0 \n" + "beq 1f \n" - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %53.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %53.b[12] \n" // k12 + "0: \n"// for(; nn != 0; nn--) + "pld [%4, #128] \n" + "vld1.s8 {d4}, [%4] \n"// tmpr a00,a10,a20,a30 a(inch)(data) + "add %4, #4 \n" + "vmovl.s8 q2, d4 \n"// a00,a10,a20,a30 - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %53.b[13] \n" // k13 + "vld1.s8 {d0-d1}, [%5]! \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33 k(outch)(inch) + "vmovl.s8 q1, d1 \n"// k02-k32,k03-k33 + "vmovl.s8 q0, d0 \n"// k00-k30,k01-k31 - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %53.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %53.b[15] \n" // k15 + "vmlal.s16 q6, d0, d4[0] \n"// (k00-k30) * a00 + "vmlal.s16 q7, d1, d4[1] \n"// (k01-k31) * a10 + "vmlal.s16 q8, d2, d4[2] \n"// (k02-k32) * a20 + "vmlal.s16 q9, d3, d4[3] \n"// (k03-k33) * a30 - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %54.b[0] \n" // k00 + "subs r4, r4, #1 \n" + "bne 0b \n"// end for - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %54.b[1] \n" // k01 + "vadd.s32 q6, q6, q7 \n" + "vadd.s32 q9, q9, q8 \n" + "vadd.s32 q10, q6, q9 \n" + + "1: \n" + // remain loop + "and r4, %12, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" - "smlal v28.8h, v23.8b, v27.8b \n" - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v26.8b, %54.b[2] \n" // k02 + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%4] \n"// tmpr a00 a(inch)(data) + "vld1.s8 {d0}, [%5] \n"// kptr k00-k30 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %4, #1 \n" + "add %5, #4 \n" - "saddw2 v30.4s, v30.4s, v28.8h \n" + "vmlal.s16 q10, d0, d2[0] \n" - "dup v27.8b, %54.b[3] \n" // k03 + "subs r4, r4, #1 \n" + "bne 2b \n" - "st1 {v29.4s, v30.4s}, [%4], #32 \n" - //########################################### // sum3 - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %54.b[4] \n" // k04 + "3: \n"// store the result to memory + "vst1.s32 {d20[0]}, [%0]! \n" + "vst1.s32 {d20[1]}, [%1]! \n" + "vst1.s32 {d21[0]}, [%2]! \n" + "vst1.s32 {d21[1]}, [%3]! \n" - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %54.b[5] \n" // k05 + : "=r"(outptr0), // %0 + "=r"(outptr1), // %1 + "=r"(outptr2), // %2 + "=r"(outptr3), // %3 + "=r"(tmpptr), // %4 + "=r"(kptr) // %5 + : "0"(outptr0), + "1"(outptr1), + "2"(outptr2), + "3"(outptr3), + "4"(tmpptr), + "5"(kptr), + "r"(inch) // %12 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#else + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %54.b[6] \n" // k06 + for (int q=0; q= 4) - { - remain -= 4; - - asm volatile( - "prfm pldl1keep, [%9, #128] \n" - "prfm pldl1keep, [%10, #128] \n" - "prfm pldl1keep, [%11, #128] \n" - "prfm pldl1keep, [%12, #128] \n" - "ld1 {v8.8b}, [%9], #8 \n" // r0" - "ld1 {v9.8b}, [%10], #8 \n" // r1" - "ld1 {v10.8b}, [%11], #8 \n" // r2" - "ld1 {v11.8b}, [%12], #8 \n" // r3" - - "dup v24.8b, %50.b[0] \n" // k00 - "dup v25.8b, %50.b[1] \n" // k01 - "dup v26.8b, %50.b[2] \n" // k02 - "dup v27.8b, %50.b[3] \n" // k03 - - "smull v28.8h, v8.8b, v24.8b \n" // r0 - "prfm pldl1keep, [%13, #128] \n" - "prfm pldl1keep, [%14, #128] \n" - "prfm pldl1keep, [%15, #128] \n" - - "smlal v28.8h, v9.8b, v25.8b \n" - "prfm pldl1keep, [%16, #128] \n" - "ld1 {v12.8b}, [%13], #8 \n" // r4" - "ld1 {v13.8b}, [%14], #8 \n" // r5" - - "smlal v28.8h, v10.8b, v26.8b \n" - "ld1 {v14.8b}, [%15], #8 \n" // r6" - "ld1 {v15.8b}, [%16], #8 \n" // r7" - "dup v24.8b, %50.b[4] \n" // k04 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v25.8b, %50.b[5] \n" // k05 - "dup v26.8b, %50.b[6] \n" // k06 - "dup v27.8b, %50.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" // r4 - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v29.4s}, [%1] \n" // sum0 - "prfm pldl1keep, [%17, #128] \n" - - "smlal v28.8h, v13.8b, v25.8b \n" - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - "prfm pldl1keep, [%20, #128] \n" - "ld1 {v16.8b}, [%17], #8 \n" // r8" - - "smlal v28.8h, v14.8b, v26.8b \n" - "ld1 {v17.8b}, [%18], #8 \n" // r9" - "ld1 {v18.8b}, [%19], #8 \n" // r10" - "ld1 {v19.8b}, [%20], #8 \n" // r11" - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v24.8b, %50.b[8] \n" // k08 - "dup v25.8b, %50.b[9] \n" // k09 - "dup v26.8b, %50.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" // r8 - "dup v27.8b, %50.b[11] \n" // k11 - "prfm pldl1keep, [%21, #128] \n" - "prfm pldl1keep, [%22, #128] \n" - - "smlal v28.8h, v17.8b, v25.8b \n" - "prfm pldl1keep, [%23, #128] \n" - "prfm pldl1keep, [%24, #128] \n" - "ld1 {v20.8b}, [%21], #8 \n" // r12" - - "smlal v28.8h, v18.8b, v26.8b \n" - "ld1 {v21.8b}, [%22], #8 \n" // r13" - "ld1 {v22.8b}, [%23], #8 \n" // r14" - "ld1 {v23.8b}, [%24], #8 \n" // r15" - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v24.8b, %50.b[12] \n" // k12 - "dup v25.8b, %50.b[13] \n" // k13 - "dup v26.8b, %50.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" // r12 - "dup v27.8b, %50.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %51.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %51.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %51.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %51.b[3] \n" // k03 - - "st1 {v29.4s}, [%1], #16 \n" // sum0 - //########################################### - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %51.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %51.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %51.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %51.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v29.4s}, [%2] \n" // sum1 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %51.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %51.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %51.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %51.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %51.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %51.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %51.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %51.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %52.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %52.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %52.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %52.b[3] \n" // k03 - - "st1 {v29.4s}, [%2], #16 \n" - //########################################### // sum1 - - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %52.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %52.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %52.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %52.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v29.4s}, [%3] \n" // sum2 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %52.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %52.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %52.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %52.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %52.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %52.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %52.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %52.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %53.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %53.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %53.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %53.b[3] \n" // k03 - - "st1 {v29.4s}, [%3], #16 \n" - //########################################### //sum 2 - - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %53.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %53.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %53.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %53.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v29.4s}, [%4] \n" // sum3 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %53.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %53.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %53.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %53.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %53.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %53.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %53.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %53.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %54.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %54.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %54.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %54.b[3] \n" // k03 - - "st1 {v29.4s}, [%4], #16 \n" - //########################################### // sum3 - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %54.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %54.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %54.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %54.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v29.4s}, [%5] \n" // sum4 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %54.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %54.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %54.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %54.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %54.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %54.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %54.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %54.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %55.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %55.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %55.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %55.b[3] \n" // k03 - - "st1 {v29.4s}, [%5], #16 \n" - //########################################### // sum4 - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %55.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %55.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %55.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %55.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v29.4s}, [%6] \n" // sum5 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %55.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %55.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %55.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %55.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %55.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %55.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %55.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %55.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %56.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %56.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %56.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "dup v27.8b, %56.b[3] \n" // k03 - - "st1 {v29.4s}, [%6], #16 \n" - //########################################### // sum5 - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %56.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %56.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %56.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %56.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%7, #128] \n" - "ld1 {v29.4s}, [%7] \n" // sum6 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %56.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %56.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %56.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %56.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %56.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %56.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %56.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %56.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "dup v24.8b, %57.b[0] \n" // k00 - - "smlal v28.8h, v22.8b, v26.8b \n" - "dup v25.8b, %57.b[1] \n" // k01 - - "smlal v28.8h, v23.8b, v27.8b \n" - "dup v26.8b, %57.b[2] \n" // k02 - - "saddw v29.4s, v29.4s, v28.4h \n" - "saddw2 v30.4s, v30.4s, v28.8h \n" - - "dup v27.8b, %57.b[3] \n" // k03 - - "st1 {v29.4s}, [%7], #16 \n" - //########################################### // sum6 - "smull v28.8h, v8.8b, v24.8b \n" - "dup v24.8b, %57.b[4] \n" // k04 - - "smlal v28.8h, v9.8b, v25.8b \n" - "dup v25.8b, %57.b[5] \n" // k05 - - "smlal v28.8h, v10.8b, v26.8b \n" - "dup v26.8b, %57.b[6] \n" // k06 - - "smlal v28.8h, v11.8b, v27.8b \n" - "dup v27.8b, %57.b[7] \n" // k07 - - "smlal v28.8h, v12.8b, v24.8b \n" - "prfm pldl1keep, [%8, #128] \n" - "ld1 {v29.4s}, [%8] \n" // sum7 - - "smlal v28.8h, v13.8b, v25.8b \n" - "dup v24.8b, %57.b[8] \n" // k08 - - "smlal v28.8h, v14.8b, v26.8b \n" - "dup v25.8b, %57.b[9] \n" // k09 - - "smlal v28.8h, v15.8b, v27.8b \n" - "dup v26.8b, %57.b[10] \n" // k10 - - "smlal v28.8h, v16.8b, v24.8b \n" - "dup v27.8b, %57.b[11] \n" // k11 - - "smlal v28.8h, v17.8b, v25.8b \n" - "dup v24.8b, %57.b[12] \n" // k12 - - "smlal v28.8h, v18.8b, v26.8b \n" - "dup v25.8b, %57.b[13] \n" // k13 - - "smlal v28.8h, v19.8b, v27.8b \n" - "dup v26.8b, %57.b[14] \n" // k14 - - "smlal v28.8h, v20.8b, v24.8b \n" - "dup v27.8b, %57.b[15] \n" // k15 - - "smlal v28.8h, v21.8b, v25.8b \n" - "sub %9, %9, #4 \n" - - "smlal v28.8h, v22.8b, v26.8b \n" - "sub %10, %10, #4 \n" - "sub %11, %11, #4 \n" - "sub %12, %12, #4 \n" - - "smlal v28.8h, v23.8b, v27.8b \n" - "sub %13, %13, #4 \n" - "sub %14, %14, #4 \n" - "sub %15, %15, #4 \n" - "sub %16, %16, #4 \n" - - "saddw v29.4s, v29.4s, v28.4h \n" - "sub %17, %17, #4 \n" - "sub %18, %18, #4 \n" - "sub %19, %19, #4 \n" - "sub %20, %20, #4 \n" - - "st1 {v29.4s}, [%8], #16 \n" - //########################################### // sum7 - "sub %21, %21, #4 \n" - "sub %22, %22, #4 \n" - "sub %23, %23, #4 \n" - "sub %24, %24, #4 \n" - : "=r"(nn), // %0 - "=r"(outptr0),// %1 - "=r"(outptr1),// %2 - "=r"(outptr2),// %3 - "=r"(outptr3),// %4 - "=r"(outptr4),// %5 - "=r"(outptr5),// %6 - "=r"(outptr6),// %7 - "=r"(outptr7),// %8 - "=r"(r0), // %9 - "=r"(r1), // %10 - "=r"(r2), // %11 - "=r"(r3), // %12 - "=r"(r4), // %13 - "=r"(r5), // %14 - "=r"(r6), // %15 - "=r"(r7), // %16 - "=r"(r8), // %17 - "=r"(r9), // %18 - "=r"(r10), // %19 - "=r"(r11), // %20 - "=r"(r12), // %21 - "=r"(r13), // %22 - "=r"(r14), // %23 - "=r"(r15) // %24 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(outptr4), - "6"(outptr5), - "7"(outptr6), - "8"(outptr7), - "9"(r0), - "10"(r1), - "11"(r2), - "12"(r3), - "13"(r4), - "14"(r5), - "15"(r6), - "16"(r7), - "17"(r8), - "18"(r9), - "19"(r10), - "20"(r11), - "21"(r12), - "22"(r13), - "23"(r14), - "24"(r15), - "w"(_k0), // %50 - "w"(_k1), // %51 - "w"(_k2), // %52 - "w"(_k3), // %53 - "w"(_k4), // %54 - "w"(_k5), // %55 - "w"(_k6), // %56 - "w"(_k7) // %57 - : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31" - ); - } - - for (; remain>0; remain--) - { - // TODO neon optimize - int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7] + *r8 * kernel0[8] + *r9 * kernel0[9] + *r10 * kernel0[10] + *r11 * kernel0[11] + *r12 * kernel0[12] + *r13 * kernel0[13] + *r14 * kernel0[14] + *r15 * kernel0[15]; - int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7] + *r8 * kernel1[8] + *r9 * kernel1[9] + *r10 * kernel1[10] + *r11 * kernel1[11] + *r12 * kernel1[12] + *r13 * kernel1[13] + *r14 * kernel1[14] + *r15 * kernel1[15]; - int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7] + *r8 * kernel2[8] + *r9 * kernel2[9] + *r10 * kernel2[10] + *r11 * kernel2[11] + *r12 * kernel2[12] + *r13 * kernel2[13] + *r14 * kernel2[14] + *r15 * kernel2[15]; - int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7] + *r8 * kernel3[8] + *r9 * kernel3[9] + *r10 * kernel3[10] + *r11 * kernel3[11] + *r12 * kernel3[12] + *r13 * kernel3[13] + *r14 * kernel3[14] + *r15 * kernel3[15]; - int sum4 = (int)*r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3] + *r4 * kernel4[4] + *r5 * kernel4[5] + *r6 * kernel4[6] + *r7 * kernel4[7] + *r8 * kernel4[8] + *r9 * kernel4[9] + *r10 * kernel4[10] + *r11 * kernel4[11] + *r12 * kernel4[12] + *r13 * kernel4[13] + *r14 * kernel4[14] + *r15 * kernel4[15]; - int sum5 = (int)*r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3] + *r4 * kernel5[4] + *r5 * kernel5[5] + *r6 * kernel5[6] + *r7 * kernel5[7] + *r8 * kernel5[8] + *r9 * kernel5[9] + *r10 * kernel5[10] + *r11 * kernel5[11] + *r12 * kernel5[12] + *r13 * kernel5[13] + *r14 * kernel5[14] + *r15 * kernel5[15]; - int sum6 = (int)*r0 * kernel6[0] + *r1 * kernel6[1] + *r2 * kernel6[2] + *r3 * kernel6[3] + *r4 * kernel6[4] + *r5 * kernel6[5] + *r6 * kernel6[6] + *r7 * kernel6[7] + *r8 * kernel6[8] + *r9 * kernel6[9] + *r10 * kernel6[10] + *r11 * kernel6[11] + *r12 * kernel6[12] + *r13 * kernel6[13] + *r14 * kernel6[14] + *r15 * kernel6[15]; - int sum7 = (int)*r0 * kernel7[0] + *r1 * kernel7[1] + *r2 * kernel7[2] + *r3 * kernel7[3] + *r4 * kernel7[4] + *r5 * kernel7[5] + *r6 * kernel7[6] + *r7 * kernel7[7] + *r8 * kernel7[8] + *r9 * kernel7[9] + *r10 * kernel7[10] + *r11 * kernel7[11] + *r12 * kernel7[12] + *r13 * kernel7[13] + *r14 * kernel7[14] + *r15 * kernel7[15]; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; - *outptr4 += sum4; - *outptr5 += sum5; - *outptr6 += sum6; - *outptr7 += sum7; - - r0++; - r1++; - r2++; - r3++; - r4++; - r5++; - r6++; - r7++; - r8++; - r9++; - r10++; - r11++; - r12++; - r13++; - r14++; - r15++; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - outptr4++; - outptr5++; - outptr6++; - outptr7++; - } - } -#else // f**k the gcc limit the num of asm operand less than 30 - for (; q+7> 4; - int remain = size & 15; - - asm volatile( - "ld1 {v0.16b}, [%0] \n" - "ld1 {v1.16b}, [%1] \n" - "ld1 {v2.16b}, [%2] \n" - "ld1 {v3.16b}, [%3] \n" - "ld1 {v4.16b}, [%4] \n" - "ld1 {v5.16b}, [%5] \n" - "ld1 {v6.16b}, [%6] \n" - "ld1 {v7.16b}, [%7] \n" - : - : "r"(kernel0), - "r"(kernel1), - "r"(kernel2), - "r"(kernel3), - "r"(kernel4), - "r"(kernel5), - "r"(kernel6), - "r"(kernel7) - : "cc", "memory" - ); - - if (nn > 0) - { - asm volatile( - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - "prfm pldl1keep, [%20, #128] \n" - "prfm pldl1keep, [%21, #128] \n" - "prfm pldl1keep, [%22, #128] \n" - "prfm pldl1keep, [%23, #128] \n" - "prfm pldl1keep, [%24, #128] \n" - "prfm pldl1keep, [%25, #128] \n" - "ld1 {v8.16b}, [%18], #16 \n" // r0" - "ld1 {v9.16b}, [%19], #16 \n" // r1" - "ld1 {v10.16b}, [%20], #16 \n" // r2" - "ld1 {v11.16b}, [%21], #16 \n" // r3" - "ld1 {v12.16b}, [%22], #16 \n" // r4" - "ld1 {v13.16b}, [%23], #16 \n" // r5" - "ld1 {v14.16b}, [%24], #16 \n" // r6" - "ld1 {v15.16b}, [%25], #16 \n" // r7" - - "0: \n" - - "dup v16.16b, v0.16b[0] \n" // k00 - "dup v17.16b, v0.16b[1] \n" // k01 - "dup v18.16b, v0.16b[2] \n" // k02 - "dup v19.16b, v0.16b[3] \n" // k03 - "dup v20.16b, v0.16b[4] \n" // k04 - "dup v21.16b, v0.16b[5] \n" // k05 - "dup v22.16b, v0.16b[6] \n" // k06 - "dup v23.16b, v0.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v1.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v1.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v1.16b[2] \n" // k02 - - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v26.4s, v27.4s}, [%1] \n" // sum0 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v1.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v1.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v1.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - - "st1 {v26.4s, v27.4s}, [%1], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%1] \n" // sum0n - "dup v22.16b, v1.16b[6] \n" // k06 - "dup v23.16b, v1.16b[7] \n" // k07 - - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%1], #32 \n" - //########################################### - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v2.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v2.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v2.16b[2] \n" // k02 - - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v26.4s, v27.4s}, [%2] \n" // sum1 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v2.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v2.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v2.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - - "st1 {v26.4s, v27.4s}, [%2], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%2] \n" // sum1n - "dup v22.16b, v2.16b[6] \n" // k06 - "dup v23.16b, v2.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - - "st1 {v28.4s, v29.4s}, [%2], #32 \n" - //########################################### - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v3.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v3.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v3.16b[2] \n" // k02 - - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v26.4s, v27.4s}, [%3] \n" // sum2 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v3.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v3.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v3.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%3], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%3] \n" // sum2n - "dup v22.16b, v3.16b[6] \n" // k06 - "dup v23.16b, v3.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%3], #32 \n" - //########################################## - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v4.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v4.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v4.16b[2] \n" // k02 - - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v26.4s, v27.4s}, [%4] \n" // sum3 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v4.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v4.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v4.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%4], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%4] \n" // sum3n - "dup v22.16b, v4.16b[6] \n" // k06 - "dup v23.16b, v4.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - - "st1 {v28.4s, v29.4s}, [%4], #32 \n" - //########################################## - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v5.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v5.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v5.16b[2] \n" // k02 - - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v26.4s, v27.4s}, [%5] \n" // sum4 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v5.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v5.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v5.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%5], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%5] \n" // sum4n - "dup v22.16b, v5.16b[6] \n" // k06 - "dup v23.16b, v5.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%5], #32 \n" - //########################################## - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v6.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v6.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v6.16b[2] \n" // k02 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v6.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v6.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v6.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v26.4s, v27.4s}, [%6] \n" // sum5 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%6], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%6] \n" // sum5n - "dup v22.16b, v6.16b[6] \n" // k06 - "dup v23.16b, v6.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%6], #32 \n" - //########################################## - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "dup v16.16b, v7.16b[0] \n" // k00 - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "dup v17.16b, v7.16b[1] \n" // k01 - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - "dup v18.16b, v7.16b[2] \n" // k02 - - "prfm pldl1keep, [%7, #128] \n" - "ld1 {v26.4s, v27.4s}, [%7] \n" // sum6 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "dup v19.16b, v7.16b[3] \n" // k03 - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "dup v20.16b, v7.16b[4] \n" // k04 - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "dup v21.16b, v7.16b[5] \n" // k05 - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%7], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%7] \n" // sum6n - "dup v22.16b, v7.16b[6] \n" // k06 - "dup v23.16b, v7.16b[7] \n" // k07 - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%7], #32 \n" - //########################################## - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smull2 v25.8h, v8.16b, v16.16b \n" - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal2 v25.8h, v9.16b, v17.16b \n" - "prfm pldl1keep, [%20, #128] \n" - "prfm pldl1keep, [%21, #128] \n" - - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal2 v25.8h, v10.16b, v18.16b \n" - "prfm pldl1keep, [%22, #128] \n" - "prfm pldl1keep, [%23, #128] \n" - - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal2 v25.8h, v11.16b, v19.16b \n" - - "prfm pldl1keep, [%8, #128] \n" - "ld1 {v26.4s, v27.4s}, [%8] \n" // sum7 - - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal2 v25.8h, v12.16b, v20.16b \n" - "prfm pldl1keep, [%24, #128] \n" - "prfm pldl1keep, [%25, #128] \n" - - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal2 v25.8h, v13.16b, v21.16b \n" - "ld1 {v8.16b}, [%18], #16 \n" // r0" - "ld1 {v9.16b}, [%19], #16 \n" // r1" - - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal2 v25.8h, v14.16b, v22.16b \n" - "ld1 {v10.16b}, [%20], #16 \n" // r2" - "ld1 {v11.16b}, [%21], #16 \n" // r3" - - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - "smlal2 v25.8h, v15.16b, v23.16b \n" - "ld1 {v12.16b}, [%22], #16 \n" // r4" - "ld1 {v13.16b}, [%23], #16 \n" // r5" - - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%8], #32 \n" - - "ld1 {v28.4s, v29.4s}, [%8] \n" // sum7n - "ld1 {v14.16b}, [%24], #16 \n" // r6" - "ld1 {v15.16b}, [%25], #16 \n" // r7" - "saddw v28.4s, v28.4s, v25.4h \n" - "saddw2 v29.4s, v29.4s, v25.8h \n" - "st1 {v28.4s, v29.4s}, [%8], #32 \n" - "subs %w0, %w0, #1 \n" - "bne 0b \n" - "sub %18, %18, #16 \n" - "sub %19, %19, #16 \n" - "sub %20, %20, #16 \n" - "sub %21, %21, #16 \n" - "sub %22, %22, #16 \n" - "sub %23, %23, #16 \n" - "sub %24, %24, #16 \n" - "sub %25, %25, #16 \n" - //########################################## - : "=r"(nn), // %0 - "=r"(outptr0),// %1 - "=r"(outptr1),// %2 - "=r"(outptr2),// %3 - "=r"(outptr3),// %4 - "=r"(outptr4),// %5 - "=r"(outptr5),// %6 - "=r"(outptr6),// %7 - "=r"(outptr7) // %8 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(outptr4), - "6"(outptr5), - "7"(outptr6), - "8"(outptr7), - "r"(r0), // %18 - "r"(r1), // %19 - "r"(r2), // %20 - "r"(r3), // %21 - "r"(r4), // %22 - "r"(r5), // %23 - "r"(r6), // %24 - "r"(r7) // %25 - : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" - ); - } - - if (remain == 8) - { - remain -= 8; - - asm volatile( - "prfm pldl1keep, [%18, #128] \n" - "prfm pldl1keep, [%19, #128] \n" - "prfm pldl1keep, [%20, #128] \n" - "prfm pldl1keep, [%21, #128] \n" - "prfm pldl1keep, [%22, #128] \n" - "prfm pldl1keep, [%23, #128] \n" - "prfm pldl1keep, [%24, #128] \n" - "prfm pldl1keep, [%25, #128] \n" - "ld1 {v8.8b}, [%18], #8 \n" // r0" - "ld1 {v9.8b}, [%19], #8 \n" // r1" - "ld1 {v10.8b}, [%20], #8 \n" // r2" - "ld1 {v11.8b}, [%21], #8 \n" // r3" - "ld1 {v12.8b}, [%22], #8 \n" // r4" - "ld1 {v13.8b}, [%23], #8 \n" // r5" - "ld1 {v14.8b}, [%24], #8 \n" // r6" - "ld1 {v15.8b}, [%25], #8 \n" // r7" - - "dup v16.8b, v0.16b[0] \n" // k00 - "dup v17.8b, v0.16b[1] \n" // k01 - "dup v18.8b, v0.16b[2] \n" // k02 - "dup v19.8b, v0.16b[3] \n" // k03 - "dup v20.8b, v0.16b[4] \n" // k04 - "dup v21.8b, v0.16b[5] \n" // k05 - "dup v22.8b, v0.16b[6] \n" // k06 - "dup v23.8b, v0.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v26.4s, v27.4s}, [%1] \n" // sum0 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%1], #32 \n" - //########################################### - "dup v16.8b, v1.16b[0] \n" // k00 - "dup v17.8b, v1.16b[1] \n" // k01 - "dup v18.8b, v1.16b[2] \n" // k02 - "dup v19.8b, v1.16b[3] \n" // k03 - "dup v20.8b, v1.16b[4] \n" // k04 - "dup v21.8b, v1.16b[5] \n" // k05 - "dup v22.8b, v1.16b[6] \n" // k06 - "dup v23.8b, v1.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v26.4s, v27.4s}, [%2] \n" // sum1 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%2], #32 \n" - //########################################### - "dup v16.8b, v2.16b[0] \n" // k00 - "dup v17.8b, v2.16b[1] \n" // k01 - "dup v18.8b, v2.16b[2] \n" // k02 - "dup v19.8b, v2.16b[3] \n" // k03 - "dup v20.8b, v2.16b[4] \n" // k04 - "dup v21.8b, v2.16b[5] \n" // k05 - "dup v22.8b, v2.16b[6] \n" // k06 - "dup v23.8b, v2.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v26.4s, v27.4s}, [%3] \n" // sum2 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%3], #32 \n" - //########################################## - "dup v16.8b, v3.16b[0] \n" // k00 - "dup v17.8b, v3.16b[1] \n" // k01 - "dup v18.8b, v3.16b[2] \n" // k02 - "dup v19.8b, v3.16b[3] \n" // k03 - "dup v20.8b, v3.16b[4] \n" // k04 - "dup v21.8b, v3.16b[5] \n" // k05 - "dup v22.8b, v3.16b[6] \n" // k06 - "dup v23.8b, v3.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v26.4s, v27.4s}, [%4] \n" // sum3 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%4], #32 \n" - //########################################## - "dup v16.8b, v4.16b[0] \n" // k00 - "dup v17.8b, v4.16b[1] \n" // k01 - "dup v18.8b, v4.16b[2] \n" // k02 - "dup v19.8b, v4.16b[3] \n" // k03 - "dup v20.8b, v4.16b[4] \n" // k04 - "dup v21.8b, v4.16b[5] \n" // k05 - "dup v22.8b, v4.16b[6] \n" // k06 - "dup v23.8b, v4.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v26.4s, v27.4s}, [%5] \n" // sum4 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%5], #32 \n" - //########################################## - "dup v16.8b, v5.16b[0] \n" // k00 - "dup v17.8b, v5.16b[1] \n" // k01 - "dup v18.8b, v5.16b[2] \n" // k02 - "dup v19.8b, v5.16b[3] \n" // k03 - "dup v20.8b, v5.16b[4] \n" // k04 - "dup v21.8b, v5.16b[5] \n" // k05 - "dup v22.8b, v5.16b[6] \n" // k06 - "dup v23.8b, v5.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v26.4s, v27.4s}, [%6] \n" // sum5 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%6], #32 \n" - //########################################## - "dup v16.8b, v6.16b[0] \n" // k00 - "dup v17.8b, v6.16b[1] \n" // k01 - "dup v18.8b, v6.16b[2] \n" // k02 - "dup v19.8b, v6.16b[3] \n" // k03 - "dup v20.8b, v6.16b[4] \n" // k04 - "dup v21.8b, v6.16b[5] \n" // k05 - "dup v22.8b, v6.16b[6] \n" // k06 - "dup v23.8b, v6.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%7, #128] \n" - "ld1 {v26.4s, v27.4s}, [%7] \n" // sum6 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%7], #32 \n" - //########################################## - "dup v16.8b, v7.16b[0] \n" // k00 - "dup v17.8b, v7.16b[1] \n" // k01 - "dup v18.8b, v7.16b[2] \n" // k02 - "dup v19.8b, v7.16b[3] \n" // k03 - "dup v20.8b, v7.16b[4] \n" // k04 - "dup v21.8b, v7.16b[5] \n" // k05 - "dup v22.8b, v7.16b[6] \n" // k06 - "dup v23.8b, v7.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%8, #128] \n" - "ld1 {v26.4s, v27.4s}, [%8] \n" // sum7 - "saddw v26.4s, v26.4s, v24.4h \n" - "saddw2 v27.4s, v27.4s, v24.8h \n" - "st1 {v26.4s, v27.4s}, [%8], #32 \n" - //########################################## - : "=r"(nn), // %0 - "=r"(outptr0),// %1 - "=r"(outptr1),// %2 - "=r"(outptr2),// %3 - "=r"(outptr3),// %4 - "=r"(outptr4),// %5 - "=r"(outptr5),// %6 - "=r"(outptr6),// %7 - "=r"(outptr7) // %8 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(outptr4), - "6"(outptr5), - "7"(outptr6), - "8"(outptr7), - "r"(r0), // %18 - "r"(r1), // %19 - "r"(r2), // %20 - "r"(r3), // %21 - "r"(r4), // %22 - "r"(r5), // %23 - "r"(r6), // %24 - "r"(r7) // %25 - : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" - ); - } - - if (remain == 4) - { - remain -= 4; - - asm volatile( - "ld1 {v8.8b}, [%18], #8 \n" // r0" - "ld1 {v9.8b}, [%19], #8 \n" // r1" - "ld1 {v10.8b}, [%20], #8 \n" // r2" - "ld1 {v11.8b}, [%21], #8 \n" // r3" - "ld1 {v12.8b}, [%22], #8 \n" // r4" - "ld1 {v13.8b}, [%23], #8 \n" // r5" - "ld1 {v14.8b}, [%24], #8 \n" // r6" - "ld1 {v15.8b}, [%25], #8 \n" // r7" - - "dup v16.8b, v0.16b[0] \n" // k00 - "dup v17.8b, v0.16b[1] \n" // k01 - "dup v18.8b, v0.16b[2] \n" // k02 - "dup v19.8b, v0.16b[3] \n" // k03 - "dup v20.8b, v0.16b[4] \n" // k04 - "dup v21.8b, v0.16b[5] \n" // k05 - "dup v22.8b, v0.16b[6] \n" // k06 - "dup v23.8b, v0.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%1, #128] \n" - "ld1 {v26.4s}, [%1] \n" // sum0 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%1], #16 \n" - //########################################### - "dup v16.8b, v1.16b[0] \n" // k00 - "dup v17.8b, v1.16b[1] \n" // k01 - "dup v18.8b, v1.16b[2] \n" // k02 - "dup v19.8b, v1.16b[3] \n" // k03 - "dup v20.8b, v1.16b[4] \n" // k04 - "dup v21.8b, v1.16b[5] \n" // k05 - "dup v22.8b, v1.16b[6] \n" // k06 - "dup v23.8b, v1.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%2, #128] \n" - "ld1 {v26.4s}, [%2] \n" // sum1 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%2], #16 \n" - //########################################### - "dup v16.8b, v2.16b[0] \n" // k00 - "dup v17.8b, v2.16b[1] \n" // k01 - "dup v18.8b, v2.16b[2] \n" // k02 - "dup v19.8b, v2.16b[3] \n" // k03 - "dup v20.8b, v2.16b[4] \n" // k04 - "dup v21.8b, v2.16b[5] \n" // k05 - "dup v22.8b, v2.16b[6] \n" // k06 - "dup v23.8b, v2.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%3, #128] \n" - "ld1 {v26.4s}, [%3] \n" // sum2 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%3], #16 \n" - //########################################## - "dup v16.8b, v3.16b[0] \n" // k00 - "dup v17.8b, v3.16b[1] \n" // k01 - "dup v18.8b, v3.16b[2] \n" // k02 - "dup v19.8b, v3.16b[3] \n" // k03 - "dup v20.8b, v3.16b[4] \n" // k04 - "dup v21.8b, v3.16b[5] \n" // k05 - "dup v22.8b, v3.16b[6] \n" // k06 - "dup v23.8b, v3.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v26.4s}, [%4] \n" // sum3 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%4], #16 \n" - //########################################## - "dup v16.8b, v4.16b[0] \n" // k00 - "dup v17.8b, v4.16b[1] \n" // k01 - "dup v18.8b, v4.16b[2] \n" // k02 - "dup v19.8b, v4.16b[3] \n" // k03 - "dup v20.8b, v4.16b[4] \n" // k04 - "dup v21.8b, v4.16b[5] \n" // k05 - "dup v22.8b, v4.16b[6] \n" // k06 - "dup v23.8b, v4.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%5, #128] \n" - "ld1 {v26.4s}, [%5] \n" // sum4 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%5], #16 \n" - //########################################## - "dup v16.8b, v5.16b[0] \n" // k00 - "dup v17.8b, v5.16b[1] \n" // k01 - "dup v18.8b, v5.16b[2] \n" // k02 - "dup v19.8b, v5.16b[3] \n" // k03 - "dup v20.8b, v5.16b[4] \n" // k04 - "dup v21.8b, v5.16b[5] \n" // k05 - "dup v22.8b, v5.16b[6] \n" // k06 - "dup v23.8b, v5.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%6, #128] \n" - "ld1 {v26.4s}, [%6] \n" // sum5 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%6], #16 \n" - //########################################## - "dup v16.8b, v6.16b[0] \n" // k00 - "dup v17.8b, v6.16b[1] \n" // k01 - "dup v18.8b, v6.16b[2] \n" // k02 - "dup v19.8b, v6.16b[3] \n" // k03 - "dup v20.8b, v6.16b[4] \n" // k04 - "dup v21.8b, v6.16b[5] \n" // k05 - "dup v22.8b, v6.16b[6] \n" // k06 - "dup v23.8b, v6.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%7, #128] \n" - "ld1 {v26.4s}, [%7] \n" // sum6 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%7], #16 \n" - //########################################## - "dup v16.8b, v7.16b[0] \n" // k00 - "dup v17.8b, v7.16b[1] \n" // k01 - "dup v18.8b, v7.16b[2] \n" // k02 - "dup v19.8b, v7.16b[3] \n" // k03 - "dup v20.8b, v7.16b[4] \n" // k04 - "dup v21.8b, v7.16b[5] \n" // k05 - "dup v22.8b, v7.16b[6] \n" // k06 - "dup v23.8b, v7.16b[7] \n" // k07 - - "smull v24.8h, v8.8b, v16.8b \n" // r0 * k0 - "smlal v24.8h, v9.8b, v17.8b \n" // r0 * k1 - "smlal v24.8h, v10.8b, v18.8b \n" // r0 * k2 - "smlal v24.8h, v11.8b, v19.8b \n" // r0 * k3 - "smlal v24.8h, v12.8b, v20.8b \n" // r0 * k4 - "smlal v24.8h, v13.8b, v21.8b \n" // r0 * k5 - "smlal v24.8h, v14.8b, v22.8b \n" // r0 * k6 - "smlal v24.8h, v15.8b, v23.8b \n" // r0 * k7 - - "prfm pldl1keep, [%8, #128] \n" - "ld1 {v26.4s}, [%8] \n" // sum7 - "saddw v26.4s, v26.4s, v24.4h \n" - "st1 {v26.4s}, [%8], #16 \n" - "sub %18, %18, #4 \n" - "sub %19, %19, #4 \n" - "sub %20, %20, #4 \n" - "sub %21, %21, #4 \n" - "sub %22, %22, #4 \n" - "sub %23, %23, #4 \n" - "sub %24, %24, #4 \n" - "sub %25, %25, #4 \n" - //########################################## - : "=r"(nn), // %0 - "=r"(outptr0),// %1 - "=r"(outptr1),// %2 - "=r"(outptr2),// %3 - "=r"(outptr3),// %4 - "=r"(outptr4),// %5 - "=r"(outptr5),// %6 - "=r"(outptr6),// %7 - "=r"(outptr7) // %8 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(outptr4), - "6"(outptr5), - "7"(outptr6), - "8"(outptr7), - "r"(r0), // %18 - "r"(r1), // %19 - "r"(r2), // %20 - "r"(r3), // %21 - "r"(r4), // %22 - "r"(r5), // %23 - "r"(r6), // %24 - "r"(r7) // %25 - : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29" - ); - } - - for (; remain>0; remain--) - { - // TODO neon optimize - int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7]; - int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7]; - int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7]; - int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7]; - int sum4 = (int)*r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3] + *r4 * kernel4[4] + *r5 * kernel4[5] + *r6 * kernel4[6] + *r7 * kernel4[7]; - int sum5 = (int)*r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3] + *r4 * kernel5[4] + *r5 * kernel5[5] + *r6 * kernel5[6] + *r7 * kernel5[7]; - int sum6 = (int)*r0 * kernel6[0] + *r1 * kernel6[1] + *r2 * kernel6[2] + *r3 * kernel6[3] + *r4 * kernel6[4] + *r5 * kernel6[5] + *r6 * kernel6[6] + *r7 * kernel6[7]; - int sum7 = (int)*r0 * kernel7[0] + *r1 * kernel7[1] + *r2 * kernel7[2] + *r3 * kernel7[3] + *r4 * kernel7[4] + *r5 * kernel7[5] + *r6 * kernel7[6] + *r7 * kernel7[7]; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; - *outptr4 += sum4; - *outptr5 += sum5; - *outptr6 += sum6; - *outptr7 += sum7; - - r0++; - r1++; - r2++; - r3++; - r4++; - r5++; - r6++; - r7++; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - outptr4++; - outptr5++; - outptr6++; - outptr7++; - } - } -#endif - - for (; q> 3; - int remain = size & 7; - - int8x8_t _k0 = vdup_n_s8(k0); - int8x8_t _k1 = vdup_n_s8(k1); - int8x8_t _k2 = vdup_n_s8(k2); - int8x8_t _k3 = vdup_n_s8(k3); - int8x8_t _k4 = vdup_n_s8(k4); - int8x8_t _k5 = vdup_n_s8(k5); - int8x8_t _k6 = vdup_n_s8(k6); - int8x8_t _k7 = vdup_n_s8(k7); - - for (; nn>0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - - int32x4_t _out0 = vld1q_s32(outptr0); - int32x4_t _out0n = vld1q_s32(outptr0+4); - int32x4_t _out1 = vld1q_s32(outptr1); - int32x4_t _out1n = vld1q_s32(outptr1+4); - int32x4_t _out2 = vld1q_s32(outptr2); - int32x4_t _out2n = vld1q_s32(outptr2+4); - int32x4_t _out3 = vld1q_s32(outptr3); - int32x4_t _out3n = vld1q_s32(outptr3+4); - int32x4_t _out4 = vld1q_s32(outptr4); - int32x4_t _out4n = vld1q_s32(outptr4+4); - int32x4_t _out5 = vld1q_s32(outptr5); - int32x4_t _out5n = vld1q_s32(outptr5+4); - int32x4_t _out6 = vld1q_s32(outptr6); - int32x4_t _out6n = vld1q_s32(outptr6+4); - int32x4_t _out7 = vld1q_s32(outptr7); - int32x4_t _out7n = vld1q_s32(outptr7+4); - - int16x8_t _out0_s16 = vmull_s8(_r0, _k0); - int16x8_t _out1_s16 = vmull_s8(_r0, _k1); - int16x8_t _out2_s16 = vmull_s8(_r0, _k2); - int16x8_t _out3_s16 = vmull_s8(_r0, _k3); - int16x8_t _out4_s16 = vmull_s8(_r0, _k4); - int16x8_t _out5_s16 = vmull_s8(_r0, _k5); - int16x8_t _out6_s16 = vmull_s8(_r0, _k6); - int16x8_t _out7_s16 = vmull_s8(_r0, _k7); - - _out0 = vaddw_s16(_out0, vget_low_s16(_out0_s16)); - _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16)); - _out1 = vaddw_s16(_out1, vget_low_s16(_out1_s16)); - _out1n = vaddw_s16(_out1n, vget_high_s16(_out1_s16)); - _out2 = vaddw_s16(_out2, vget_low_s16(_out2_s16)); - _out2n = vaddw_s16(_out2n, vget_high_s16(_out2_s16)); - _out3 = vaddw_s16(_out3, vget_low_s16(_out3_s16)); - _out3n = vaddw_s16(_out3n, vget_high_s16(_out3_s16)); - _out4 = vaddw_s16(_out4, vget_low_s16(_out4_s16)); - _out4n = vaddw_s16(_out4n, vget_high_s16(_out4_s16)); - _out5 = vaddw_s16(_out5, vget_low_s16(_out5_s16)); - _out5n = vaddw_s16(_out5n, vget_high_s16(_out5_s16)); - _out6 = vaddw_s16(_out6, vget_low_s16(_out6_s16)); - _out6n = vaddw_s16(_out6n, vget_high_s16(_out6_s16)); - _out7 = vaddw_s16(_out7, vget_low_s16(_out7_s16)); - _out7n = vaddw_s16(_out7n, vget_high_s16(_out7_s16)); - - vst1q_s32(outptr0, _out0); - vst1q_s32(outptr0+4, _out0n); - vst1q_s32(outptr1, _out1); - vst1q_s32(outptr1+4, _out1n); - vst1q_s32(outptr2, _out2); - vst1q_s32(outptr2+4, _out2n); - vst1q_s32(outptr3, _out3); - vst1q_s32(outptr3+4, _out3n); - vst1q_s32(outptr4, _out4); - vst1q_s32(outptr4+4, _out4n); - vst1q_s32(outptr5, _out5); - vst1q_s32(outptr5+4, _out5n); - vst1q_s32(outptr6, _out6); - vst1q_s32(outptr6+4, _out6n); - vst1q_s32(outptr7, _out7); - vst1q_s32(outptr7+4, _out7n); - - r0 += 8; - outptr0 += 8; - outptr1 += 8; - outptr2 += 8; - outptr3 += 8; - outptr4 += 8; - outptr5 += 8; - outptr6 += 8; - outptr7 += 8; - } - - for (; remain>0; remain--) - { - // TODO neon optimize - int sum0 = (int)*r0 * k0; - int sum1 = (int)*r0 * k1; - int sum2 = (int)*r0 * k2; - int sum3 = (int)*r0 * k3; - int sum4 = (int)*r0 * k4; - int sum5 = (int)*r0 * k5; - int sum6 = (int)*r0 * k6; - int sum7 = (int)*r0 * k7; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; - *outptr4 += sum4; - *outptr5 += sum5; - *outptr6 += sum6; - *outptr7 += sum7; - - r0++; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - outptr4++; - outptr5++; - outptr6++; - outptr7++; - } - } - } + remain_outch_start += nn_outch << 2; #pragma omp parallel for num_threads(opt.num_threads) for (int p=remain_outch_start; p> 3; - int remain = size & 7; - - int8x8_t _k0 = vdup_n_s8(k0); - int8x8_t _k1 = vdup_n_s8(k1); - int8x8_t _k2 = vdup_n_s8(k2); - int8x8_t _k3 = vdup_n_s8(k3); - int8x8_t _k4 = vdup_n_s8(k4); - int8x8_t _k5 = vdup_n_s8(k5); - int8x8_t _k6 = vdup_n_s8(k6); - int8x8_t _k7 = vdup_n_s8(k7); - - for (; nn>0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - int8x8_t _r1 = vld1_s8(r1); - int8x8_t _r2 = vld1_s8(r2); - int8x8_t _r3 = vld1_s8(r3); - int8x8_t _r4 = vld1_s8(r4); - int8x8_t _r5 = vld1_s8(r5); - int8x8_t _r6 = vld1_s8(r6); - int8x8_t _r7 = vld1_s8(r7); - - int32x4_t _out0 = vld1q_s32(outptr); - int32x4_t _out0n = vld1q_s32(outptr+4); - - int16x8_t _out0_s16 = vmull_s8(_r0, _k0); - _out0_s16 = vmlal_s8(_out0_s16, _r1, _k1); - _out0_s16 = vmlal_s8(_out0_s16, _r2, _k2); - _out0_s16 = vmlal_s8(_out0_s16, _r3, _k3); - _out0_s16 = vmlal_s8(_out0_s16, _r4, _k4); - _out0_s16 = vmlal_s8(_out0_s16, _r5, _k5); - _out0_s16 = vmlal_s8(_out0_s16, _r6, _k6); - _out0_s16 = vmlal_s8(_out0_s16, _r7, _k7); - - _out0 = vaddw_s16(_out0, vget_low_s16(_out0_s16)); - _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16)); - - vst1q_s32(outptr, _out0); - vst1q_s32(outptr+4, _out0n); - - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - r4 += 8; - r5 += 8; - r6 += 8; - r7 += 8; - outptr += 8; - } + const int bias0 = 0; - for (; remain>0; remain--) - { - int sum = (int)*r0 * k0; - int sum1 = (int)*r1 * k1; - int sum2 = (int)*r2 * k2; - int sum3 = (int)*r3 * k3; - int sum4 = (int)*r4 * k4; - int sum5 = (int)*r5 * k5; - int sum6 = (int)*r6 * k6; - int sum7 = (int)*r7 * k7; - - *outptr += sum + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7; - - r0++; - r1++; - r2++; - r3++; - r4++; - r5++; - r6++; - r7++; - outptr++; - } + int* outptr0 = out0; - } + int i = 0; - for (; q> 3; - int remain = size & 7; - - int8x8_t _k0 = vdup_n_s8(k0); - - for (; nn>0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - - int32x4_t _out0 = vld1q_s32(outptr); - int32x4_t _out0n = vld1q_s32(outptr+4); - - int16x8_t _out0_s16 = vmull_s8(_r0, _k0); - - _out0 = vaddw_s16(_out0, vget_low_s16(_out0_s16)); - _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16)); - - vst1q_s32(outptr, _out0); - vst1q_s32(outptr+4, _out0n); - - r0 += 8; - outptr += 8; - } - - for (; remain>0; remain--) - { - int sum = (int)*r0 * k0; + const signed char* tmpptr = tmp.channel(i/8); + const signed char* kptr = kernel.channel(p/4 + p%4); +#if 0 //__ARM_NEON + asm volatile( + // inch loop + "vmov.s32 q6, #0 \n" + "vmov.s32 q7, #0 \n" - *outptr += sum; + "lsr r4, %6, #2 \n"// r4 = nn = inch >> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%2, #128] \n" + "vld1.s8 {d4-d7}, [%1]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q5, d7 \n"// a30-a37 + "vmovl.s8 q4, d6 \n"// a20-a27 + "vmovl.s8 q3, d5 \n"// a10-a17 + "vmovl.s8 q2, d4 \n"// a00-a07 - r0++; - outptr++; - } - } - } -} -#else // __aarch64__ -/* - * Convolution 1x1 quantized with int8,unroll 8 x 4 - */ -static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) -{ - int inch = bottom_blob.c; + "vld1.s8 {d0}, [%2] \n"// kptr k00,k01,k02,k03 k(outch)(inch) + "vmovl.s8 q0, d0 \n"// k00,k01,k02,k03 + "add %2, #4 \n" - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; + "vmlal.s16 q6, d4, d0[0] \n"// (a00-a07) * k00 + "vmlal.s16 q7, d5, d0[0] \n" + "vmlal.s16 q6, d6, d0[1] \n"// (a10-a17) * k01 + "vmlal.s16 q7, d7, d0[1] \n" + "vmlal.s16 q6, d8, d0[2] \n"// (a20-a27) * k02 + "vmlal.s16 q7, d9, d0[2] \n" + "vmlal.s16 q6, d10, d0[3] \n"// (a30-a37) * k03 + "vmlal.s16 q7, d11, d0[3] \n" - const signed char* kernel = _kernel; + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %6, #3 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" - int nn_outch = outch >> 2; - int remain_outch_start = nn_outch << 2; + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%1]! \n"// tmpr a00-a07 a(inch)(data) + "vld1.s8 {d0}, [%2] \n"// kptr k00 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %2, #1 \n" - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp> 3; - int remain = size & 7; - - if (nn > 0) + for (int q=0; q= 4) - { - remain -= 4; - - asm volatile( - "0: \n" - //ld r0-r7 - "pld [%5, #64] \n" - "vld1.s8 {d0}, [%5 :64] \n" //r0 - - "pld [%6, #64] \n" - "vld1.s8 {d1}, [%6 :64] \n" //r1 - - "pld [%7, #64] \n" - "vld1.s8 {d2}, [%7 :64] \n" //r2 - - "pld [%8, #64] \n" - "vld1.s8 {d3}, [%8 :64] \n" //r3 - - "pld [%9, #64] \n" - "vld1.s8 {d4}, [%9 :64] \n" //r4 - - "pld [%10, #64] \n" - "vld1.s8 {d5}, [%10 :64] \n" //r5 - - "pld [%11, #64] \n" - "vld1.s8 {d6}, [%11 :64] \n" //r6 - - "pld [%12, #64] \n" - "vld1.s8 {d7}, [%12 :64] \n" //r7 - - "add %5, #4 \n" - "add %6, #4 \n" - "add %7, #4 \n" - "add %8, #4 \n" - "add %9, #4 \n" - "add %10, #4 \n" - "add %11, #4 \n" - "add %12, #4 \n" - //########################################### - //load inch kernel_0 k0-k7 - "vdup.s8 d8, d18[0] \n" - "vdup.s8 d9, d18[1] \n" - "vdup.s8 d10, d18[2] \n" - "vdup.s8 d11, d18[3] \n" - "vdup.s8 d12, d18[4] \n" - "vdup.s8 d13, d18[5] \n" - "vdup.s8 d14, d18[6] \n" - "vdup.s8 d15, d18[7] \n" - - //mla - "vmull.s8 q8, d0, d8 \n" - "vmlal.s8 q8, d1, d9 \n" - "vmlal.s8 q8, d2, d10 \n" - "vmlal.s8 q8, d3, d11 \n" - "vmlal.s8 q8, d4, d12 \n" - "vmlal.s8 q8, d5, d13 \n" - "vmlal.s8 q8, d6, d14 \n" - "vmlal.s8 q8, d7, d15 \n" - - //outptr0_s32 - "pld [%1, #128] \n" - "vld1.32 {d20-d21}, [%1:128] \n" //outptr0_s32 - "vaddw.s16 q10, q10, d16 \n" - "vst1.32 {d20-d21}, [%1:128]!\n" - //########################################### - //load inch kernel_1 k0-k7 - "vdup.s8 d8, d19[0] \n" - "vdup.s8 d9, d19[1] \n" - "vdup.s8 d10, d19[2] \n" - "vdup.s8 d11, d19[3] \n" - "vdup.s8 d12, d19[4] \n" - "vdup.s8 d13, d19[5] \n" - "vdup.s8 d14, d19[6] \n" - "vdup.s8 d15, d19[7] \n" - - //mla - "vmull.s8 q8, d0, d8 \n" - "vmlal.s8 q8, d1, d9 \n" - "vmlal.s8 q8, d2, d10 \n" - "vmlal.s8 q8, d3, d11 \n" - "vmlal.s8 q8, d4, d12 \n" - "vmlal.s8 q8, d5, d13 \n" - "vmlal.s8 q8, d6, d14 \n" - "vmlal.s8 q8, d7, d15 \n" - - //outptr1_s32 - "pld [%2, #128] \n" - "vld1.32 {d20-d21}, [%2:128] \n" //outptr1_s32 - "vaddw.s16 q10, q10, d16 \n" - "vst1.32 {d20-d21}, [%2:128]!\n" - //############################################ - //load inch kernel_2 k0-k7 - "vdup.s8 d8, d24[0] \n" - "vdup.s8 d9, d24[1] \n" - "vdup.s8 d10, d24[2] \n" - "vdup.s8 d11, d24[3] \n" - "vdup.s8 d12, d24[4] \n" - "vdup.s8 d13, d24[5] \n" - "vdup.s8 d14, d24[6] \n" - "vdup.s8 d15, d24[7] \n" - - //mla - "vmull.s8 q8, d0, d8 \n" - "vmlal.s8 q8, d1, d9 \n" - "vmlal.s8 q8, d2, d10 \n" - "vmlal.s8 q8, d3, d11 \n" - "vmlal.s8 q8, d4, d12 \n" - "vmlal.s8 q8, d5, d13 \n" - "vmlal.s8 q8, d6, d14 \n" - "vmlal.s8 q8, d7, d15 \n" - - //outptr2_s32 - "pld [%3, #256] \n" - "vld1.32 {d20-d21}, [%3:128] \n" //outptr2_s32 - "vaddw.s16 q10, q10, d16 \n" - "vst1.32 {d20-d21}, [%3:128]!\n" - //############################################# - //load inch kernel_3 k0-k7 - "vdup.s8 d8, d25[0] \n" - "vdup.s8 d9, d25[1] \n" - "vdup.s8 d10, d25[2] \n" - "vdup.s8 d11, d25[3] \n" - "vdup.s8 d12, d25[4] \n" - "vdup.s8 d13, d25[5] \n" - "vdup.s8 d14, d25[6] \n" - "vdup.s8 d15, d25[7] \n" - - //mla - "vmull.s8 q8, d0, d8 \n" - "vmlal.s8 q8, d1, d9 \n" - "vmlal.s8 q8, d2, d10 \n" - "vmlal.s8 q8, d3, d11 \n" - "vmlal.s8 q8, d4, d12 \n" - "vmlal.s8 q8, d5, d13 \n" - "vmlal.s8 q8, d6, d14 \n" - "vmlal.s8 q8, d7, d15 \n" - - //outptr3_s32 - "pld [%4, #256] \n" - "vld1.32 {d20-d21}, [%4:128] \n" //outptr3_s32 - "vaddw.s16 q10, q10, d16 \n" - "vst1.32 {d20-d21}, [%4:128]!\n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(outptr1), // %2 - "=r"(outptr2), // %3 - "=r"(outptr3), // %4 - "=r"(r0), // %5 - "=r"(r1), // %6 - "=r"(r2), // %7 - "=r"(r3), // %8 - "=r"(r4), // %9 - "=r"(r5), // %10 - "=r"(r6), // %11 - "=r"(r7) // %12 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(r0), - "6"(r1), - "7"(r2), - "8"(r3), - "9"(r4), - "10"(r5), - "11"(r6), - "12"(r7) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q10", "q11" - ); - } + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + outptr0[4] = sum4; + outptr0[5] = sum5; + outptr0[6] = sum6; + outptr0[7] = sum7; - for (; remain>0; remain--) - { - //ToDo Neon - int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7]; - int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7]; - int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7]; - int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7]; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; - - r0++; - r1++; - r2++; - r3++; - r4++; - r5++; - r6++; - r7++; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - } - } + outptr0 += 8; +#endif // __ARM_NEON + } - for (; q> 3; - int remain = size & 7; - - int8x8_t _k0 = vdup_n_s8(k0); - int8x8_t _k1 = vdup_n_s8(k1); - int8x8_t _k2 = vdup_n_s8(k2); - int8x8_t _k3 = vdup_n_s8(k3); - - if (nn > 0) - { - asm volatile( - "0: \n" - //load r0 - "pld [%5, #64] \n" - "vld1.s8 {d8}, [%5 :64]! \n" - - //mla - "vmull.s8 q5, d8, %12 \n" - //outptr0_s32 - "pld [%1, #256] \n" - "vld1.32 {d12-d15}, [%1] \n" - "vmovl.s16 q8, d10 \n" - "vmovl.s16 q9, d11 \n" - "vadd.s32 q6, q8 \n" - "vadd.s32 q7, q9 \n" - "vst1.32 {d12-d15}, [%1]! \n" - - //mla - "vmull.s8 q5, d8, %13 \n" - //outptr1_s32 - "pld [%2, #256] \n" - "vld1.32 {d12-d15}, [%2] \n" - "vaddw.s16 q6, q6, d10 \n" - "vaddw.s16 q7, q7, d11 \n" - "vst1.32 {d12-d15}, [%2]! \n" - - //mla - "vmull.s8 q5, d8, %14 \n" - //outptr0_s32 - "pld [%3, #256] \n" - "vld1.32 {d12-d15}, [%3] \n" - "vaddw.s16 q6, q6, d10 \n" - "vaddw.s16 q7, q7, d11 \n" - "vst1.32 {d12-d15}, [%3]! \n" - - //mla - "vmull.s8 q5, d8, %15 \n" - //outptr0_s32 - "pld [%4, #256] \n" - "vld1.32 {d12-d15}, [%4] \n" - "vaddw.s16 q6, q6, d10 \n" - "vaddw.s16 q7, q7, d11 \n" - "vst1.32 {d12-d15}, [%4]! \n" - - "subs %0, #1 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(outptr1), // %2 - "=r"(outptr2), // %3 - "=r"(outptr3), // %4 - "=r"(r0) // %5 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(r0), - "w"(_k0), // %12 - "w"(_k1), // %13 - "w"(_k2), // %14 - "w"(_k3) // %15 - : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9" - ); - } + const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4); + const signed char* kptr = kernel.channel(p/4 + p%4); +#if 0 //__ARM_NEON + asm volatile( + // inch loop + "vmov.s32 q6, #0 \n" - for (; remain>0; remain--) - { - // TODO neon optimize - int sum0 = (int)*r0 * k0; - int sum1 = (int)*r0 * k1; - int sum2 = (int)*r0 * k2; - int sum3 = (int)*r0 * k3; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; - - r0++; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - } - } - } + "lsr r4, %6, #2 \n"// r4 = nn = inch >> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%2, #128] \n" + "vld1.s8 {d4-d5}, [%1]! \n"// tmpr a00-a03,a10-a13,a20-a23,a30-a33 a(inch)(data) + "vmovl.s8 q3, d5 \n"// a20-a23,a30-a33 + "vmovl.s8 q2, d4 \n"// a00-a03,a10-a13 - #pragma omp parallel for num_threads(opt.num_threads) - for (int p=remain_outch_start; p> 3; - int remain = size & 7; + : "=r"(outptr0), // %0 + "=r"(tmpptr), // %1 + "=r"(kptr) // %2 + : "0"(outptr0), + "1"(tmpptr), + "2"(kptr), + "r"(inch) // %6 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6" + ); +#else + int sum0 = bias0; + int sum1 = bias0; + int sum2 = bias0; + int sum3 = bias0; - if (nn > 0) + for (int q=0; q0; remain--) - { - //ToDo Neon - int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7]; - - *outptr0 += sum0; - - r0++; - r1++; - r2++; - r3++; - r4++; - r5++; - r6++; - r7++; - outptr0++; - } + outptr0[0] = sum0; + outptr0[1] = sum1; + outptr0[2] = sum2; + outptr0[3] = sum3; + + outptr0 += 4; +#endif // __ARM_NEON } - for (; q> 3; - int remain = size & 7; + const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4 + i%4); + const signed char* kptr = kernel.channel(p/4 + p%4); - int8x8_t _k0 = vdup_n_s8(k0); + int q = 0; + int sum0 = bias0; - if (nn > 0) + for (; q0; remain--) - { - int sum0 = (int)*r0 * k0; - - *outptr0 += sum0; + outptr0[0] = sum0; - r0++; - outptr0++; - } + outptr0++; } - } -} + } +// // NOTE sgemm int8 +// for (; p> 3; int remain_size_start = nn_size << 3; @@ -4390,41 +1189,41 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); #else - int sum0_0 = biasptr[0]; - int sum0_1 = biasptr[0]; - int sum0_2 = biasptr[0]; - int sum0_3 = biasptr[0]; - int sum0_4 = biasptr[0]; - int sum0_5 = biasptr[0]; - int sum0_6 = biasptr[0]; - int sum0_7 = biasptr[0]; - - int sum1_0 = biasptr[1]; - int sum1_1 = biasptr[1]; - int sum1_2 = biasptr[1]; - int sum1_3 = biasptr[1]; - int sum1_4 = biasptr[1]; - int sum1_5 = biasptr[1]; - int sum1_6 = biasptr[1]; - int sum1_7 = biasptr[1]; - - int sum2_0 = biasptr[2]; - int sum2_1 = biasptr[2]; - int sum2_2 = biasptr[2]; - int sum2_3 = biasptr[2]; - int sum2_4 = biasptr[2]; - int sum2_5 = biasptr[2]; - int sum2_6 = biasptr[2]; - int sum2_7 = biasptr[2]; - - int sum3_0 = biasptr[3]; - int sum3_1 = biasptr[3]; - int sum3_2 = biasptr[3]; - int sum3_3 = biasptr[3]; - int sum3_4 = biasptr[3]; - int sum3_5 = biasptr[3]; - int sum3_6 = biasptr[3]; - int sum3_7 = biasptr[3]; + int sum0_0 = 0; + int sum0_1 = 0; + int sum0_2 = 0; + int sum0_3 = 0; + int sum0_4 = 0; + int sum0_5 = 0; + int sum0_6 = 0; + int sum0_7 = 0; + + int sum1_0 = 0; + int sum1_1 = 0; + int sum1_2 = 0; + int sum1_3 = 0; + int sum1_4 = 0; + int sum1_5 = 0; + int sum1_6 = 0; + int sum1_7 = 0; + + int sum2_0 = 0; + int sum2_1 = 0; + int sum2_2 = 0; + int sum2_3 = 0; + int sum2_4 = 0; + int sum2_5 = 0; + int sum2_6 = 0; + int sum2_7 = 0; + + int sum3_0 = 0; + int sum3_1 = 0; + int sum3_2 = 0; + int sum3_3 = 0; + int sum3_4 = 0; + int sum3_5 = 0; + int sum3_6 = 0; + int sum3_7 = 0; for (int q=0; q 0; remain--) - { - //ToDo Neon - int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] + - (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] + - (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] + - (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7]; - - *outptr0 += sum0; - - r0 += 2; - r1 += 2; - r2 += 2; - r3 += 2; - r4 += 2; - r5 += 2; - r6 += 2; - r7 += 2; - outptr0++; - } - - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; - r3 += tailstep; - r4 += tailstep; - r5 += tailstep; - r6 += tailstep; - r7 += tailstep; - } - } - - for (; q 0; remain--) - { - //ToDo Neon - int sum0 = (int)*r0 * (int)kernel0[0]; + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} - *outptr0 += sum0; +static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 1; + int kernel_h = 1; - r0 += 2; - outptr0++; - } + int stride_w = 2; + int stride_h = 2; - r0 += tailstep; - } - } - } + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); } diff --git a/src/layer/arm/convolution_3x3_int8.h b/src/layer/arm/convolution_3x3_int8.h index 759b2eade..857053c51 100644 --- a/src/layer/arm/convolution_3x3_int8.h +++ b/src/layer/arm/convolution_3x3_int8.h @@ -69,6 +69,185 @@ static void conv3x3s1_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel } } +static void conv3x3s1_winograd23_transform_kernel_int8_neon(const Mat& kernel, std::vector &kernel_tm2, int inch, int outch) +{ + Mat kernel_tm(4*4, inch, outch, 2ul); + + // G + const short ktm[4][3] = { + { 2, 0, 0}, + { 1, 1, 1}, + { 1, -1, 1}, + { 0, 0, 2} + }; + + #pragma omp parallel for + for (int p = 0; p(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[4][3]; + for (int i=0; i<4; i++) + { + tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j=0; j<4; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i=0; i<4; i++) + { + kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } + + for (int r=0; r<4; r++) + { + Mat kernel_tm_test(4*8, inch, outch/8 + (outch%8)/4 + outch%4, 2u); + + int p = 0; + for (; p+7 &kernel_tm_test, const Option& opt) { int w = bottom_blob.w; + int h = bottom_blob.h; int inch = bottom_blob.c; int outw = top_blob.w; int outh = top_blob.h; int outch = top_blob.c; - const signed char* kernel = _kernel; + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; - int nn_outch = outch >> 1; - int remain_outch_start = nn_outch << 1; + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp < nn_outch; pp++) + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); + + // double start = ncnn::get_current_time(); + // BEGIN transform input + Mat bottom_blob_tm; { - int p = pp * 2; - - Mat out0 = top_blob.channel(p); - Mat out1 = top_blob.channel(p+1); + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; - out0.fill(0); - out1.fill(0); - - const signed char* kernel0 = (const signed char *)kernel + p * inch * 9; - const signed char* kernel1 = (const signed char *)kernel + (p + 1) * inch * 9; - - for (int q=0; q> 3; - int remain = outw & 7; + const signed char* r0 = img + w * j * 2; + const signed char* r1 = r0 + w; + const signed char* r2 = r1 + w; + const signed char* r3 = r2 + w; - for (; nn > 0; nn--) + for (int i = 0; i(q); + short* out_tm1 = bottom_blob_tm.channel(tiles*1+j*nRowBlocks+i).row(q); + short* out_tm2 = bottom_blob_tm.channel(tiles*2+j*nRowBlocks+i).row(q); + short* out_tm3 = bottom_blob_tm.channel(tiles*3+j*nRowBlocks+i).row(q); + + short d0[4],d1[4],d2[4],d3[4]; + short w0[4],w1[4],w2[4],w3[4]; + short t0[4],t1[4],t2[4],t3[4]; + // load + for (int n = 0; n < 4; n++) + { + d0[n] = r0[n]; + d1[n] = r1[n]; + d2[n] = r2[n]; + d3[n] = r3[n]; + } + // w = B_t * d + for (int n = 0; n < 4; n++) + { + w0[n] = d0[n] - d2[n]; + w1[n] = d1[n] + d2[n]; + w2[n] = d2[n] - d1[n]; + w3[n] = d3[n] - d1[n]; + } + // transpose d to d_t + { + t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3]; + t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3]; + t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3]; + t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3]; + } + // U = B_t * d_t + for (int n = 0; n < 4; n++) + { + d0[n] = t0[n] - t2[n]; + d1[n] = t1[n] + t2[n]; + d2[n] = t2[n] - t1[n]; + d3[n] = t3[n] - t1[n]; + } + // save to out_tm + for (int n = 0; n < 4; n++) + { + out_tm0[n] = d0[n]; + out_tm1[n] = d1[n]; + out_tm2[n] = d2[n]; + out_tm3[n] = d3[n]; + } + + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + } + } + } + } + bottom_blob_bordered = Mat(); - vst1q_s32(outptr0n, sum1_s32); - vst1q_s32(outptr0n+4, sum1n_s32); + // double end = ncnn::get_current_time(); + // printf("trans A : %.3f ms\n", end - start); + // start = ncnn::get_current_time(); - // outch 1 - _sum0 = vmull_s8(_r0, _k10); - _sum0 = vmlal_s8(_sum0, _r01, _k11); - _sum0 = vmlal_s8(_sum0, _r02, _k12); + // BEGIN dot + Mat top_blob_tm; + { + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; - _sum0 = vmlal_s8(_sum0, _r1, _k13); - _sum0 = vmlal_s8(_sum0, _r11, _k14); - _sum0 = vmlal_s8(_sum0, _r12, _k15); + int nColBlocks = h_tm/4; // may be the block num in FeatherCNN + int nRowBlocks = w_tm/4; - _sum0 = vmlal_s8(_sum0, _r2, _k16); - _sum0 = vmlal_s8(_sum0, _r21, _k17); - _sum0 = vmlal_s8(_sum0, _r22, _k18); + const int tiles = nColBlocks * nRowBlocks; - _sum1 = vmull_s8(_r1, _k10); - _sum1 = vmlal_s8(_sum1, _r11, _k11); - _sum1 = vmlal_s8(_sum1, _r12, _k12); + top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator); - _sum1 = vmlal_s8(_sum1, _r2, _k13); - _sum1 = vmlal_s8(_sum1, _r21, _k14); - _sum1 = vmlal_s8(_sum1, _r22, _k15); + for (int r=0; r<4; r++) + { + int nn_outch = 0; + int remain_outch_start = 0; - _sum1 = vmlal_s8(_sum1, _r3, _k16); - _sum1 = vmlal_s8(_sum1, _r31, _k17); - _sum1 = vmlal_s8(_sum1, _r32, _k18); + nn_outch = outch >> 3; + remain_outch_start = nn_outch << 3; - sum0_s32 = vld1q_s32(outptr1); - sum0n_s32 = vld1q_s32(outptr1+4); + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2; - sum1_s32 = vaddw_s16(sum1_s32, vget_low_s16(_sum1)); - sum1n_s32 = vaddw_s16(sum1n_s32, vget_high_s16(_sum1)); + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp0; remain--) + for (int i=0; i> 3; - int remain = outw & 7; - - for (; nn > 0; nn--) - { - // outch 0 - int8x8_t _r0 = vld1_s8(r0); - int8x8_t _r0n = vld1_s8(r0+8); - int8x8_t _r01 = vext_s8(_r0, _r0n, 1); - int8x8_t _r02 = vext_s8(_r0, _r0n, 2); - - int16x8_t _sum0 = vmull_s8(_r0, _k00); - _sum0 = vmlal_s8(_sum0, _r01, _k01); - _sum0 = vmlal_s8(_sum0, _r02, _k02); - - int8x8_t _r1 = vld1_s8(r1); - int8x8_t _r1n = vld1_s8(r1+8); - int8x8_t _r11 = vext_s8(_r1, _r1n, 1); - int8x8_t _r12 = vext_s8(_r1, _r1n, 2); - _sum0 = vmlal_s8(_sum0, _r1, _k03); - _sum0 = vmlal_s8(_sum0, _r11, _k04); - _sum0 = vmlal_s8(_sum0, _r12, _k05); - - int8x8_t _r2 = vld1_s8(r2); - int8x8_t _r2n = vld1_s8(r2+8); - int8x8_t _r21 = vext_s8(_r2, _r2n, 1); - int8x8_t _r22 = vext_s8(_r2, _r2n, 2); - _sum0 = vmlal_s8(_sum0, _r2, _k06); - _sum0 = vmlal_s8(_sum0, _r21, _k07); - _sum0 = vmlal_s8(_sum0, _r22, _k08); - - int32x4_t sum0_s32 = vld1q_s32(outptr0); - int32x4_t sum0n_s32 = vld1q_s32(outptr0+4); + remain_outch_start += nn_outch << 2; - sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0)); - sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0)); + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=remain_outch_start; p0; remain--) + for (int j=0; j> 2; + outRow0[1] = o0[1] >> 2; + outRow1[0] = o1[0] >> 2; + outRow1[1] = o1[1] >> 2; - *outptr0 += sum0; - *outptr1 += sum1; + out_tile += 16; - r0++; - r1++; - r2++; - outptr0++; - outptr1++; + outRow0 += 2; + outRow1 += 2; } - r0 += 2; - r1 += 2; - r2 += 2; + outRow0 += outw; + outRow1 += outw; } - - kernel0 += 9; - kernel1 += 9; - } + } } + // END transform output + // end = ncnn::get_current_time(); + // printf("trans C : %.3f ms\n", end - start); + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); +} + +static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + int nn_outch = outch >> 2; + int remain_outch_start = nn_outch << 2; #pragma omp parallel for num_threads(opt.num_threads) - for (int p=remain_outch_start; p> 3; + int remain = outw & 7; +#else + int remain = outw; +#endif // __ARM_NEON + +#if 0 //__ARM_NEON + if (nn > 0) + { + asm volatile( + "0: \n" + "vld1.s8 {d0-d3}, [%8]! \n"// d0=(k00-k30 k01-k31) d1=(k02-k32 k03-k33) d2=(k04-k34 k05-k35) d3=(k06-k36 k07-k37) + // r0 + "pld [%5, #128] \n" + "vld1.s8 {d8-d9}, [%5] \n"// d8=r00-r07 d9=r08-r015 q4 + "add %5, #8 \n" + "pld [%1, #128] \n" + "vld1.s32 {d12-d15}, [%1] \n"// sum00-sum07 q6 q7 + "pld [%2, #128] \n" + "vld1.s32 {d16-d19}, [%2] \n"// sum10-sum17 q8 q9 + "pld [%3, #128] \n" + "vld1.s32 {d20-d23}, [%3] \n"// sum20-sum27 q10 q11 + "pld [%4, #128] \n" + "vld1.s32 {d24-d27}, [%4] \n"// sum30-sum37 q12 q13 + + "vmovl.s8 q3, d3 \n"// d6(k06-k36) d7(k07-k37) + "vmovl.s8 q2, d2 \n"// d4(k04-k34) d5(k05-k35) + "vmovl.s8 q1, d1 \n"// d2(k02-k32) d3(k03-k33) + "vmovl.s8 q0, d0 \n"// d0(k00-k30) d1(k01-k31) + "vmovl.s8 q5, d8 \n"// d10(r00-r03) d11(r04-r07) + + "vmlal.s16 q6, d10, d0[0] \n"// sum(00-07) += (r00-r07) * k00 + "vmlal.s16 q7, d11, d0[0] \n" + "vmlal.s16 q8, d10, d0[1] \n"// sum(10-17) += (r00-r07) * k10 + "vmlal.s16 q9, d11, d0[1] \n" + "vmlal.s16 q10, d10, d0[2] \n"// sum(20-27) += (r00-r07) * k20 + "vmlal.s16 q11, d11, d0[2] \n" + "vmlal.s16 q12, d10, d0[3] \n"// sum(30-37) += (r00-r07) * k30 + "vmlal.s16 q13, d11, d0[3] \n" + + "vext.s8 q4, q4, #1 \n"// d8=r01-r08 q4 + "vmovl.s8 q5, d8 \n"// d10(r01-r04) d11(r05-r08) + + "vmlal.s16 q6, d10, d1[0] \n"// sum(00-07) += (r01-r08) * k01 + "vmlal.s16 q7, d11, d1[0] \n" + "vmlal.s16 q8, d10, d1[1] \n"// sum(10-17) += (r01-r08) * k11 + "vmlal.s16 q9, d11, d1[1] \n" + "vmlal.s16 q10, d10, d1[2] \n"// sum(20-27) += (r01-r08) * k21 + "vmlal.s16 q11, d11, d1[2] \n" + "vmlal.s16 q12, d10, d1[3] \n"// sum(30-37) += (r01-r08) * k31 + "vmlal.s16 q13, d11, d1[3] \n" + + "vext.s8 q4, q4, #1 \n"// d8=r02-r09 q4 + "vmovl.s8 q5, d8 \n"// d10(r02-r05) d11(r06-r09) + + "vmlal.s16 q6, d10, d2[0] \n"// sum(00-07) += (r02-r09) * k02 + "vmlal.s16 q7, d11, d2[0] \n" + "vmlal.s16 q8, d10, d2[1] \n"// sum(10-17) += (r02-r09) * k12 + "vmlal.s16 q9, d11, d2[1] \n" + "vmlal.s16 q10, d10, d2[2] \n"// sum(20-27) += (r02-r09) * k22 + "vmlal.s16 q11, d11, d2[2] \n" + "vmlal.s16 q12, d10, d2[3] \n"// sum(30-37) += (r02-r09) * k32 + "vmlal.s16 q13, d11, d2[3] \n" + + // r1 + "pld [%6, #128] \n" + "vld1.s8 {d8-d9}, [%6] \n"// d8=r10-r17 d9=r18-r115 q4 + "add %6, #8 \n" + "vmovl.s8 q5, d8 \n"// d10(r10-r13) d11(r14-r17) + + "vmlal.s16 q6, d10, d3[0] \n"// sum(00-07) += (r10-r17) * k03 + "vmlal.s16 q7, d11, d3[0] \n" + "vmlal.s16 q8, d10, d3[1] \n"// sum(10-17) += (r10-r17) * k13 + "vmlal.s16 q9, d11, d3[1] \n" + "vmlal.s16 q10, d10, d3[2] \n"// sum(20-27) += (r10-r17) * k23 + "vmlal.s16 q11, d11, d3[2] \n" + "vmlal.s16 q12, d10, d3[3] \n"// sum(30-37) += (r10-r17) * k33 + "vmlal.s16 q13, d11, d3[3] \n" + + "vext.s8 q4, q4, #1 \n"// d8=r11-r18 q4 + "vmovl.s8 q5, d8 \n"// d10(r11-r14) d11(r15-r18) + + "vmlal.s16 q6, d10, d4[0] \n"// sum(00-07) += (r11-r18) * k04 + "vmlal.s16 q7, d11, d4[0] \n" + "vmlal.s16 q8, d10, d4[1] \n"// sum(10-17) += (r11-r18) * k14 + "vmlal.s16 q9, d11, d4[1] \n" + "vmlal.s16 q10, d10, d4[2] \n"// sum(20-27) += (r11-r18) * k24 + "vmlal.s16 q11, d11, d4[2] \n" + "vmlal.s16 q12, d10, d4[3] \n"// sum(30-37) += (r11-r18) * k34 + "vmlal.s16 q13, d11, d4[3] \n" + + "vext.s8 q4, q4, #1 \n"// d8=r12-r19 q4 + "vmovl.s8 q5, d8 \n"// d10(r12-r15) d11(r16-r19) + + "vmlal.s16 q6, d10, d5[0] \n"// sum(00-07) += (r12-r19) * k05 + "vmlal.s16 q7, d11, d5[0] \n" + "vmlal.s16 q8, d10, d5[1] \n"// sum(10-17) += (r12-r19) * k15 + "vmlal.s16 q9, d11, d5[1] \n" + "vmlal.s16 q10, d10, d5[2] \n"// sum(20-27) += (r12-r19) * k25 + "vmlal.s16 q11, d11, d5[2] \n" + "vmlal.s16 q12, d10, d5[3] \n"// sum(30-37) += (r12-r19) * k35 + "vmlal.s16 q13, d11, d5[3] \n" + + // r2 + "pld [%7, #128] \n" + "vld1.s8 {d8-d9}, [%7] \n"// d8=r20-r27 d9=r28-r215 q4 + "add %7, #8 \n" + "vmovl.s8 q5, d8 \n"// d10(r20-r23) d11(r24-r27) + + "vmlal.s16 q6, d10, d6[0] \n"// sum(00-07) += (r20-r27) * k06 + "vmlal.s16 q7, d11, d6[0] \n" + "vmlal.s16 q8, d10, d6[1] \n"// sum(10-17) += (r20-r27) * k16 + "vmlal.s16 q9, d11, d6[1] \n" + "vmlal.s16 q10, d10, d6[2] \n"// sum(20-27) += (r20-r27) * k26 + "vmlal.s16 q11, d11, d6[2] \n" + "vmlal.s16 q12, d10, d6[3] \n"// sum(30-37) += (r20-r27) * k36 + "vmlal.s16 q13, d11, d6[3] \n" + + "vext.s8 q4, q4, #1 \n"// d8=r21-r28 q4 + "vmovl.s8 q5, d8 \n"// d10(r21-r24) d11(r25-r28) + + "vmlal.s16 q6, d10, d7[0] \n"// sum(00-07) += (r21-r28) * k07 + "vmlal.s16 q7, d11, d7[0] \n" + "vmlal.s16 q8, d10, d7[1] \n"// sum(10-17) += (r21-r28) * k17 + "vmlal.s16 q9, d11, d7[1] \n" + "vmlal.s16 q10, d10, d7[2] \n"// sum(20-27) += (r21-r28) * k27 + "vmlal.s16 q11, d11, d7[2] \n" + "vmlal.s16 q12, d10, d7[3] \n"// sum(30-37) += (r21-r28) * k37 + "vmlal.s16 q13, d11, d7[3] \n" + + "vld1.s8 {d0}, [%8] \n"// d0(k08-k38 xx-xx) + "add %8, #4 \n" + "vmovl.s8 q0, d0 \n"// d0(k08-k38) d1(xx-xx) + + "vext.s8 q4, q4, #1 \n"// d8=r22-r29 q4 + "vmovl.s8 q5, d8 \n"// d10(r22-r25) d11(r26-r29) + + "vmlal.s16 q6, d10, d0[0] \n"// sum(00-07) += (r22-r29) * k08 + "vmlal.s16 q7, d11, d0[0] \n" + "vmlal.s16 q8, d10, d0[1] \n"// sum(10-17) += (r22-r29) * k18 + "vmlal.s16 q9, d11, d0[1] \n" + "vmlal.s16 q10, d10, d0[2] \n"// sum(20-27) += (r22-r29) * k28 + "vmlal.s16 q11, d11, d0[2] \n" + "vmlal.s16 q12, d10, d0[3] \n"// sum(30-37) += (r22-r29) * k38 + "vmlal.s16 q13, d11, d0[3] \n" + + "vst1.s32 {d12-d15}, [%1]! \n"// sum00-sum07 q6 q7 + "vst1.s32 {d16-d19}, [%2]! \n"// sum10-sum17 q8 q9 + "vst1.s32 {d20-d23}, [%3]! \n"// sum20-sum27 q10 q11 + "vst1.s32 {d24-d27}, [%4]! \n"// sum30-sum37 q12 q13 + + "sub %8, #36 \n" + "subs %0, #1 \n" + + "bne 0b \n" + + : "=r"(nn), // %0 + "=r"(outptr0), // %1 + "=r"(outptr1), // %2 + "=r"(outptr2), // %3 + "=r"(outptr3), // %4 + "=r"(r0), // %5 + "=r"(r1), // %6 + "=r"(r2), // %7 + "=r"(ktmp) // %8 + : "0"(nn), + "1"(outptr0), + "2"(outptr1), + "3"(outptr2), + "4"(outptr3), + "5"(r0), + "6"(r1), + "7"(r2), + "8"(ktmp) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" //q14 q15 not be used... + ); + } +#endif +#if 0 //__ARM_NEON + if (remain >= 4) + { + remain -= 4; + asm volatile( + "vld1.s8 {d0-d3}, [%7]! \n"// d0=(k00-k30 k01-k31) d1=(k02-k32 k03-k33) d2=(k04-k34 k05-k35) d3=(k06-k36 k07-k37) + // r0 + "vld1.s8 {d8}, [%4] \n"// d8=r00-r07 + "add %4, #4 \n" + "vld1.s32 {d12-d13}, [%0] \n"// sum00-sum03 q6 + "vld1.s32 {d16-d17}, [%1] \n"// sum10-sum13 q8 + "vld1.s32 {d20-d21}, [%2] \n"// sum20-sum23 q10 + "vld1.s32 {d24-d25}, [%3] \n"// sum30-sum33 q12 + + "vmovl.s8 q3, d3 \n"// d6(k06-k36) d7(k07-k37) + "vmovl.s8 q2, d2 \n"// d4(k04-k34) d5(k05-k35) + "vmovl.s8 q1, d1 \n"// d2(k02-k32) d3(k03-k33) + "vmovl.s8 q0, d0 \n"// d0(k00-k30) d1(k01-k31) + "vmovl.s8 q5, d8 \n"// d10(r00-r03) + + "vmlal.s16 q6, d10, d0[0] \n"// sum(00-03) += (r00-r03) * k00 + "vmlal.s16 q8, d10, d0[1] \n"// sum(10-13) += (r00-r03) * k10 + "vmlal.s16 q10, d10, d0[2] \n"// sum(20-23) += (r00-r03) * k20 + "vmlal.s16 q12, d10, d0[3] \n"// sum(30-33) += (r00-r03) * k30 + + "vext.s8 d8, d8, #1 \n"// d8=r01-r08 + "vmovl.s8 q5, d8 \n"// d10(r01-r04) + + "vmlal.s16 q6, d10, d1[0] \n"// sum(00-03) += (r01-r04) * k01 + "vmlal.s16 q8, d10, d1[1] \n"// sum(10-13) += (r01-r04) * k11 + "vmlal.s16 q10, d10, d1[2] \n"// sum(20-23) += (r01-r04) * k21 + "vmlal.s16 q12, d10, d1[3] \n"// sum(30-33) += (r01-r04) * k31 + + "vext.s8 d8, d8, #1 \n"// d8=r02-r09 + "vmovl.s8 q5, d8 \n"// d10(r02-r05) + + "vmlal.s16 q6, d10, d2[0] \n"// sum(00-03) += (r02-r05) * k02 + "vmlal.s16 q8, d10, d2[1] \n"// sum(10-13) += (r02-r05) * k12 + "vmlal.s16 q10, d10, d2[2] \n"// sum(20-23) += (r02-r05) * k22 + "vmlal.s16 q12, d10, d2[3] \n"// sum(30-33) += (r02-r05) * k32 + + // r1 + "vld1.s8 {d8}, [%5] \n"// d8=r10-r17 + "add %5, #4 \n" + "vmovl.s8 q5, d8 \n"// d10(r10-r13) + + "vmlal.s16 q6, d10, d3[0] \n"// sum(00-03) += (r10-r13) * k03 + "vmlal.s16 q8, d10, d3[1] \n"// sum(10-13) += (r10-r13) * k13 + "vmlal.s16 q10, d10, d3[2] \n"// sum(20-23) += (r10-r13) * k23 + "vmlal.s16 q12, d10, d3[3] \n"// sum(30-33) += (r10-r13) * k33 + + "vext.s8 d8, d8, #1 \n"// d8=r11-r18 + "vmovl.s8 q5, d8 \n"// d10(r11-r14) + + "vmlal.s16 q6, d10, d4[0] \n"// sum(00-03) += (r11-r14) * k04 + "vmlal.s16 q8, d10, d4[1] \n"// sum(10-13) += (r11-r14) * k14 + "vmlal.s16 q10, d10, d4[2] \n"// sum(20-23) += (r11-r14) * k24 + "vmlal.s16 q12, d10, d4[3] \n"// sum(30-33) += (r11-r14) * k34 + + "vext.s8 d8, d8, #1 \n"// d8=r12-r19 q4 + "vmovl.s8 q5, d8 \n"// d10(r12-r15) + + "vmlal.s16 q6, d10, d5[0] \n"// sum(00-03) += (r12-r15) * k05 + "vmlal.s16 q8, d10, d5[1] \n"// sum(10-13) += (r12-r15) * k15 + "vmlal.s16 q10, d10, d5[2] \n"// sum(20-23) += (r12-r15) * k25 + "vmlal.s16 q12, d10, d5[3] \n"// sum(30-33) += (r12-r15) * k35 + + // r2 + "vld1.s8 {d8}, [%6] \n"// d8=r20-r27 + "add %6, #4 \n" + "vmovl.s8 q5, d8 \n"// d10(r20-r23) + + "vmlal.s16 q6, d10, d6[0] \n"// sum(00-03) += (r20-r23) * k06 + "vmlal.s16 q8, d10, d6[1] \n"// sum(10-13) += (r20-r23) * k16 + "vmlal.s16 q10, d10, d6[2] \n"// sum(20-23) += (r20-r23) * k26 + "vmlal.s16 q12, d10, d6[3] \n"// sum(30-33) += (r20-r23) * k36 + + "vext.s8 q4, q4, #1 \n"// d8=r21-r28 q4 + "vmovl.s8 q5, d8 \n"// d10(r21-r24) + + "vmlal.s16 q6, d10, d7[0] \n"// sum(00-03) += (r21-r24) * k07 + "vmlal.s16 q8, d10, d7[1] \n"// sum(10-13) += (r21-r24) * k17 + "vmlal.s16 q10, d10, d7[2] \n"// sum(20-23) += (r21-r24) * k27 + "vmlal.s16 q12, d10, d7[3] \n"// sum(30-33) += (r21-r24) * k37 + + "vld1.s8 {d0}, [%7] \n"// d0(k08-k38 xx-xx) + "add %7, #4 \n" + "vmovl.s8 q0, d0 \n"// d0(k08-k38) d1(xx-xx) + + "vext.s8 d8, d8, #1 \n"// d8=r22-r25 + "vmovl.s8 q5, d8 \n"// d10(r22-r25) + + "vmlal.s16 q6, d10, d0[0] \n"// sum(00-03) += (r22-r25) * k08 + "vmlal.s16 q8, d10, d0[1] \n"// sum(10-13) += (r22-r25) * k18 + "vmlal.s16 q10, d10, d0[2] \n"// sum(20-23) += (r22-r25) * k28 + "vmlal.s16 q12, d10, d0[3] \n"// sum(30-33) += (r22-r25) * k38 + + "vst1.s32 {d12-d13}, [%0]! \n"// sum00-sum03 q6 + "vst1.s32 {d16-d17}, [%1]! \n"// sum10-sum13 q8 + "vst1.s32 {d20-d21}, [%2]! \n"// sum20-sum23 q10 + "vst1.s32 {d24-d25}, [%3]! \n"// sum30-sum33 q12 + + "sub %7, #36 \n" + + : "=r"(outptr0), // %0 + "=r"(outptr1), // %1 + "=r"(outptr2), // %2 + "=r"(outptr3), // %3 + "=r"(r0), // %4 + "=r"(r1), // %5 + "=r"(r2), // %6 + "=r"(ktmp) // %7 + : "0"(outptr0), + "1"(outptr1), + "2"(outptr2), + "3"(outptr3), + "4"(r0), + "5"(r1), + "6"(r2), + "7"(ktmp) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13" //q14 q15 not be used... + ); + } +#endif + for (; remain>0; remain--) + { +#if 0 //__ARM_NEON + asm volatile( + "vld1.s8 {d0[]}, [%4]! \n"// d0(r00) + "vld1.s8 {d1[]}, [%4]! \n"// d1(r01) + + "vld1.s8 {d4-d7}, [%7]! \n"// d4(k00-k30 k01-k31) d5(k02-k32 k03-k33) d6(k04-k34 k05-k35) d7(k06-k36 k07-k37) + + "vsli.64 d0, d1, #32 \n"// d0(r00 r00 r00 r00 r01 r01 r01 r01) + + "vld1.s8 {d2[]}, [%4] \n"// d2(r02 r02 r02 r02 r02 r02 r02 r02) + "sub %4, %4, #2 \n" + "vld1.s8 {d3[]}, [%5]! \n"// d3(r10 r10 r10 r10 r10 r10 r10 r10) + + "vmovl.s8 q5, d7 \n"// d10(k06-k36) d11(k07-k37) + "vmovl.s8 q4, d6 \n"// d8(k04-k34) d9(k05-k35) + "vmovl.s8 q3, d5 \n"// d6(k02-k32) d7(k03-k33) + "vmovl.s8 q2, d4 \n"// d4(k00-k30) d5(k01-k31) + + "vmovl.s8 q0, d0 \n"// d0(r00 r00 r00 r00) d1(r01 r01 r01 r01) + + "vsli.64 d2, d3, #32 \n"// d2(r02 r02 r02 r02 r10 r10 r10 r10) + + "vmull.s16 q8, d0, d4 \n"// (r00) * (k00-k30) + "vmull.s16 q9, d1, d5 \n"// (r01) * (k01-k31) + + "vmovl.s8 q10, d2 \n"// d20(r02 r02 r02 r02) d21(r10 r10 r10 r10) + + "vld1.s8 {d0[]}, [%5]! \n"// d0(r11 r11 r11 r11 r11 r11 r11 r11) + "vld1.s8 {d1[]}, [%5] \n"// d1(r12 r12 r12 r12 r12 r12 r12 r12) + "sub %5, %5, #2 \n" + + "vsli.64 d0, d1, #32 \n"// d0(r11 r11 r11 r11 r12 r12 r12 r12) + + "vmlal.s16 q8, d20, d6 \n"// (r02) * (k02-k32) + "vmlal.s16 q9, d21, d7 \n"// (r10) * (k03-k33) + + "vmovl.s8 q0, d0 \n"// d0(r11 r11 r11 r11 ) d1(r12 r12 r12 r12) + + "vld1.s8 {d2[]}, [%6]! \n"// d2(r20 r20 r20 r20 r20 r20 r20 r20) + "vld1.s8 {d3[]}, [%6]! \n"// d3(r21 r21 r21 r21 r21 r21 r21 r21) + + "vsli.64 d2, d3, #32 \n"// d2(r20 r20 r20 r20 r21 r21 r21 r21) + + "vmlal.s16 q8, d0, d8 \n"// (r11) * (k04-k34) + "vmlal.s16 q9, d1, d9 \n"// (r12) * (k05-k35) + + "vmovl.s8 q2, d2 \n"// d4(r20 r20 r20 r20) d5(r21 r21 r21 r21) + + "vld1.s8 {d0[]}, [%6] \n"// d0(r22 r22 r22 r22 r22 r22 r22 r22) + "sub %6, %6, #2 \n" + "veor d1, d1, d1 \n"// d1 = 0 + + "vld1.s8 {d6}, [%7] \n"// d6 = k08-k38 xxxx + "sub %7, #32 \n" + + "vsli.64 d0, d1, #32 \n"// d0(r22 r22 r22 r22 0 0 0 0) + "vmovl.s8 q4, d6 \n"// d8(k08-k38) + "vmovl.s8 q0, d0 \n"// d0(r22 r22 r22 r22) d1(0 0 0 0) + + "vmlal.s16 q8, d4, d10 \n"// (r20) * (k06-k36) + "vmlal.s16 q9, d5, d11 \n"// (r21) * (k07-k37) + + "vld1.s32 {d20[0]}, [%0] \n" + + "vmlal.s16 q8, d0, d8 \n"// (r22) * (k08-k38) + + "vld1.s32 {d20[1]}, [%1] \n" + + "vadd.s32 q8, q8, q9 \n" + + "vld1.s32 {d21[0]}, [%2] \n" + "vld1.s32 {d21[1]}, [%3] \n" + + "vadd.s32 q10, q10, q8 \n" + + "vst1.s32 {d20[0]}, [%0]! \n" + "vst1.s32 {d20[1]}, [%1]! \n" + "vst1.s32 {d21[0]}, [%2]! \n" + "vst1.s32 {d21[1]}, [%3]! \n" + + : "=r"(outptr0), // %0 + "=r"(outptr1), // %1 + "=r"(outptr2), // %2 + "=r"(outptr3), // %3 + "=r"(r0), // %4 + "=r"(r1), // %5 + "=r"(r2), // %6 + "=r"(ktmp) // %7 + : "0"(outptr0), + "1"(outptr1), + "2"(outptr2), + "3"(outptr3), + "4"(r0), + "5"(r1), + "6"(r2), + "7"(ktmp) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", "q10" + ); +#else + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + sum0 += r0[0] * ktmp[0]; + sum1 += r0[0] * ktmp[1]; + sum2 += r0[0] * ktmp[2]; + sum3 += r0[0] * ktmp[3]; + + sum0 += r0[1] * ktmp[4]; + sum1 += r0[1] * ktmp[5]; + sum2 += r0[1] * ktmp[6]; + sum3 += r0[1] * ktmp[7]; + ktmp += 8; + + sum0 += r0[2] * ktmp[0]; + sum1 += r0[2] * ktmp[1]; + sum2 += r0[2] * ktmp[2]; + sum3 += r0[2] * ktmp[3]; + + sum0 += r1[0] * ktmp[4]; + sum1 += r1[0] * ktmp[5]; + sum2 += r1[0] * ktmp[6]; + sum3 += r1[0] * ktmp[7]; + ktmp += 8; + + sum0 += r1[1] * ktmp[0]; + sum1 += r1[1] * ktmp[1]; + sum2 += r1[1] * ktmp[2]; + sum3 += r1[1] * ktmp[3]; + + sum0 += r1[2] * ktmp[4]; + sum1 += r1[2] * ktmp[5]; + sum2 += r1[2] * ktmp[6]; + sum3 += r1[2] * ktmp[7]; + ktmp += 8; + + sum0 += r2[0] * ktmp[0]; + sum1 += r2[0] * ktmp[1]; + sum2 += r2[0] * ktmp[2]; + sum3 += r2[0] * ktmp[3]; + + sum0 += r2[1] * ktmp[4]; + sum1 += r2[1] * ktmp[5]; + sum2 += r2[1] * ktmp[6]; + sum3 += r2[1] * ktmp[7]; + ktmp += 8; + + sum0 += r2[2] * ktmp[0]; + sum1 += r2[2] * ktmp[1]; + sum2 += r2[2] * ktmp[2]; + sum3 += r2[2] * ktmp[3]; + ktmp += 8; + + *outptr0 += sum0; + *outptr1 += sum1; + *outptr2 += sum2; + *outptr3 += sum3; + + ktmp -= 8*5; + + outptr0++; + outptr1++; + outptr2++; + outptr3++; +#endif + r0++; + r1++; + r2++; + } + + r0 += 2; + r1 += 2; + r2 += 2; + } + + ktmp += 4*9; + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=remain_outch_start; p> 3; + int remain = outw & 7; +#else + int remain = outw; +#endif // __ARM_NEON + +#if 0 //__ARM_NEON + for (; nn >0; nn--) + { + // r0 + int8x8_t _r0 = vld1_s8(r0); + int8x8_t _r0n = vld1_s8(r0+8); + int8x8_t _r01 = vext_s8(_r0, _r0n, 1); + int8x8_t _r02 = vext_s8(_r0, _r0n, 2); + int16x8_t _r0_s16 = vmovl_s8(_r0); // r00 - r07 + int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08 + int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09 + + int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r07) * k00 + int32x4_t _sum0n = vmull_lane_s16(vget_high_s16(_r0_s16), _k0123, 0); + + int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r08) * k01 + int32x4_t _sum1n = vmull_lane_s16(vget_high_s16(_r01_s16), _k0123, 1); + + int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r09) * k02 + int32x4_t _sum2n = vmull_lane_s16(vget_high_s16(_r02_s16), _k0123, 2); + + // r1 + int8x8_t _r1 = vld1_s8(r1); + int8x8_t _r1n = vld1_s8(r1+8); + int8x8_t _r11 = vext_s8(_r1, _r1n, 1); + int8x8_t _r12 = vext_s8(_r1, _r1n, 2); + int16x8_t _r1_s16 = vmovl_s8(_r1); // r10 - r17 + int16x8_t _r11_s16 = vmovl_s8(_r11); // r11 - r18 + int16x8_t _r12_s16 = vmovl_s8(_r12); // r12 - r19 + + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r1_s16), _k0123, 3); // (r10 - r17) * k03 + _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r1_s16), _k0123, 3); + + _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r11_s16), _k4567, 0); // (r11 - r18) * k04 + _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r11_s16), _k4567, 0); + + _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r12_s16), _k4567, 1); // (r12 - r19) * k05 + _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r12_s16), _k4567, 1); + + int32x4_t _sum4 = vmull_lane_s16(vget_low_s16(_r1_s16), _k0123, 0); // (r10 - r17) * k00 + int32x4_t _sum4n = vmull_lane_s16(vget_high_s16(_r1_s16), _k0123, 0); + + int32x4_t _sum5 = vmull_lane_s16(vget_low_s16(_r11_s16), _k0123, 1); // (r11 - r18) * k01 + int32x4_t _sum5n = vmull_lane_s16(vget_high_s16(_r11_s16), _k0123, 1); + + int32x4_t _sum6 = vmull_lane_s16(vget_low_s16(_r12_s16), _k0123, 2); // (r12 - r19) * k02 + int32x4_t _sum6n = vmull_lane_s16(vget_high_s16(_r12_s16), _k0123, 2); + + // r2 + int8x8_t _r2 = vld1_s8(r2); + int8x8_t _r2n = vld1_s8(r2+8); + int8x8_t _r21 = vext_s8(_r2, _r2n, 1); + int8x8_t _r22 = vext_s8(_r2, _r2n, 2); + int16x8_t _r2_s16 = vmovl_s8(_r2); // r20 - r27 + int16x8_t _r21_s16 = vmovl_s8(_r21); // r21 - r28 + int16x8_t _r22_s16 = vmovl_s8(_r22); // r22 - r29 + + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r2_s16), _k4567, 2); // (r20 - r27) * k06 + _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r2_s16), _k4567, 2); + + _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r21_s16), _k4567, 3); // (r21 - r28) * k07 + _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r21_s16), _k4567, 3); + + _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r22_s16), _k8xxx, 0); // (r22 - r29) * k08 + _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r22_s16), _k8xxx, 0); + + _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r2_s16), _k0123, 3); // (r20 - r27) * k03 + _sum4n = vmlal_lane_s16(_sum4n, vget_high_s16(_r2_s16), _k0123, 3); + + _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r21_s16), _k4567, 0); // (r21 - r28) * k04 + _sum5n = vmlal_lane_s16(_sum5n, vget_high_s16(_r21_s16), _k4567, 0); + + _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r22_s16), _k4567, 1); // (r22 - r29) * k05 + _sum6n = vmlal_lane_s16(_sum6n, vget_high_s16(_r22_s16), _k4567, 1); + + // load output sum0 sum0n + int32x4_t _out00 = vld1q_s32(outptr0); + int32x4_t _out01 = vld1q_s32(outptr0+4); + int32x4_t _out10 = vld1q_s32(outptr0n); + int32x4_t _out11 = vld1q_s32(outptr0n+4); + + // r3 + int8x8_t _r3 = vld1_s8(r3); + int8x8_t _r3n = vld1_s8(r3+8); + int8x8_t _r31 = vext_s8(_r3, _r3n, 1); + int8x8_t _r32 = vext_s8(_r3, _r3n, 2); + int16x8_t _r3_s16 = vmovl_s8(_r3); // r30 - r37 + int16x8_t _r31_s16 = vmovl_s8(_r31); // r31 - r38 + int16x8_t _r32_s16 = vmovl_s8(_r32); // r32 - r39 + + _sum0 = vaddq_s32(_sum0, _sum1); + _sum0n = vaddq_s32(_sum0n, _sum1n); + _sum2 = vaddq_s32(_sum2, _sum0); + _sum2n = vaddq_s32(_sum2n, _sum0n); + + _out00 = vaddq_s32(_out00, _sum2); + _out01 = vaddq_s32(_out01, _sum2n); + + vst1q_s32(outptr0, _out00); + vst1q_s32(outptr0+4, _out01); + + _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r3_s16), _k4567, 2); // (r30 - r37) * k06 + _sum4n = vmlal_lane_s16(_sum4n, vget_high_s16(_r3_s16), _k4567, 2); + + _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r31_s16), _k4567, 3); // (r31 - r38) * k07 + _sum5n = vmlal_lane_s16(_sum5n, vget_high_s16(_r31_s16), _k4567, 3); - int8x8_t _k00 = vdup_n_s8(kernel0[0]); - int8x8_t _k01 = vdup_n_s8(kernel0[1]); - int8x8_t _k02 = vdup_n_s8(kernel0[2]); - int8x8_t _k03 = vdup_n_s8(kernel0[3]); - int8x8_t _k04 = vdup_n_s8(kernel0[4]); - int8x8_t _k05 = vdup_n_s8(kernel0[5]); - int8x8_t _k06 = vdup_n_s8(kernel0[6]); - int8x8_t _k07 = vdup_n_s8(kernel0[7]); - int8x8_t _k08 = vdup_n_s8(kernel0[8]); + _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r32_s16), _k8xxx, 0); // (r32 - r39) * k08 + _sum6n = vmlal_lane_s16(_sum6n, vget_high_s16(_r32_s16), _k8xxx, 0); - for (; i+1 < outh; i+=2) - { - int nn = outw >> 3; - int remain = outw & 7; + _sum4 = vaddq_s32(_sum4, _sum5); + _sum4n = vaddq_s32(_sum4n, _sum5n); + _sum6 = vaddq_s32(_sum6, _sum4); + _sum6n = vaddq_s32(_sum6n, _sum4n); + + _out10 = vaddq_s32(_out10, _sum6); + _out11 = vaddq_s32(_out11, _sum6n); + + vst1q_s32(outptr0n, _out10); + vst1q_s32(outptr0n+4, _out11); - for (; nn > 0; nn--) + r0 += 8; + r1 += 8; + r2 += 8; + r3 += 8; + outptr0 += 8; + outptr0n += 8; + } +#endif +#if 0 //__ARM_NEON + if (remain >= 4) { + remain -= 4; + + // r0 int8x8_t _r0 = vld1_s8(r0); int8x8_t _r0n = vld1_s8(r0+8); int8x8_t _r01 = vext_s8(_r0, _r0n, 1); int8x8_t _r02 = vext_s8(_r0, _r0n, 2); + int16x8_t _r0_s16 = vmovl_s8(_r0); // r00 - r07 + int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08 + int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09 - int16x8_t _sum0 = vmull_s8(_r0, _k00); - _sum0 = vmlal_s8(_sum0, _r01, _k01); - _sum0 = vmlal_s8(_sum0, _r02, _k02); + int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r07) * k00 + int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r08) * k01 + int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r09) * k02 + // r1 int8x8_t _r1 = vld1_s8(r1); int8x8_t _r1n = vld1_s8(r1+8); int8x8_t _r11 = vext_s8(_r1, _r1n, 1); int8x8_t _r12 = vext_s8(_r1, _r1n, 2); - _sum0 = vmlal_s8(_sum0, _r1, _k03); - _sum0 = vmlal_s8(_sum0, _r11, _k04); - _sum0 = vmlal_s8(_sum0, _r12, _k05); + int16x8_t _r1_s16 = vmovl_s8(_r1); // r10 - r17 + int16x8_t _r11_s16 = vmovl_s8(_r11); // r11 - r18 + int16x8_t _r12_s16 = vmovl_s8(_r12); // r12 - r19 + + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r1_s16), _k0123, 3); // (r10 - r17) * k03 + _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r11_s16), _k4567, 0); // (r11 - r18) * k04 + _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r12_s16), _k4567, 1); // (r12 - r19) * k05 - int16x8_t _sum1 = vmull_s8(_r1, _k00); - _sum1 = vmlal_s8(_sum1, _r11, _k01); - _sum1 = vmlal_s8(_sum1, _r12, _k02); + int32x4_t _sum4 = vmull_lane_s16(vget_low_s16(_r1_s16), _k0123, 0); // (r10 - r17) * k00 + int32x4_t _sum5 = vmull_lane_s16(vget_low_s16(_r11_s16), _k0123, 1); // (r11 - r18) * k01 + int32x4_t _sum6 = vmull_lane_s16(vget_low_s16(_r12_s16), _k0123, 2); // (r12 - r19) * k02 + // r2 int8x8_t _r2 = vld1_s8(r2); int8x8_t _r2n = vld1_s8(r2+8); int8x8_t _r21 = vext_s8(_r2, _r2n, 1); int8x8_t _r22 = vext_s8(_r2, _r2n, 2); - _sum0 = vmlal_s8(_sum0, _r2, _k06); - _sum0 = vmlal_s8(_sum0, _r21, _k07); - _sum0 = vmlal_s8(_sum0, _r22, _k08); + int16x8_t _r2_s16 = vmovl_s8(_r2); // r20 - r27 + int16x8_t _r21_s16 = vmovl_s8(_r21); // r21 - r28 + int16x8_t _r22_s16 = vmovl_s8(_r22); // r22 - r29 + + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r2_s16), _k4567, 2); // (r20 - r27) * k06 + _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r21_s16), _k4567, 3); // (r21 - r28) * k07 + _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r22_s16), _k8xxx, 0); // (r22 - r29) * k08 + + _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r2_s16), _k0123, 3); // (r20 - r27) * k03 + _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r21_s16), _k4567, 0); // (r21 - r28) * k04 + _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r22_s16), _k4567, 1); // (r22 - r29) * k05 - _sum1 = vmlal_s8(_sum1, _r2, _k03); - _sum1 = vmlal_s8(_sum1, _r21, _k04); - _sum1 = vmlal_s8(_sum1, _r22, _k05); + // load output sum0 sum0n + int32x4_t _out00 = vld1q_s32(outptr0); + int32x4_t _out10 = vld1q_s32(outptr0n); + // r3 int8x8_t _r3 = vld1_s8(r3); int8x8_t _r3n = vld1_s8(r3+8); int8x8_t _r31 = vext_s8(_r3, _r3n, 1); int8x8_t _r32 = vext_s8(_r3, _r3n, 2); - _sum1 = vmlal_s8(_sum1, _r3, _k06); - _sum1 = vmlal_s8(_sum1, _r31, _k07); - _sum1 = vmlal_s8(_sum1, _r32, _k08); + int16x8_t _r3_s16 = vmovl_s8(_r3); // r30 - r37 + int16x8_t _r31_s16 = vmovl_s8(_r31); // r31 - r38 + int16x8_t _r32_s16 = vmovl_s8(_r32); // r32 - r39 - int32x4_t sum0_s32 = vld1q_s32(outptr0); - int32x4_t sum0n_s32 = vld1q_s32(outptr0+4); + _sum0 = vaddq_s32(_sum0, _sum1); + _sum2 = vaddq_s32(_sum2, _sum0); + _out00 = vaddq_s32(_out00, _sum2); - sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0)); - sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0)); + vst1q_s32(outptr0, _out00); - vst1q_s32(outptr0, sum0_s32); - vst1q_s32(outptr0+4, sum0n_s32); + _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r3_s16), _k4567, 2); // (r30 - r37) * k06 + _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r31_s16), _k4567, 3); // (r31 - r38) * k07 + _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r32_s16), _k8xxx, 0); // (r32 - r39) * k08 - int32x4_t sum1_s32 = vld1q_s32(outptr0n); - int32x4_t sum1n_s32 = vld1q_s32(outptr0n+4); + _sum4 = vaddq_s32(_sum4, _sum5); + _sum6 = vaddq_s32(_sum6, _sum4); - sum1_s32 = vaddw_s16(sum1_s32, vget_low_s16(_sum1)); - sum1n_s32 = vaddw_s16(sum1n_s32, vget_high_s16(_sum1)); + _out10 = vaddq_s32(_out10, _sum6); - vst1q_s32(outptr0n, sum1_s32); - vst1q_s32(outptr0n+4, sum1n_s32); + vst1q_s32(outptr0n, _out10); - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - outptr0 += 8; - outptr0n += 8; + r0 += 4; + r1 += 4; + r2 += 4; + r3 += 4; + outptr0 += 4; + outptr0n += 4; } - +#endif for (; remain>0; remain--) { - // Todo neon - int sum0 = 0; - int sum0n = 0; +#if 0 //__ARM_NEON + asm volatile( + "vld1.s8 {d0[0]}, [%2]! \n" + "vld1.s8 {d0[1]}, [%2]! \n" + "vld1.s8 {d0[2]}, [%2] \n" + "sub %2, #2 \n" - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - sum0n += (int)r1[0] * kernel0[0]; - sum0n += (int)r1[1] * kernel0[1]; - sum0n += (int)r1[2] * kernel0[2]; - sum0n += (int)r2[0] * kernel0[3]; - sum0n += (int)r2[1] * kernel0[4]; - sum0n += (int)r2[2] * kernel0[5]; - sum0n += (int)r3[0] * kernel0[6]; - sum0n += (int)r3[1] * kernel0[7]; - sum0n += (int)r3[2] * kernel0[8]; + "vld1.s8 {d0[3]}, [%3]! \n" + "vld1.s8 {d0[4]}, [%3]! \n" + "vld1.s8 {d0[5]}, [%3] \n" + "sub %3, #2 \n" - *outptr0 += sum0; - *outptr0n += sum0n; + "vld1.s8 {d0[6]}, [%4]! \n" + "vld1.s8 {d0[7]}, [%4]! \n"// d0(r00 r01 r02 r10 r11 r12 r20 r21) - r0++; - r1++; - r2++; - r3++; - outptr0++; - outptr0n++; - } + "vld1.s8 {d4[]}, [%4] \n"// d4(r22 r22 r22 r22 r22 r22 r22 r22) + "sub %4, #2 \n" - r0 += 2 + w; - r1 += 2 + w; - r2 += 2 + w; - r3 += 2 + w; + "vext.s8 d1, d0, d4, #3 \n"// d1(r10 r11 r12 r22 r21 r22 r22 r22) - outptr0 += outw; - outptr0n += outw; - } + "vld1.s8 {d1[6]}, [%5]! \n" + "vld1.s8 {d1[7]}, [%5]! \n"// d1(r10 r11 r12 r22 r21 r22 r30 r31) - for (; i < outh; i++) - { - int nn = outw >> 3; - int remain = outw & 7; + "vld1.s8 {d2}, [%6]! \n"// d2(k00 k01 k02 k10 k11 k12 k20 k21) - for (; nn > 0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - int8x8_t _r0n = vld1_s8(r0+8); - int8x8_t _r01 = vext_s8(_r0, _r0n, 1); - int8x8_t _r02 = vext_s8(_r0, _r0n, 2); + "vld1.s8 {d5[]}, [%5] \n"// d5(r32 r32 r32 r32 r32 r32 r32 r32) + "sub %5, #2 \n" - int16x8_t _sum0 = vmull_s8(_r0, _k00); - _sum0 = vmlal_s8(_sum0, _r01, _k01); - _sum0 = vmlal_s8(_sum0, _r02, _k02); + "veor d3, d1, d1 \n"// d3(00 00 00 00 00 00 00 00) - int8x8_t _r1 = vld1_s8(r1); - int8x8_t _r1n = vld1_s8(r1+8); - int8x8_t _r11 = vext_s8(_r1, _r1n, 1); - int8x8_t _r12 = vext_s8(_r1, _r1n, 2); - _sum0 = vmlal_s8(_sum0, _r1, _k03); - _sum0 = vmlal_s8(_sum0, _r11, _k04); - _sum0 = vmlal_s8(_sum0, _r12, _k05); + "vmull.s8 q8, d0, d2 \n"// sum0 = (r00 - r21) * (k00 - k21) + "vmull.s8 q9, d1, d2 \n"// sum1 = (r10 - r31) * (k00 - k21) - int8x8_t _r2 = vld1_s8(r2); - int8x8_t _r2n = vld1_s8(r2+8); - int8x8_t _r21 = vext_s8(_r2, _r2n, 1); - int8x8_t _r22 = vext_s8(_r2, _r2n, 2); - _sum0 = vmlal_s8(_sum0, _r2, _k06); - _sum0 = vmlal_s8(_sum0, _r21, _k07); - _sum0 = vmlal_s8(_sum0, _r22, _k08); + "vld1.s8 {d3[0]}, [%6] \n"// d3(k22 00 00 00 00 00 00 00) + "sub %6, #8 \n" - int32x4_t sum0_s32 = vld1q_s32(outptr0); - int32x4_t sum0n_s32 = vld1q_s32(outptr0+4); + "vmull.s8 q10, d4, d3 \n"// r22 * k22 + "vmull.s8 q11, d5, d3 \n"// r22 * k22 - sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0)); - sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0)); + "vld1.s32 {d6[0]}, [%0] \n" - vst1q_s32(outptr0, sum0_s32); - vst1q_s32(outptr0+4, sum0n_s32); + "vaddl.s16 q10, d16, d18 \n" + "vaddl.s16 q11, d18, d22 \n" + "vaddw.s16 q10, q10, d17 \n" + "vaddw.s16 q11, q11, d19 \n" - r0 += 8; - r1 += 8; - r2 += 8; - outptr0 += 8; - } + "vld1.s32 {d6[1]}, [%1] \n" - for (; remain>0; remain--) - { + "vpadd.s32 d20, d20, d21 \n" + "vpadd.s32 d22, d22, d23 \n" + "vpadd.s32 d20, d20, d22 \n" + "vadd.s32 d6, d6, d20 \n" + + "vst1.s32 {d6[0]}, [%0]! \n" + "vst1.s32 {d6[1]}, [%1]! \n" + + : "=r"(outptr0), // %0 + "=r"(outptr0n), // %1 + "=r"(r0), // %2 + "=r"(r1), // %3 + "=r"(r2), // %4 + "=r"(r3), // %5 + "=r"(ktmp) // %6 + : "0"(outptr0), + "1"(outptr0n), + "2"(r0), + "3"(r1), + "4"(r2), + "5"(r3), + "6"(ktmp) + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11" + ); +#else int sum0 = 0; + int sum0n = 0; + + sum0 += r0[0] * ktmp[0]; + sum0 += r0[1] * ktmp[1]; + sum0 += r0[2] * ktmp[2]; + sum0 += r1[0] * ktmp[3]; + sum0 += r1[1] * ktmp[4]; + sum0 += r1[2] * ktmp[5]; + sum0 += r2[0] * ktmp[6]; + sum0 += r2[1] * ktmp[7]; + sum0 += r2[2] * ktmp[8]; - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; + sum0n += r1[0] * ktmp[0]; + sum0n += r1[1] * ktmp[1]; + sum0n += r1[2] * ktmp[2]; + sum0n += r2[0] * ktmp[3]; + sum0n += r2[1] * ktmp[4]; + sum0n += r2[2] * ktmp[5]; + sum0n += r3[0] * ktmp[6]; + sum0n += r3[1] * ktmp[7]; + sum0n += r3[2] * ktmp[8]; *outptr0 += sum0; + *outptr0n += sum0n; + outptr0++; + outptr0n++; +#endif r0++; r1++; r2++; - outptr0++; - } - - r0 += 2; - r1 += 2; - r2 += 2; - } - kernel0 += 9; - } - } -} - -static void conv3x3s2_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) -{ - int w = bottom_blob.w; - int inch = bottom_blob.c; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - const int tailstep = w - 2 * outw + w; - - const signed char* kernel = _kernel; - - int nn_outch = outch >> 2; - int remain_outch_start = nn_outch << 2; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp < nn_outch; pp++) - { - int p = pp * 4; - - Mat out0 = top_blob.channel(p); - Mat out1 = top_blob.channel(p + 1); - Mat out2 = top_blob.channel(p + 2); - Mat out3 = top_blob.channel(p + 3); - - out0.fill(0.f); - out1.fill(0.f); - out2.fill(0.f); - out3.fill(0.f); - - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9; - const signed char* kernel1 = (const signed char*)kernel + (p + 1) * inch * 9; - const signed char* kernel2 = (const signed char*)kernel + (p + 2) * inch * 9; - const signed char* kernel3 = (const signed char*)kernel + (p + 3) * inch * 9; - - for (int q=0; q> 3; int remain = outw & 7; +#else + int remain = outw; +#endif // __ARM_NEON - if (nn > 0) - { - asm volatile( - "0: \n" - // r0 - "prfm pldl1keep, [%5, #128] \n" - "ld2 {v4.8b, v5.8b}, [%5], #16 \n" - "ld2 {v6.8b, v7.8b}, [%5] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[0] \n" - "dup v10.8b, %17.b[0] \n" - "dup v11.8b, %18.b[0] \n" - "dup v12.8b, %19.b[0] \n" - - "smull v13.8h, v4.8b, v9.8b \n" - "smull v14.8h, v4.8b, v10.8b \n" - "smull v15.8h, v4.8b, v11.8b \n" - "smull v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[1] \n" - "dup v10.8b, %17.b[1] \n" - "dup v11.8b, %18.b[1] \n" - "dup v12.8b, %19.b[1] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[2] \n" - "dup v10.8b, %17.b[2] \n" - "dup v11.8b, %18.b[2] \n" - "dup v12.8b, %19.b[2] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // r1 - "prfm pldl1keep, [%6, #128] \n" - "ld2 {v4.8b, v5.8b}, [%6], #16 \n" - "ld2 {v6.8b, v7.8b}, [%6] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[3] \n" - "dup v10.8b, %17.b[3] \n" - "dup v11.8b, %18.b[3] \n" - "dup v12.8b, %19.b[3] \n" - - "smlal v13.8h, v4.8b, v9.8b \n" - "smlal v14.8h, v4.8b, v10.8b \n" - "smlal v15.8h, v4.8b, v11.8b \n" - "smlal v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[4] \n" - "dup v10.8b, %17.b[4] \n" - "dup v11.8b, %18.b[4] \n" - "dup v12.8b, %19.b[4] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[5] \n" - "dup v10.8b, %17.b[5] \n" - "dup v11.8b, %18.b[5] \n" - "dup v12.8b, %19.b[5] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // r2 - "prfm pldl1keep, [%7, #128] \n" - "ld2 {v4.8b, v5.8b}, [%7], #16 \n" - "ld2 {v6.8b, v7.8b}, [%7] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[6] \n" - "dup v10.8b, %17.b[6] \n" - "dup v11.8b, %18.b[6] \n" - "dup v12.8b, %19.b[6] \n" - - "smlal v13.8h, v4.8b, v9.8b \n" - "smlal v14.8h, v4.8b, v10.8b \n" - "smlal v15.8h, v4.8b, v11.8b \n" - "smlal v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[7] \n" - "dup v10.8b, %17.b[7] \n" - "dup v11.8b, %18.b[7] \n" - "dup v12.8b, %19.b[7] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[8] \n" - "dup v10.8b, %17.b[8] \n" - "dup v11.8b, %18.b[8] \n" - "dup v12.8b, %19.b[8] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // sum0 - sum3 - "prfm pldl1keep, [%1, #128] \n" - "prfm pldl1keep, [%2, #128] \n" - "prfm pldl1keep, [%3, #128] \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v17.4s, v18.4s}, [%1] \n" - "ld1 {v19.4s, v20.4s}, [%2] \n" - "ld1 {v21.4s, v22.4s}, [%3] \n" - "ld1 {v23.4s, v24.4s}, [%4] \n" - - "saddw v17.4s, v17.4s, v13.4h \n" - "saddw2 v18.4s, v18.4s, v13.8h \n" - "saddw v19.4s, v19.4s, v14.4h \n" - "saddw2 v20.4s, v20.4s, v14.8h \n" - "saddw v21.4s, v21.4s, v15.4h \n" - "saddw2 v22.4s, v22.4s, v15.8h \n" - "saddw v23.4s, v23.4s, v16.4h \n" - "saddw2 v24.4s, v24.4s, v16.8h \n" - "st1 {v17.4s, v18.4s}, [%1], #32\n" - "st1 {v19.4s, v20.4s}, [%2], #32\n" - "st1 {v21.4s, v22.4s}, [%3], #32\n" - "st1 {v23.4s, v24.4s}, [%4], #32\n" - "subs %w0, %w0, #1 \n" - "bne 0b \n" - : "=r"(nn), //%0 - "=r"(outptr0), //%1 - "=r"(outptr1), //%2 - "=r"(outptr2), //%3 - "=r"(outptr3), //%4 - "=r"(r0), //%5 - "=r"(r1), //%6 - "=r"(r2) //%7 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(r0), - "6"(r1), - "7"(r2), - "w"(_k0), //%16 - "w"(_k1), //%17 - "w"(_k2), //%18 - "w"(_k3) //%19 - : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" - ); - } - - if (remain >= 4) +#if 0 //__ARM_NEON + for (; nn >0; nn--) { - remain -= 4; - - asm volatile( // r0 - "prfm pldl1keep, [%5, #128] \n" - "ld2 {v4.8b, v5.8b}, [%5], #16 \n" - "ld2 {v6.8b, v7.8b}, [%5] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[0] \n" - "dup v10.8b, %17.b[0] \n" - "dup v11.8b, %18.b[0] \n" - "dup v12.8b, %19.b[0] \n" - - "smull v13.8h, v4.8b, v9.8b \n" - "smull v14.8h, v4.8b, v10.8b \n" - "smull v15.8h, v4.8b, v11.8b \n" - "smull v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[1] \n" - "dup v10.8b, %17.b[1] \n" - "dup v11.8b, %18.b[1] \n" - "dup v12.8b, %19.b[1] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[2] \n" - "dup v10.8b, %17.b[2] \n" - "dup v11.8b, %18.b[2] \n" - "dup v12.8b, %19.b[2] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // r1 - "prfm pldl1keep, [%6, #128] \n" - "ld2 {v4.8b, v5.8b}, [%6], #16 \n" - "ld2 {v6.8b, v7.8b}, [%6] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[3] \n" - "dup v10.8b, %17.b[3] \n" - "dup v11.8b, %18.b[3] \n" - "dup v12.8b, %19.b[3] \n" - - "smlal v13.8h, v4.8b, v9.8b \n" - "smlal v14.8h, v4.8b, v10.8b \n" - "smlal v15.8h, v4.8b, v11.8b \n" - "smlal v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[4] \n" - "dup v10.8b, %17.b[4] \n" - "dup v11.8b, %18.b[4] \n" - "dup v12.8b, %19.b[4] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[5] \n" - "dup v10.8b, %17.b[5] \n" - "dup v11.8b, %18.b[5] \n" - "dup v12.8b, %19.b[5] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // r2 - "prfm pldl1keep, [%7, #128] \n" - "ld2 {v4.8b, v5.8b}, [%7], #16 \n" - "ld2 {v6.8b, v7.8b}, [%7] \n" - "ext v8.8b, v4.8b, v6.8b, #1 \n" - - "dup v9.8b, %16.b[6] \n" - "dup v10.8b, %17.b[6] \n" - "dup v11.8b, %18.b[6] \n" - "dup v12.8b, %19.b[6] \n" - - "smlal v13.8h, v4.8b, v9.8b \n" - "smlal v14.8h, v4.8b, v10.8b \n" - "smlal v15.8h, v4.8b, v11.8b \n" - "smlal v16.8h, v4.8b, v12.8b \n" - - "dup v9.8b, %16.b[7] \n" - "dup v10.8b, %17.b[7] \n" - "dup v11.8b, %18.b[7] \n" - "dup v12.8b, %19.b[7] \n" - - "smlal v13.8h, v5.8b, v9.8b \n" - "smlal v14.8h, v5.8b, v10.8b \n" - "smlal v15.8h, v5.8b, v11.8b \n" - "smlal v16.8h, v5.8b, v12.8b \n" - - "dup v9.8b, %16.b[8] \n" - "dup v10.8b, %17.b[8] \n" - "dup v11.8b, %18.b[8] \n" - "dup v12.8b, %19.b[8] \n" - - "smlal v13.8h, v8.8b, v9.8b \n" - "smlal v14.8h, v8.8b, v10.8b \n" - "smlal v15.8h, v8.8b, v11.8b \n" - "smlal v16.8h, v8.8b, v12.8b \n" - // sum0 - sum3 - "prfm pldl1keep, [%1, #128] \n" - "prfm pldl1keep, [%2, #128] \n" - "prfm pldl1keep, [%3, #128] \n" - "prfm pldl1keep, [%4, #128] \n" - "ld1 {v17.4s}, [%1] \n" - "ld1 {v19.4s}, [%2] \n" - "ld1 {v21.4s}, [%3] \n" - "ld1 {v23.4s}, [%4] \n" - - "saddw v17.4s, v17.4s, v13.4h \n" - "saddw v19.4s, v19.4s, v14.4h \n" - "saddw v21.4s, v21.4s, v15.4h \n" - "saddw v23.4s, v23.4s, v16.4h \n" - - "st1 {v17.4s}, [%1], #16 \n" - "st1 {v19.4s}, [%2], #16 \n" - "st1 {v21.4s}, [%3], #16 \n" - "st1 {v23.4s}, [%4], #16 \n" - "sub %5, %5, #8 \n" - "sub %6, %6, #8 \n" - "sub %7, %7, #8 \n" - : "=r"(nn), //%0 - "=r"(outptr0), //%1 - "=r"(outptr1), //%2 - "=r"(outptr2), //%3 - "=r"(outptr3), //%4 - "=r"(r0), //%5 - "=r"(r1), //%6 - "=r"(r2) //%7 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(outptr2), - "4"(outptr3), - "5"(r0), - "6"(r1), - "7"(r2), - "w"(_k0), //%16 - "w"(_k1), //%17 - "w"(_k2), //%18 - "w"(_k3) //%19 - : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24" - ); - } - - for (; remain>0; remain--) - { - int sum0 = 0; - int sum1 = 0; - int sum2 = 0; - int sum3 = 0; - - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - sum1 += (int)r0[0] * kernel1[0]; - sum1 += (int)r0[1] * kernel1[1]; - sum1 += (int)r0[2] * kernel1[2]; - sum1 += (int)r1[0] * kernel1[3]; - sum1 += (int)r1[1] * kernel1[4]; - sum1 += (int)r1[2] * kernel1[5]; - sum1 += (int)r2[0] * kernel1[6]; - sum1 += (int)r2[1] * kernel1[7]; - sum1 += (int)r2[2] * kernel1[8]; - - sum2 += (int)r0[0] * kernel2[0]; - sum2 += (int)r0[1] * kernel2[1]; - sum2 += (int)r0[2] * kernel2[2]; - sum2 += (int)r1[0] * kernel2[3]; - sum2 += (int)r1[1] * kernel2[4]; - sum2 += (int)r1[2] * kernel2[5]; - sum2 += (int)r2[0] * kernel2[6]; - sum2 += (int)r2[1] * kernel2[7]; - sum2 += (int)r2[2] * kernel2[8]; - - sum3 += (int)r0[0] * kernel3[0]; - sum3 += (int)r0[1] * kernel3[1]; - sum3 += (int)r0[2] * kernel3[2]; - sum3 += (int)r1[0] * kernel3[3]; - sum3 += (int)r1[1] * kernel3[4]; - sum3 += (int)r1[2] * kernel3[5]; - sum3 += (int)r2[0] * kernel3[6]; - sum3 += (int)r2[1] * kernel3[7]; - sum3 += (int)r2[2] * kernel3[8]; - - *outptr0 += sum0; - *outptr1 += sum1; - *outptr2 += sum2; - *outptr3 += sum3; + int8x8_t _r0 = vld1_s8(r0); + int8x8_t _r0n = vld1_s8(r0+8); + int8x8_t _r01 = vext_s8(_r0, _r0n, 1); + int8x8_t _r02 = vext_s8(_r0, _r0n, 2); + int16x8_t _r0_s16 = vmovl_s8(_r0); // r00 - r07 + int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08 + int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09 - r0 += 2; - r1 += 2; - r2 += 2; - outptr0++; - outptr1++; - outptr2++; - outptr3++; - } + int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r07) * k00 + int32x4_t _sum0n = vmull_lane_s16(vget_high_s16(_r0_s16), _k0123, 0); - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; - } + int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r08) * k01 + int32x4_t _sum1n = vmull_lane_s16(vget_high_s16(_r01_s16), _k0123, 1); - kernel0 += 9; - kernel1 += 9; - kernel2 += 9; - kernel3 += 9; - } - } + int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r09) * k02 + int32x4_t _sum2n = vmull_lane_s16(vget_high_s16(_r02_s16), _k0123, 2); - #pragma omp parallel for num_threads(opt.num_threads) - for (int p=remain_outch_start; p> 3; - int remain = outw & 7; -#else - int remain = outw; -#endif // __ARM_NEON + // load output sum0 sum0n + int32x4_t _out00 = vld1q_s32(outptr0); + int32x4_t _out01 = vld1q_s32(outptr0+4); -#if __ARM_NEON - for (; nn >0; nn--) - { - int8x8x2_t _r0 = vld2_s8(r0); - int8x8x2_t _r0n = vld2_s8(r0+16); - int8x8_t _r00 = _r0.val[0]; - int8x8_t _r01 = _r0.val[1]; - int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1); - - int16x8_t _sum = vmull_s8(_r00, _k0); - _sum = vmlal_s8(_sum, _r01, _k1); - _sum = vmlal_s8(_sum, _r02, _k2); - - int8x8x2_t _r1 = vld2_s8(r1); - int8x8x2_t _r1n = vld2_s8(r1+16); - int8x8_t _r10 = _r1.val[0]; - int8x8_t _r11 = _r1.val[1]; - int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1); - _sum = vmlal_s8(_sum, _r10, _k3); - _sum = vmlal_s8(_sum, _r11, _k4); - _sum = vmlal_s8(_sum, _r12, _k5); - - int8x8x2_t _r2 = vld2_s8(r2); - int8x8x2_t _r2n = vld2_s8(r2+16); - int8x8_t _r20 = _r2.val[0]; - int8x8_t _r21 = _r2.val[1]; - int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1); - _sum = vmlal_s8(_sum, _r20, _k6); - _sum = vmlal_s8(_sum, _r21, _k7); - _sum = vmlal_s8(_sum, _r22, _k8); - - int32x4_t sum0_s32 = vld1q_s32(outptr0); - int32x4_t sum0n_s32 = vld1q_s32(outptr0+4); - - sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum)); - sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum)); - - vst1q_s32(outptr0, sum0_s32); - vst1q_s32(outptr0+4, sum0n_s32); + _sum0 = vaddq_s32(_sum0, _sum1); + _sum0n = vaddq_s32(_sum0n, _sum1n); + _sum2 = vaddq_s32(_sum2, _sum0); + _sum2n = vaddq_s32(_sum2n, _sum0n); - r0 += 16; - r1 += 16; - r2 += 16; - outptr0 += 8; - } -#endif -#if __ARM_NEON - if (remain >= 4) - { - remain -= 4; + _out00 = vaddq_s32(_out00, _sum2); + _out01 = vaddq_s32(_out01, _sum2n); - int8x8x2_t _r0 = vld2_s8(r0); - int8x8x2_t _r0n = vld2_s8(r0+16); - int8x8_t _r00 = _r0.val[0]; - int8x8_t _r01 = _r0.val[1]; - int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1); - - int16x8_t _sum = vmull_s8(_r00, _k0); - _sum = vmlal_s8(_sum, _r01, _k1); - _sum = vmlal_s8(_sum, _r02, _k2); - - int8x8x2_t _r1 = vld2_s8(r1); - int8x8x2_t _r1n = vld2_s8(r1+16); - int8x8_t _r10 = _r1.val[0]; - int8x8_t _r11 = _r1.val[1]; - int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1); - _sum = vmlal_s8(_sum, _r10, _k3); - _sum = vmlal_s8(_sum, _r11, _k4); - _sum = vmlal_s8(_sum, _r12, _k5); - - int8x8x2_t _r2 = vld2_s8(r2); - int8x8x2_t _r2n = vld2_s8(r2+16); - int8x8_t _r20 = _r2.val[0]; - int8x8_t _r21 = _r2.val[1]; - int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1); - _sum = vmlal_s8(_sum, _r20, _k6); - _sum = vmlal_s8(_sum, _r21, _k7); - _sum = vmlal_s8(_sum, _r22, _k8); - - int32x4_t sum0_s32 = vld1q_s32(outptr0); - sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum)); - vst1q_s32(outptr0, sum0_s32); + vst1q_s32(outptr0, _out00); + vst1q_s32(outptr0+4, _out01); r0 += 8; r1 += 8; r2 += 8; - outptr0 += 4; - } + r3 += 8; + outptr0 += 8; + outptr0n += 8; + } #endif for (; remain>0; remain--) { int sum0 = 0; - - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - + + sum0 += r0[0] * ktmp[0]; + sum0 += r0[1] * ktmp[1]; + sum0 += r0[2] * ktmp[2]; + sum0 += r1[0] * ktmp[3]; + sum0 += r1[1] * ktmp[4]; + sum0 += r1[2] * ktmp[5]; + sum0 += r2[0] * ktmp[6]; + sum0 += r2[1] * ktmp[7]; + sum0 += r2[2] * ktmp[8]; + *outptr0 += sum0; - r0 += 2; - r1 += 2; - r2 += 2; + r0++; + r1++; + r2++; outptr0++; } - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; + r0 += 2; + r1 += 2; + r2 += 2; } - kernel0 += 9; - } - } + ktmp += 9; + } + } } -#else // __aarch64__ -static void conv3x3s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) + +static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) { int w = bottom_blob.w; int inch = bottom_blob.c; @@ -1356,1207 +1748,1225 @@ static void conv3x3s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat int outh = top_blob.h; int outch = top_blob.c; - const signed char* kernel = _kernel; + const int tailstep = w - 2*outw + w; - int nn_outch = outch >> 1; - int remain_outch_start = nn_outch << 1; + int nn_outch = outch >> 3; + int remain_outch_start = nn_outch << 3; #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp < nn_outch; pp++) + for (int pp=0; pp> 3; - int remain = outw & 7; +#if __ARM_NEON + int nn = outw >> 2; + int remain = outw & 3; +#else + int remain = outw; +#endif // __ARM_NEON - if (nn > 0) - { - asm volatile( - "vld1.8 {d26-d27}, [%0] \n" - "vld1.8 {d28-d29}, [%1] \n" - : "=r"(kernel0), // %0 - "=r"(kernel1) // %1 - : "0"(kernel0), - "1"(kernel1) - : "cc", "memory" - ); +#if __ARM_NEON + for (; nn>0; nn--) + { + // load output ch 0-7 + int32x4_t _sum0 = vld1q_s32(outptr0);// out0 + int32x4_t _sum1 = vld1q_s32(outptr1);// out1 + int32x4_t _sum2 = vld1q_s32(outptr2);// out2 + int32x4_t _sum3 = vld1q_s32(outptr3);// out3 + int32x4_t _sum4 = vld1q_s32(outptr4);// out4 + int32x4_t _sum5 = vld1q_s32(outptr5);// out5 + int32x4_t _sum6 = vld1q_s32(outptr6);// out6 + int32x4_t _sum7 = vld1q_s32(outptr7);// out7 - asm volatile( - "0: \n" - "pld [%5, #128] \n" - "vld1.32 {d0-d1}, [%5] \n"// r0 - "add %5, #8 \n" - "vext.8 d2, d0, d1, #1 \n" - "vext.8 d3, d0, d1, #2 \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0 - "vmlal.s8 q2, d2, d30 \n"// k1 - "vmlal.s8 q2, d3, d31 \n"// k2 - - "pld [%6, #128] \n" - "vld1.32 {d6-d7}, [%6] \n"// r1 - "add %6, #8 \n" - "vext.8 d8, d6, d7, #1 \n" - "vext.8 d9, d6, d7, #2 \n" - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3 - "vmlal.s8 q2, d8, d30 \n"// k4 - "vmlal.s8 q2, d9, d31 \n"// k5 - - "pld [%7, #128] \n" - "vld1.32 {d10-d11}, [%7] \n"// r2 - "add %7, #8 \n" - "vext.8 d12, d10, d11, #1 \n" - "vext.8 d13, d10, d11, #2 \n" - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6 - "vmlal.s8 q2, d12, d30 \n"// k7 - "vmlal.s8 q2, d13, d31 \n"// k8 - - "pld [%8, #128] \n" - "vld1.32 {d14-d15}, [%8] \n"// r3 - "add %8, #8 \n" - "vext.8 d16, d14, d15, #1 \n" - "vext.8 d17, d14, d15, #2 \n" - - "pld [%1, #128] \n" - "vld1.32 {d18-d21}, [%1] \n"// sum0 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%1]! \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d6, d1 \n"// k0 - "vmlal.s8 q2, d8, d30 \n"// k1 - "vmlal.s8 q2, d9, d31 \n"// k2 - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d10, d1 \n"// k3 - "vmlal.s8 q2, d12, d30 \n"// k4 - "vmlal.s8 q2, d13, d31 \n"// k5 - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d14, d1 \n"// k6 - "vmlal.s8 q2, d16, d30 \n"// k7 - "vmlal.s8 q2, d17, d31 \n"// k8 - - "pld [%2, #128] \n" - "vld1.32 {d18-d21}, [%2] \n"// sum0n - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%2]! \n" - - "vdup.s8 d1, d28[0] \n" - "vdup.s8 d30, d28[1] \n" - "vdup.s8 d31, d28[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0n - "vmlal.s8 q2, d2, d30 \n"// k1n - "vmlal.s8 q2, d3, d31 \n"// k2n - - "vdup.s8 d1, d28[3] \n" - "vdup.s8 d30, d28[4] \n" - "vdup.s8 d31, d28[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3n - "vmlal.s8 q2, d8, d30 \n"// k4n - "vmlal.s8 q2, d9, d31 \n"// k5n - - "vdup.s8 d1, d28[6] \n" - "vdup.s8 d30, d28[7] \n" - "vdup.s8 d31, d29[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6n - "vmlal.s8 q2, d12, d30 \n"// k7n - "vmlal.s8 q2, d13, d31 \n"// k8n - - "pld [%3, #128] \n" - "vld1.32 {d18-d21}, [%3] \n"// sum1 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%3]! \n" - - "vdup.s8 d1, d28[0] \n" - "vdup.s8 d30, d28[1] \n" - "vdup.s8 d31, d28[2] \n" - "vmull.s8 q2, d6, d1 \n"// k0n - "vmlal.s8 q2, d8, d30 \n"// k1n - "vmlal.s8 q2, d9, d31 \n"// k2n - - "vdup.s8 d1, d28[3] \n" - "vdup.s8 d30, d28[4] \n" - "vdup.s8 d31, d28[5] \n" - "vmlal.s8 q2, d10, d1 \n"// k3n - "vmlal.s8 q2, d12, d30 \n"// k4n - "vmlal.s8 q2, d13, d31 \n"// k5n - - "vdup.s8 d1, d28[6] \n" - "vdup.s8 d30, d28[7] \n" - "vdup.s8 d31, d29[0] \n" - "vmlal.s8 q2, d14, d1 \n"// k6n - "vmlal.s8 q2, d16, d30 \n"// k7n - "vmlal.s8 q2, d17, d31 \n"// k8n - - "pld [%4, #128] \n" - "vld1.32 {d18-d21}, [%4] \n"// sum1n - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%4]! \n" - - "subs %0, #1 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(outptr0n), // %2 - "=r"(outptr1), // %3 - "=r"(outptr1n), // %4 - "=r"(r0), // %5 - "=r"(r1), // %6 - "=r"(r2), // %7 - "=r"(r3) // %8 - : "0"(nn), - "1"(outptr0), - "2"(outptr0n), - "3"(outptr1), - "4"(outptr1n), - "5"(r0), - "6"(r1), - "7"(r2), - "8"(r3) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q15" - ); - } + // r0 + int8x8x2_t _r0_s8 = vld2_s8(r0); + int8x8_t _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1); + // k0 - k2 + int8x8_t _k0_8 = vld1_s8(ktmp); //(k00-k70) + int8x8_t _k1_8 = vld1_s8(ktmp+8); //(k01-k71) + int8x8_t _k2_8 = vld1_s8(ktmp+16); //(k02-k72) + + int16x8_t _r0 = vmovl_s8(_r0_s8.val[0]); + int16x8_t _r1 = vmovl_s8(_r0_s8.val[1]); + int16x8_t _r2 = vmovl_s8(_r2_s8); + + int16x8_t _k0 = vmovl_s8(_k0_8); + int16x8_t _k1 = vmovl_s8(_k1_8); + int16x8_t _k2 = vmovl_s8(_k2_8); + // dot row 1 k0 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7); + // dot row 1 k1 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7); + // dot row 1 k2 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7); + + // r1 + _r0_s8 = vld2_s8(r1); + _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1); + // k3 - k5 + _k0_8 = vld1_s8(ktmp+24); //(k03-k73) + _k1_8 = vld1_s8(ktmp+32); //(k04-k74) + _k2_8 = vld1_s8(ktmp+40); //(k05-k75) + + _r0 = vmovl_s8(_r0_s8.val[0]); + _r1 = vmovl_s8(_r0_s8.val[1]); + _r2 = vmovl_s8(_r2_s8); + + _k0 = vmovl_s8(_k0_8); + _k1 = vmovl_s8(_k1_8); + _k2 = vmovl_s8(_k2_8); + // dot row 2 k3 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7); + // dot row 2 k4 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7); + // dot row 2 k5 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7); + + // r2 + _r0_s8 = vld2_s8(r2); + _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1); + // k6 - k8 + _k0_8 = vld1_s8(ktmp+48); //(k06-k76) + _k1_8 = vld1_s8(ktmp+56); //(k07-k77) + _k2_8 = vld1_s8(ktmp+64); //(k08-k78) + + _r0 = vmovl_s8(_r0_s8.val[0]); + _r1 = vmovl_s8(_r0_s8.val[1]); + _r2 = vmovl_s8(_r2_s8); + + _k0 = vmovl_s8(_k0_8); + _k1 = vmovl_s8(_k1_8); + _k2 = vmovl_s8(_k2_8); + // dot row 2 k6 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7); + // dot row 2 k7 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7); + // dot row 2 k8 + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1); + _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2); + _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3); + _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4); + _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5); + _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6); + _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7); + + // save s32 to memory + vst1q_s32(outptr0, _sum0); + vst1q_s32(outptr1, _sum1); + vst1q_s32(outptr2, _sum2); + vst1q_s32(outptr3, _sum3); + vst1q_s32(outptr4, _sum4); + vst1q_s32(outptr5, _sum5); + vst1q_s32(outptr6, _sum6); + vst1q_s32(outptr7, _sum7); + + r0 += 8; + r1 += 8; + r2 += 8; + outptr0 += 4; + outptr1 += 4; + outptr2 += 4; + outptr3 += 4; + outptr4 += 4; + outptr5 += 4; + outptr6 += 4; + outptr7 += 4; + } + +#endif // __ARM_NEON for (; remain>0; remain--) { - int sum0 = 0; - int sum0n = 0; - int sum1 = 0; - int sum1n = 0; - - //ToDo Neon - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - sum1 += (int)r0[0] * kernel1[0]; - sum1 += (int)r0[1] * kernel1[1]; - sum1 += (int)r0[2] * kernel1[2]; - sum1 += (int)r1[0] * kernel1[3]; - sum1 += (int)r1[1] * kernel1[4]; - sum1 += (int)r1[2] * kernel1[5]; - sum1 += (int)r2[0] * kernel1[6]; - sum1 += (int)r2[1] * kernel1[7]; - sum1 += (int)r2[2] * kernel1[8]; - - sum0n += (int)r1[0] * kernel0[0]; - sum0n += (int)r1[1] * kernel0[1]; - sum0n += (int)r1[2] * kernel0[2]; - sum0n += (int)r2[0] * kernel0[3]; - sum0n += (int)r2[1] * kernel0[4]; - sum0n += (int)r2[2] * kernel0[5]; - sum0n += (int)r3[0] * kernel0[6]; - sum0n += (int)r3[1] * kernel0[7]; - sum0n += (int)r3[2] * kernel0[8]; - - sum1n += (int)r1[0] * kernel1[0]; - sum1n += (int)r1[1] * kernel1[1]; - sum1n += (int)r1[2] * kernel1[2]; - sum1n += (int)r2[0] * kernel1[3]; - sum1n += (int)r2[1] * kernel1[4]; - sum1n += (int)r2[2] * kernel1[5]; - sum1n += (int)r3[0] * kernel1[6]; - sum1n += (int)r3[1] * kernel1[7]; - sum1n += (int)r3[2] * kernel1[8]; +#if __ARM_NEON + int8x8_t _r0_s8 = vld1_s8(r0);// (a00 a01 a02 ....) + int8x8_t _r1_s8 = vld1_s8(r1);// (a10 a11 a12 ....) + int8x8_t _r2_s8 = vld1_s8(r2);// (a20 a21 a22 ....) + + int16x8_t _r0 = vmovl_s8(_r0_s8); + int16x8_t _r1 = vmovl_s8(_r1_s8); + int16x8_t _r2 = vmovl_s8(_r2_s8); + + int32x4_t _sum03, _sum47; + _sum03 = vld1q_lane_s32(outptr0, _sum03, 0);// out0 + _sum03 = vld1q_lane_s32(outptr1, _sum03, 1);// out1 + _sum03 = vld1q_lane_s32(outptr2, _sum03, 2);// out2 + _sum03 = vld1q_lane_s32(outptr3, _sum03, 3);// out3 + _sum47 = vld1q_lane_s32(outptr4, _sum47, 0);// out4 + _sum47 = vld1q_lane_s32(outptr5, _sum47, 1);// out5 + _sum47 = vld1q_lane_s32(outptr6, _sum47, 2);// out6 + _sum47 = vld1q_lane_s32(outptr7, _sum47, 3);// out7 + + // k0 - k2 + int8x8_t _k0_8 = vld1_s8(ktmp); //(k00-k70) + int8x8_t _k1_8 = vld1_s8(ktmp+8); //(k01-k71) + int8x8_t _k2_8 = vld1_s8(ktmp+16); //(k02-k72) + + int16x8_t _k0 = vmovl_s8(_k0_8); + int16x8_t _k1 = vmovl_s8(_k1_8); + int16x8_t _k2 = vmovl_s8(_k2_8); + + int32x4_t _sum0 = vmull_laneq_s16(vget_low_s16(_k0), _r0, 0); + int32x4_t _sum0n = vmull_laneq_s16(vget_high_s16(_k0), _r0, 0); + int32x4_t _sum1 = vmull_laneq_s16(vget_low_s16(_k1), _r0, 1); + int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1); + _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r0, 2); + _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r0, 2); + + // k3 - k5 + _k0_8 = vld1_s8(ktmp+24); //(k03-k73) + _k1_8 = vld1_s8(ktmp+32); //(k04-k74) + _k2_8 = vld1_s8(ktmp+40); //(k05-k75) + + _k0 = vmovl_s8(_k0_8); + _k1 = vmovl_s8(_k1_8); + _k2 = vmovl_s8(_k2_8); + + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r1, 0); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r1, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r1, 1); + _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1); + _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r1, 2); + _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r1, 2); + + // k6 - k8 + _k0_8 = vld1_s8(ktmp+48); //(k06-k76) + _k1_8 = vld1_s8(ktmp+56); //(k07-k77) + _k2_8 = vld1_s8(ktmp+64); //(k08-k78) + + _k0 = vmovl_s8(_k0_8); + _k1 = vmovl_s8(_k1_8); + _k2 = vmovl_s8(_k2_8); + + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r2, 0); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r2, 0); + _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r2, 1); + _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1); + _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r2, 2); + _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r2, 2); - *outptr0 += sum0; - *outptr1 += sum1; - *outptr0n += sum0n; - *outptr1n += sum1n; + _sum0 = vaddq_s32(_sum0, _sum1); + _sum0n = vaddq_s32(_sum0n, _sum1n); + _sum03 = vaddq_s32(_sum03, _sum0); + _sum47 = vaddq_s32(_sum47, _sum0n); + + vst1q_lane_s32(outptr0, _sum03, 0); + vst1q_lane_s32(outptr1, _sum03, 1); + vst1q_lane_s32(outptr2, _sum03, 2); + vst1q_lane_s32(outptr3, _sum03, 3); + vst1q_lane_s32(outptr4, _sum47, 0); + vst1q_lane_s32(outptr5, _sum47, 1); + vst1q_lane_s32(outptr6, _sum47, 2); + vst1q_lane_s32(outptr7, _sum47, 3); - r0++; - r1++; - r2++; - r3++; outptr0++; outptr1++; - outptr0n++; - outptr1n++; - } + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; +#else // __ARM_NEON + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + int sum4 = 0; + int sum5 = 0; + int sum6 = 0; + int sum7 = 0; - r0 += 2 + w; - r1 += 2 + w; - r2 += 2 + w; - r3 += 2 + w; + sum0 += (int)r0[0] * ktmp[0]; + sum1 += (int)r0[0] * ktmp[1]; + sum2 += (int)r0[0] * ktmp[2]; + sum3 += (int)r0[0] * ktmp[3]; + sum4 += (int)r0[0] * ktmp[4]; + sum5 += (int)r0[0] * ktmp[5]; + sum6 += (int)r0[0] * ktmp[6]; + sum7 += (int)r0[0] * ktmp[7]; + ktmp += 8; - outptr0 += outw; - outptr1 += outw; - outptr0n += outw; - outptr1n += outw; - } + sum0 += (int)r0[1] * ktmp[0]; + sum1 += (int)r0[1] * ktmp[1]; + sum2 += (int)r0[1] * ktmp[2]; + sum3 += (int)r0[1] * ktmp[3]; + sum4 += (int)r0[1] * ktmp[4]; + sum5 += (int)r0[1] * ktmp[5]; + sum6 += (int)r0[1] * ktmp[6]; + sum7 += (int)r0[1] * ktmp[7]; + ktmp += 8; - for (; i < outh; i++) - { - int nn = outw >> 3; - int remain = outw & 7; + sum0 += (int)r0[2] * ktmp[0]; + sum1 += (int)r0[2] * ktmp[1]; + sum2 += (int)r0[2] * ktmp[2]; + sum3 += (int)r0[2] * ktmp[3]; + sum4 += (int)r0[2] * ktmp[4]; + sum5 += (int)r0[2] * ktmp[5]; + sum6 += (int)r0[2] * ktmp[6]; + sum7 += (int)r0[2] * ktmp[7]; + ktmp += 8; - if (nn > 0) - { - asm volatile( - "vld1.8 {d26-d27}, [%0] \n" - "vld1.8 {d28-d29}, [%1] \n" - : "=r"(kernel0), // %0 - "=r"(kernel1) // %1 - : "0"(kernel0), - "1"(kernel1) - : "cc", "memory" - ); + sum0 += (int)r1[0] * ktmp[0]; + sum1 += (int)r1[0] * ktmp[1]; + sum2 += (int)r1[0] * ktmp[2]; + sum3 += (int)r1[0] * ktmp[3]; + sum4 += (int)r1[0] * ktmp[4]; + sum5 += (int)r1[0] * ktmp[5]; + sum6 += (int)r1[0] * ktmp[6]; + sum7 += (int)r1[0] * ktmp[7]; + ktmp += 8; - asm volatile( - "0: \n" - "pld [%3, #128] \n" - "vld1.32 {d0-d1}, [%3] \n"// r0 - "add %3, #8 \n" - "vext.8 d2, d0, d1, #1 \n" - "vext.8 d3, d0, d1, #2 \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0 - "vmlal.s8 q2, d2, d30 \n"// k1 - "vmlal.s8 q2, d3, d31 \n"// k2 - - "pld [%4, #128] \n" - "vld1.32 {d6-d7}, [%4] \n"// r1 - "add %4, #8 \n" - "vext.8 d8, d6, d7, #1 \n" - "vext.8 d9, d6, d7, #2 \n" - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3 - "vmlal.s8 q2, d8, d30 \n"// k4 - "vmlal.s8 q2, d9, d31 \n"// k5 - - "pld [%5, #128] \n" - "vld1.32 {d10-d11}, [%5] \n"// r2 - "add %5, #8 \n" - "vext.8 d12, d10, d11, #1 \n" - "vext.8 d13, d10, d11, #2 \n" - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6 - "vmlal.s8 q2, d12, d30 \n"// k7 - "vmlal.s8 q2, d13, d31 \n"// k8 - - "pld [%1, #128] \n" - "vld1.32 {d18-d21}, [%1] \n"// sum0 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%1]! \n" - - "vdup.s8 d1, d28[0] \n" - "vdup.s8 d7, d28[1] \n" - "vdup.s8 d11, d28[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0n - "vmlal.s8 q2, d2, d7 \n"// k1n - "vmlal.s8 q2, d3, d11 \n"// k2n - - "vdup.s8 d1, d28[3] \n" - "vdup.s8 d7, d28[4] \n" - "vdup.s8 d11, d28[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3n - "vmlal.s8 q2, d8, d7 \n"// k4n - "vmlal.s8 q2, d9, d11 \n"// k5n - - "vdup.s8 d1, d28[6] \n" - "vdup.s8 d7, d28[7] \n" - "vdup.s8 d11, d29[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6n - "vmlal.s8 q2, d12, d7 \n"// k7n - "vmlal.s8 q2, d13, d11 \n"// k8n - - "pld [%2, #128] \n" - "vld1.32 {d18-d21}, [%2] \n"// sum1 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%2]! \n" - - "subs %0, #1 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(outptr1), // %2 - "=r"(r0), // %3 - "=r"(r1), // %4 - "=r"(r2) // %5 - : "0"(nn), - "1"(outptr0), - "2"(outptr1), - "3"(r0), - "4"(r1), - "5"(r2) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); - } + sum0 += (int)r1[1] * ktmp[0]; + sum1 += (int)r1[1] * ktmp[1]; + sum2 += (int)r1[1] * ktmp[2]; + sum3 += (int)r1[1] * ktmp[3]; + sum4 += (int)r1[1] * ktmp[4]; + sum5 += (int)r1[1] * ktmp[5]; + sum6 += (int)r1[1] * ktmp[6]; + sum7 += (int)r1[1] * ktmp[7]; + ktmp += 8; - for (; remain>0; remain--) - { - int sum0 = 0; - int sum1 = 0; + sum0 += (int)r1[2] * ktmp[0]; + sum1 += (int)r1[2] * ktmp[1]; + sum2 += (int)r1[2] * ktmp[2]; + sum3 += (int)r1[2] * ktmp[3]; + sum4 += (int)r1[2] * ktmp[4]; + sum5 += (int)r1[2] * ktmp[5]; + sum6 += (int)r1[2] * ktmp[6]; + sum7 += (int)r1[2] * ktmp[7]; + ktmp += 8; + + sum0 += (int)r2[0] * ktmp[0]; + sum1 += (int)r2[0] * ktmp[1]; + sum2 += (int)r2[0] * ktmp[2]; + sum3 += (int)r2[0] * ktmp[3]; + sum4 += (int)r2[0] * ktmp[4]; + sum5 += (int)r2[0] * ktmp[5]; + sum6 += (int)r2[0] * ktmp[6]; + sum7 += (int)r2[0] * ktmp[7]; + ktmp += 8; - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - sum1 += (int)r0[0] * kernel1[0]; - sum1 += (int)r0[1] * kernel1[1]; - sum1 += (int)r0[2] * kernel1[2]; - sum1 += (int)r1[0] * kernel1[3]; - sum1 += (int)r1[1] * kernel1[4]; - sum1 += (int)r1[2] * kernel1[5]; - sum1 += (int)r2[0] * kernel1[6]; - sum1 += (int)r2[1] * kernel1[7]; - sum1 += (int)r2[2] * kernel1[8]; + sum0 += (int)r2[1] * ktmp[0]; + sum1 += (int)r2[1] * ktmp[1]; + sum2 += (int)r2[1] * ktmp[2]; + sum3 += (int)r2[1] * ktmp[3]; + sum4 += (int)r2[1] * ktmp[4]; + sum5 += (int)r2[1] * ktmp[5]; + sum6 += (int)r2[1] * ktmp[6]; + sum7 += (int)r2[1] * ktmp[7]; + ktmp += 8; + + sum0 += (int)r2[2] * ktmp[0]; + sum1 += (int)r2[2] * ktmp[1]; + sum2 += (int)r2[2] * ktmp[2]; + sum3 += (int)r2[2] * ktmp[3]; + sum4 += (int)r2[2] * ktmp[4]; + sum5 += (int)r2[2] * ktmp[5]; + sum6 += (int)r2[2] * ktmp[6]; + sum7 += (int)r2[2] * ktmp[7]; + ktmp += 8; *outptr0 += sum0; *outptr1 += sum1; + *outptr2 += sum2; + *outptr3 += sum3; + *outptr4 += sum4; + *outptr5 += sum5; + *outptr6 += sum6; + *outptr7 += sum7; + + ktmp -= 8*9; - r0++; - r1++; - r2++; outptr0++; outptr1++; + outptr2++; + outptr3++; + outptr4++; + outptr5++; + outptr6++; + outptr7++; +#endif // __ARM_NEON + r0 += 2; + r1 += 2; + r2 += 2; } - r0 += 2; - r1 += 2; - r2 += 2; + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; } - kernel0 += 9; - kernel1 += 9; + ktmp += 8*9; } } #pragma omp parallel for num_threads(opt.num_threads) for (int p=remain_outch_start; p> 3; int remain = outw & 7; +#else + int remain = outw; +#endif // __ARM_NEON - if (nn > 0) - { - asm volatile( - "vld1.8 {d26-d27}, [%0] \n" - : "=r"(kernel0) // %0 - : "0"(kernel0) - : "cc", "memory" - ); - - asm volatile( - "0: \n" - "pld [%3, #128] \n" - "vld1.32 {d0-d1}, [%3] \n"// r0 - "add %3, #8 \n" - "vext.8 d2, d0, d1, #1 \n" - "vext.8 d3, d0, d1, #2 \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0 - "vmlal.s8 q2, d2, d30 \n"// k1 - "vmlal.s8 q2, d3, d31 \n"// k2 - - "pld [%4, #128] \n" - "vld1.32 {d6-d7}, [%4] \n"// r1 - "add %4, #8 \n" - "vext.8 d8, d6, d7, #1 \n" - "vext.8 d9, d6, d7, #2 \n" - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3 - "vmlal.s8 q2, d8, d30 \n"// k4 - "vmlal.s8 q2, d9, d31 \n"// k5 - - "pld [%5, #128] \n" - "vld1.32 {d10-d11}, [%5] \n"// r2 - "add %5, #8 \n" - "vext.8 d12, d10, d11, #1 \n" - "vext.8 d13, d10, d11, #2 \n" - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6 - "vmlal.s8 q2, d12, d30 \n"// k7 - "vmlal.s8 q2, d13, d31 \n"// k8 - - "pld [%6, #128] \n" - "vld1.32 {d14-d15}, [%6] \n"// r3 - "add %6, #8 \n" - "vext.8 d16, d14, d15, #1 \n" - "vext.8 d17, d14, d15, #2 \n" - - "pld [%1, #128] \n" - "vld1.32 {d18-d21}, [%1] \n"// sum0 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%1]! \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d6, d1 \n"// k0 - "vmlal.s8 q2, d8, d30 \n"// k1 - "vmlal.s8 q2, d9, d31 \n"// k2 - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d10, d1 \n"// k3 - "vmlal.s8 q2, d12, d30 \n"// k4 - "vmlal.s8 q2, d13, d31 \n"// k5 - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d14, d1 \n"// k6 - "vmlal.s8 q2, d16, d30 \n"// k7 - "vmlal.s8 q2, d17, d31 \n"// k8 - - "pld [%2, #128] \n" - "vld1.32 {d18-d21}, [%2] \n"// sum0n - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%2]! \n" - - "subs %0, #1 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(outptr0n), // %2 - "=r"(r0), // %3 - "=r"(r1), // %4 - "=r"(r2), // %5 - "=r"(r3) // %6 - : "0"(nn), - "1"(outptr0), - "2"(outptr0n), - "3"(r0), - "4"(r1), - "5"(r2), - "6"(r3) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); - } - - for (; remain>0; remain--) - { - //Todo Neon - - int sum0 = 0; - int sum0n = 0; - - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - sum0n += (int)r1[0] * kernel0[0]; - sum0n += (int)r1[1] * kernel0[1]; - sum0n += (int)r1[2] * kernel0[2]; - sum0n += (int)r2[0] * kernel0[3]; - sum0n += (int)r2[1] * kernel0[4]; - sum0n += (int)r2[2] * kernel0[5]; - sum0n += (int)r3[0] * kernel0[6]; - sum0n += (int)r3[1] * kernel0[7]; - sum0n += (int)r3[2] * kernel0[8]; +#if __ARM_NEON + for (; nn>0; nn--) + { + // load output ch 0 + int32x4_t _sum0 = vld1q_s32(outptr);// out0 + int32x4_t _sum0n = vld1q_s32(outptr+4);// out0n + + int8x8x2_t _r0_s8 = vld2_s8(r0); + int8x8x2_t _r0n_s8 = vld2_s8(r0+16); + + int8x8x2_t _r1_s8 = vld2_s8(r1); + int8x8x2_t _r1n_s8 = vld2_s8(r1+16); + + int8x8x2_t _r2_s8 = vld2_s8(r2); + int8x8x2_t _r2n_s8 = vld2_s8(r2+16); + + int8x8_t _r02_s8 = vext_s8(_r0_s8.val[0], _r0n_s8.val[0], 1); + int8x8_t _r12_s8 = vext_s8(_r1_s8.val[0], _r1n_s8.val[0], 1); + int8x8_t _r22_s8 = vext_s8(_r2_s8.val[0], _r2n_s8.val[0], 1); + + int16x8_t _r00 = vmovl_s8(_r0_s8.val[0]); // r00 + int16x8_t _r01 = vmovl_s8(_r0_s8.val[1]); // r01 + int16x8_t _r02 = vmovl_s8(_r02_s8); // r02 + + int16x8_t _r10 = vmovl_s8(_r1_s8.val[0]); // r10 + int16x8_t _r11 = vmovl_s8(_r1_s8.val[1]); // r11 + int16x8_t _r12 = vmovl_s8(_r12_s8); // r12 + + int16x8_t _r20 = vmovl_s8(_r2_s8.val[0]); // r20 + int16x8_t _r21 = vmovl_s8(_r2_s8.val[1]); // r21 + int16x8_t _r22 = vmovl_s8(_r22_s8); // r22 + + int8x16_t _k_s8 = vld1q_s8(ktmp); + int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k_s8)); // k0...k8 + int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k_s8));// k9... + + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r00), _k_s16, 0); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r00), _k_s16, 0); + int32x4_t _sum01 = vmull_laneq_s16(vget_low_s16(_r01), _k_s16, 1); + int32x4_t _sum01n = vmull_laneq_s16(vget_high_s16(_r01), _k_s16, 1); + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r02), _k_s16, 2); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r02), _k_s16, 2); + + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r10), _k_s16, 3); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r10), _k_s16, 3); + _sum01 = vmlal_laneq_s16(_sum01, vget_low_s16(_r11), _k_s16, 4); + _sum01n = vmlal_laneq_s16(_sum01n, vget_high_s16(_r11), _k_s16, 4); + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r12), _k_s16, 5); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r12), _k_s16, 5); + + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r20), _k_s16, 6); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r20), _k_s16, 6); + _sum01 = vmlal_laneq_s16(_sum01, vget_low_s16(_r21), _k_s16, 7); + _sum01n = vmlal_laneq_s16(_sum01n, vget_high_s16(_r21), _k_s16, 7); + _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r22), _kn_s16, 0); + _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r22), _kn_s16, 0); + + _sum0 = vaddq_s32(_sum0, _sum01); + _sum0n = vaddq_s32(_sum0n, _sum01n); - *outptr0 += sum0; - *outptr0n += sum0n; + // save s32 to memory + vst1q_s32(outptr, _sum0); + vst1q_s32(outptr+4, _sum0n); + + r0 += 16; + r1 += 16; + r2 += 16; - r0++; - r1++; - r2++; - r3++; - outptr0++; - outptr0n++; + outptr += 8; } +#endif // __ARM_NEON - r0 += 2 + w; - r1 += 2 + w; - r2 += 2 + w; - r3 += 2 + w; - - outptr0 += outw; - outptr0n += outw; - } + if (remain > 0) + { +#if __ARM_NEON + int8x8_t _k01234567s8 = vld1_s8(ktmp); + int8x8_t _k8xxxxxxxs8 = vld1_s8(ktmp+8); + int8x8_t _k34567xxxs8 = vext_s8(_k01234567s8, _k01234567s8, 3); + int8x8_t _k678xxxxxs8 = vext_s8(_k01234567s8, _k8xxxxxxxs8, 6); + int16x8_t _k0123_s16 = vmovl_s8(_k01234567s8); + int16x8_t _k3456_s16 = vmovl_s8(_k34567xxxs8); + int16x8_t _k678x_s16 = vmovl_s8(_k678xxxxxs8); +#endif + for (; remain>0; remain--) + { +#if __ARM_NEON + int8x8_t _r00s8 = vld1_s8(r0); + int8x8_t _r10s8 = vld1_s8(r1); + int8x8_t _r20s8 = vld1_s8(r2); - for (; i < outh; i++) - { - int nn = outw >> 3; - int remain = outw & 7; + int16x8_t _r00s16 = vmovl_s8(_r00s8); + int16x8_t _r10s16 = vmovl_s8(_r10s8); + int16x8_t _r20s16 = vmovl_s8(_r20s8); - if (nn > 0) - { - asm volatile( - "vld1.8 {d26-d27}, [%0] \n" - : "=r"(kernel0) // %0 - : "0"(kernel0) - : "cc", "memory" - ); + int32x4_t _sum = vmull_s16(vget_low_s16(_r00s16), vget_low_s16(_k0123_s16)); + _sum = vmlal_s16(_sum, vget_low_s16(_r10s16), vget_low_s16(_k3456_s16)); + _sum = vmlal_s16(_sum, vget_low_s16(_r20s16), vget_low_s16(_k678x_s16)); - asm volatile( - "0: \n" - "pld [%2, #128] \n" - "vld1.32 {d0-d1}, [%2] \n"// r0 - "add %2, #8 \n" - "vext.8 d2, d0, d1, #1 \n" - "vext.8 d3, d0, d1, #2 \n" - - "vdup.s8 d1, d26[0] \n" - "vdup.s8 d30, d26[1] \n" - "vdup.s8 d31, d26[2] \n" - "vmull.s8 q2, d0, d1 \n"// k0 - "vmlal.s8 q2, d2, d30 \n"// k1 - "vmlal.s8 q2, d3, d31 \n"// k2 - - "pld [%3, #128] \n" - "vld1.32 {d6-d7}, [%3] \n"// r1 - "add %3, #8 \n" - "vext.8 d8, d6, d7, #1 \n" - "vext.8 d9, d6, d7, #2 \n" - - "vdup.s8 d1, d26[3] \n" - "vdup.s8 d30, d26[4] \n" - "vdup.s8 d31, d26[5] \n" - "vmlal.s8 q2, d6, d1 \n"// k3 - "vmlal.s8 q2, d8, d30 \n"// k4 - "vmlal.s8 q2, d9, d31 \n"// k5 - - "pld [%4, #128] \n" - "vld1.32 {d10-d11}, [%4] \n"// r2 - "add %4, #8 \n" - "vext.8 d12, d10, d11, #1 \n" - "vext.8 d13, d10, d11, #2 \n" - - "vdup.s8 d1, d26[6] \n" - "vdup.s8 d30, d26[7] \n" - "vdup.s8 d31, d27[0] \n" - "vmlal.s8 q2, d10, d1 \n"// k6 - "vmlal.s8 q2, d12, d30 \n"// k7 - "vmlal.s8 q2, d13, d31 \n"// k8 - - "pld [%1, #128] \n" - "vld1.32 {d18-d21}, [%1] \n"// sum0 - "vaddw.s16 q9, q9, d4 \n" - "vaddw.s16 q10, q10, d5 \n" - "vst1.32 {d18-d21}, [%1]! \n" - - "subs %0, #1 \n" - "bne 0b \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(r0), // %2 - "=r"(r1), // %3 - "=r"(r2) // %4 - : "0"(nn), - "1"(outptr0), - "2"(r0), - "3"(r1), - "4"(r2) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" - ); - } + _sum = vsetq_lane_s32(*outptr, _sum, 3); - for (; remain>0; remain--) - { - int sum0 = 0; + *outptr = vaddvq_s32(_sum); +#else + int sum = 0; - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; + sum += (int)r0[0] * ktmp[0]; + sum += (int)r0[1] * ktmp[1]; + sum += (int)r0[2] * ktmp[2]; + sum += (int)r1[0] * ktmp[3]; + sum += (int)r1[1] * ktmp[4]; + sum += (int)r1[2] * ktmp[5]; + sum += (int)r2[0] * ktmp[6]; + sum += (int)r2[1] * ktmp[7]; + sum += (int)r2[2] * ktmp[8]; - *outptr0 += sum0; + *outptr += sum; +#endif // __ARM_NEON + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + } - r0++; - r1++; - r2++; - outptr0++; - } + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } - r0 += 2; - r1 += 2; - r2 += 2; - } - kernel0 += 9; - } + ktmp += 9; + } } } - -static void conv3x3s2_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt) +#else // __aarch64__ +static void conv3x3s1_winograd23_int8_neon(const Mat& bottom_blob, Mat& top_blob, const std::vector &kernel_tm_test, const Option& opt) { int w = bottom_blob.w; + int h = bottom_blob.h; int inch = bottom_blob.c; int outw = top_blob.w; int outh = top_blob.h; int outch = top_blob.c; - const int tailstep = w - 2 * outw + w; + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; - const signed char* kernel = _kernel; - - int nn_outch = outch >> 1; - int remain_outch_start = nn_outch << 1; + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; - #pragma omp parallel for num_threads(opt.num_threads) - for (int pp=0; pp < nn_outch; pp++) + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); + + // BEGIN transform input + Mat bottom_blob_tm; { - int p = pp * 2; + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; - Mat out0 = top_blob.channel(p); - Mat out1 = top_blob.channel(p + 1); + int nColBlocks = h_tm/4; // may be the block num in FeatherCNN + int nRowBlocks = w_tm/4; - out0.fill(0.f); - out1.fill(0.f); + const int tiles = nColBlocks * nRowBlocks; - const signed char* kernel0 = (const signed char*)kernel + p * inch * 9; - const signed char* kernel1 = (const signed char*)kernel + (p + 1) * inch * 9; + bottom_blob_tm.create(4, inch, tiles*4, 2u, opt.workspace_allocator); + // BT + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + #pragma omp parallel for num_threads(opt.num_threads) for (int q=0; q(q); + short* out_tm1 = bottom_blob_tm.channel(tiles*1+j*nRowBlocks+i).row(q); + short* out_tm2 = bottom_blob_tm.channel(tiles*2+j*nRowBlocks+i).row(q); + short* out_tm3 = bottom_blob_tm.channel(tiles*3+j*nRowBlocks+i).row(q); +#if __ARM_NEON + asm volatile( + // load + "pld [%0, #64] \n" + "vld1.s8 {d0}, [%0] \n" + "pld [%1, #64] \n" + "vld1.s8 {d1}, [%1] \n" + "pld [%2, #64] \n" + "vld1.s8 {d2}, [%2] \n" + "pld [%3, #64] \n" + "vld1.s8 {d3}, [%3] \n" + // w = B_t * d, trans int8 to int16 + "vsubl.s8 q2, d0, d2 \n" // d4 + "vaddl.s8 q3, d1, d2 \n" // d6 + "vsubl.s8 q4, d2, d1 \n" // d8 + "vsubl.s8 q5, d3, d1 \n" // d10 + // transpose w to w_t + "vtrn.s16 d4, d6 \n" + "vtrn.s16 d8, d10 \n" + "vtrn.s32 d4, d8 \n" + "vtrn.s32 d6, d10 \n" + // U = B_t * d_t + "vsub.s16 d11, d4, d8 \n" + "vadd.s16 d12, d6, d8 \n" + "vsub.s16 d13, d8, d6 \n" + "vsub.s16 d14, d10, d6 \n" + // save + "vst1.s32 {d11}, [%4] \n" + "vst1.s32 {d12}, [%5] \n" + "vst1.s32 {d13}, [%6] \n" + "vst1.s32 {d14}, [%7] \n" + : "=r"(r0), // %0 + "=r"(r1), // %1 + "=r"(r2), // %2 + "=r"(r3), // %3 + "=r"(out_tm0), // %4 + "=r"(out_tm1), // %5 + "=r"(out_tm2), // %6 + "=r"(out_tm3) // %7 + : "0"(r0), + "1"(r1), + "2"(r2), + "3"(r3), + "4"(out_tm0), + "5"(out_tm1), + "6"(out_tm2), + "7"(out_tm3) + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +#else + short d0[4],d1[4],d2[4],d3[4]; + short w0[4],w1[4],w2[4],w3[4]; + short t0[4],t1[4],t2[4],t3[4]; + // load + for (int n = 0; n < 4; n++) + { + d0[n] = r0[n]; + d1[n] = r1[n]; + d2[n] = r2[n]; + d3[n] = r3[n]; + } + // w = B_t * d + for (int n = 0; n < 4; n++) + { + w0[n] = d0[n] - d2[n]; + w1[n] = d1[n] + d2[n]; + w2[n] = d2[n] - d1[n]; + w3[n] = d3[n] - d1[n]; + } + // transpose d to d_t + { + t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3]; + t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3]; + t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3]; + t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3]; + } + // U = B_t * d_t + for (int n = 0; n < 4; n++) + { + d0[n] = t0[n] - t2[n]; + d1[n] = t1[n] + t2[n]; + d2[n] = t2[n] - t1[n]; + d3[n] = t3[n] - t1[n]; + } + // save to out_tm + for (int n = 0; n < 4; n++) + { + out_tm0[n] = d0[n]; + out_tm1[n] = d1[n]; + out_tm2[n] = d2[n]; + out_tm3[n] = d3[n]; + } +#endif + r0 += 2; + r1 += 2; + r2 += 2; + r3 += 2; + } + } + } + } + bottom_blob_bordered = Mat(); - int i = 0; + // BEGIN dot + Mat top_blob_tm; + { + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; - for (; i < outh; i++) - { - int nn = outw >> 3; - int remain = outw & 7; + int nColBlocks = h_tm/4; // may be the block num in FeatherCNN + int nRowBlocks = w_tm/4; - asm volatile( - "vld1.s8 {d22-d23}, [%0] \n" - "vld1.s8 {d24-d25}, [%1] \n" - : "=r"(kernel0), // %0 - "=r"(kernel1) // %1 - : "0"(kernel0), - "1"(kernel1) - : "cc", "memory" - ); + const int tiles = nColBlocks * nRowBlocks; - if (nn > 0) + top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator); + + #pragma omp parallel for num_threads(opt.num_threads) + for (int r=0; r<4; r++) + { + int nn_outch = 0; + int remain_outch_start = 0; + + nn_outch = outch >> 3; + remain_outch_start = nn_outch << 3; + + for (int pp=0; pp= 4) + for (int n=0; n<4; n++) + { + output0_tm[n] = sum0[n]; + output1_tm[n] = sum1[n]; + output2_tm[n] = sum2[n]; + output3_tm[n] = sum3[n]; + output4_tm[n] = sum4[n]; + output5_tm[n] = sum5[n]; + output6_tm[n] = sum6[n]; + output7_tm[n] = sum7[n]; + } +#endif // __ARM_NEON + output0_tm += 16; + output1_tm += 16; + output2_tm += 16; + output3_tm += 16; + output4_tm += 16; + output5_tm += 16; + output6_tm += 16; + output7_tm += 16; + } + } + + nn_outch = (outch - remain_outch_start) >> 2; + + //#pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp0; remain--) + remain_outch_start += nn_outch << 2; + //#pragma omp parallel for num_threads(opt.num_threads) + for (int p=remain_outch_start; p> 3; - int remain = outw & 7; - - asm volatile( - "vld1.s8 {d22-d23}, [%0] \n" - : "=r"(kernel0) // %0 - : "0"(kernel0) - : "cc", "memory" - ); + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; p 0) + for (int j=0; j> 2 + "vshl.s32 d9, d9, %P6 \n" // o1 = o1 >> 2 + + "vst1.s32 {d8}, [%1]! \n" + "vst1.s32 {d9}, [%2]! \n" + : "=r"(out_tile), // %0 + "=r"(outRow0), // %1 + "=r"(outRow1) // %2 + : "0"(out_tile), + "1"(outRow0), + "2"(outRow1), + "w"(_shift) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4" ); - } - - if (remain >= 4) - { - remain -= 4; - asm volatile( - "pld [%2, #192] \n" - "vld2.s8 {d0-d1}, [%2]! \n" // r0 - "vld2.s8 {d2-d3}, [%2] \n" - "vext.8 d3, d0, d2, #1 \n" - - "vdup.s8 d26, d22[0] \n" - "vdup.s8 d27, d22[1] \n" - "vdup.s8 d28, d22[2] \n" - "vmull.s8 q2, d0, d26 \n" // k00 - "vmlal.s8 q2, d1, d27 \n" // k01 - "vmlal.s8 q2, d3, d28 \n" // k02 - - "pld [%3, #192] \n" - "vld2.s8 {d6-d7}, [%3]! \n" // r1 - "vld2.s8 {d8-d9}, [%3] \n" - "vext.8 d9, d6, d8, #1 \n" - - "vdup.s8 d26, d22[3] \n" - "vdup.s8 d27, d22[4] \n" - "vdup.s8 d28, d22[5] \n" - "vmlal.s8 q2, d6, d26 \n" // k03 - "vmlal.s8 q2, d7, d27 \n" // k04 - "vmlal.s8 q2, d9, d28 \n" // k05 - - "pld [%4, #192] \n" - "vld2.s8 {d10-d11}, [%4]! \n" // r2 - "vld2.s8 {d12-d13}, [%4] \n" - "vext.8 d13, d10, d12, #1 \n" - - "sub %2, #8 \n" - "sub %3, #8 \n" - "sub %4, #8 \n" - - "vdup.s8 d26, d22[6] \n" - "vdup.s8 d27, d22[7] \n" - "vdup.s8 d28, d23[0] \n" - "vmlal.s8 q2, d10, d26 \n" // k06 - "vmlal.s8 q2, d11, d27 \n" // k07 - "vmlal.s8 q2, d13, d28 \n" // k08 - - "pld [%1, #128] \n" - "vld1.32 {d14-d15}, [%1] \n" //sum0 - "vaddw.s16 q7, q7, d4 \n" - "vst1.32 {d14-d15}, [%1]! \n" - : "=r"(nn), // %0 - "=r"(outptr0), // %1 - "=r"(r0), // %2 - "=r"(r1), // %3 - "=r"(r2) // %4 - : "0"(nn), - "1"(outptr0), - "2"(r0), - "3"(r1), - "4"(r2) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q12", "q13", "q14" - ); - } +#else + int s0[4],s1[4],s2[4],s3[4]; + int w0[4],w1[4]; + int d0[2],d1[2],d2[2],d3[2]; + int o0[2],o1[2]; + // load + for (int n = 0; n < 4; n++) + { + s0[n] = out_tile[n]; + s1[n] = out_tile[n+ 4]; + s2[n] = out_tile[n+ 8]; + s3[n] = out_tile[n+12]; + } + // w = A_T * W + for (int n = 0; n < 4; n++) + { + w0[n] = s0[n] + s1[n] + s2[n]; + w1[n] = s1[n] - s2[n] + s3[n]; + } + // transpose w to w_t + { + d0[0] = w0[0]; d0[1] = w1[0]; + d1[0] = w0[1]; d1[1] = w1[1]; + d2[0] = w0[2]; d2[1] = w1[2]; + d3[0] = w0[3]; d3[1] = w1[3]; + } + // Y = A_T * w_t + for (int n = 0; n < 2; n++) + { + o0[n] = d0[n] + d1[n] + d2[n]; + o1[n] = d1[n] - d2[n] + d3[n]; + } + // save to top blob tm,why right 2,because the G' = G*2 + outRow0[0] = o0[0] >> 2; + outRow0[1] = o0[1] >> 2; + outRow1[0] = o1[0] >> 2; + outRow1[1] = o1[1] >> 2; - for (; remain>0; remain--) - { - int sum0 = 0; - - sum0 += (int)r0[0] * kernel0[0]; - sum0 += (int)r0[1] * kernel0[1]; - sum0 += (int)r0[2] * kernel0[2]; - sum0 += (int)r1[0] * kernel0[3]; - sum0 += (int)r1[1] * kernel0[4]; - sum0 += (int)r1[2] * kernel0[5]; - sum0 += (int)r2[0] * kernel0[6]; - sum0 += (int)r2[1] * kernel0[7]; - sum0 += (int)r2[2] * kernel0[8]; - - *outptr0 += sum0; + out_tile += 16; - r0 += 2; - r1 += 2; - r2 += 2; - outptr0++; + outRow0 += 2; + outRow1 += 2; +#endif // __ARM_NEON } - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; + outRow0 += outw; + outRow1 += outw; } - - kernel0 += 9; - } - } + } + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); } static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) @@ -3366,7 +3776,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co "sub %3, #2 \n" "vld1.s8 {d0[6]}, [%4]! \n" - "vld1.s8 {d0[7]}, [%4]! \n"// d0(r00 r01 r02 r10 r11 r12 r22 r21) + "vld1.s8 {d0[7]}, [%4]! \n"// d0(r00 r01 r02 r10 r11 r12 r20 r21) "vld1.s8 {d4[]}, [%4] \n"// d4(r22 r22 r22 r22 r22 r22 r22 r22) "sub %4, #2 \n" @@ -3381,7 +3791,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co "vld1.s8 {d5[]}, [%5] \n"// d5(r32 r32 r32 r32 r32 r32 r32 r32) "sub %5, #2 \n" - "veor d3, d3 \n"// d3(00 00 00 00 00 00 00 00) + "veor d3, d1, d1 \n"// d3(00 00 00 00 00 00 00 00) "vmull.s8 q8, d0, d2 \n"// sum0 = (r00 - r21) * (k00 - k21) "vmull.s8 q9, d1, d2 \n"// sum1 = (r10 - r31) * (k00 - k21) @@ -3404,7 +3814,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co "vpadd.s32 d20, d20, d21 \n" "vpadd.s32 d22, d22, d23 \n" "vpadd.s32 d20, d20, d22 \n" - "vpadd.s32 d6, d6, d20 \n" + "vadd.s32 d6, d6, d20 \n" "vst1.s32 {d6[0]}, [%0]! \n" "vst1.s32 {d6[1]}, [%1]! \n" @@ -3437,7 +3847,6 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co sum0 += r1[2] * ktmp[5]; sum0 += r2[0] * ktmp[6]; sum0 += r2[1] * ktmp[7]; - sum0 += r2[2] * ktmp[8]; sum0n += r1[0] * ktmp[0]; @@ -3448,7 +3857,6 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co sum0n += r2[2] * ktmp[5]; sum0n += r3[0] * ktmp[6]; sum0n += r3[1] * ktmp[7]; - sum0n += r3[2] * ktmp[8]; *outptr0 += sum0; @@ -3705,7 +4113,7 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co "vmovl.s8 q6, d12 \n"// q6(a02 a04 a06 a08 a010 a012 a014 a016) d13 "pld [%8, #128] \n" - "vld1.s32 {d30-d31}, [%8] \n"// out7 + "vld1.s32 {d30-d31}, [%8] \n"// out7 "vmlal.s16 q8, d8, d0[0] \n"// sum0 += (a00 a02 a04 a06) * k00 "vmlal.s16 q9, d8, d0[1] \n"// sum1 += (a00 a02 a04 a06) * k10 @@ -3723,7 +4131,7 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co "vmlal.s16 q12, d10, d3[0] \n"// sum4 += (a01-a07) * k41 "vmlal.s16 q13, d10, d3[1] \n"// sum5 += (a01-a07) * k51 "vmlal.s16 q14, d10, d3[2] \n"// sum6 += (a01-a07) * k61 - "vmlal.s16 q15, d10, d3[3] \n"// sum7 += (a01-a07) * k71 + "vmlal.s16 q15, d10, d3[3] \n"// sum7 += (a01-a07) * k71 "pld [%10, #64] \n" "vld2.s8 {d8-d9}, [%10] \n"// d8(a10 a12 a14 a16 a18 a110 a112 a114), d9(a11 a13 a15 a17 a19 a111 a113 a115) @@ -4293,3 +4701,25 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co } } #endif + +static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 3; + int kernel_h = 3; + + int stride_w = 1; + int stride_h = 1; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} + +static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 3; + int kernel_h = 3; + + int stride_w = 2; + int stride_h = 2; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} diff --git a/src/layer/arm/convolution_5x5_int8.h b/src/layer/arm/convolution_5x5_int8.h new file mode 100644 index 000000000..99abb8705 --- /dev/null +++ b/src/layer/arm/convolution_5x5_int8.h @@ -0,0 +1,35 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv5x5s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 5; + int kernel_h = 5; + + int stride_w = 1; + int stride_h = 1; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} + +static void conv5x5s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 5; + int kernel_h = 5; + + int stride_w = 2; + int stride_h = 2; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} diff --git a/src/layer/arm/convolution_7x7_int8.h b/src/layer/arm/convolution_7x7_int8.h new file mode 100644 index 000000000..d34f7a323 --- /dev/null +++ b/src/layer/arm/convolution_7x7_int8.h @@ -0,0 +1,35 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 7; + int kernel_h = 7; + + int stride_w = 1; + int stride_h = 1; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} + +static void conv7x7s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 7; + int kernel_h = 7; + + int stride_w = 2; + int stride_h = 2; + + conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp index 928e302ad..14be7ae45 100644 --- a/src/layer/arm/convolution_arm.cpp +++ b/src/layer/arm/convolution_arm.cpp @@ -14,6 +14,8 @@ #include "convolution_arm.h" +#include "benchmark.h" + namespace ncnn { #include "convolution_1x1.h" @@ -24,8 +26,11 @@ namespace ncnn { #include "convolution_7x7.h" #if __ARM_NEON +#include "convolution_sgemm_int8.h" #include "convolution_1x1_int8.h" #include "convolution_3x3_int8.h" +#include "convolution_5x5_int8.h" +#include "convolution_7x7_int8.h" #endif // __ARM_NEON DEFINE_LAYER_CREATOR(Convolution_arm) @@ -66,9 +71,12 @@ int Convolution_arm::load_model(const ModelBin& mb) if (use_int8_inference) { -#if __ARM_NEON -#if !__aarch64__ - if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (use_winograd3x3) + { + int num_input = weight_data_size / 9 / num_output; + conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output); + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { int num_input = weight_data_size / 9 / num_output; conv3x3s1_transform_kernel_int8_neon(weight_data, weight_3x3s1_int8_data, num_input, num_output); @@ -78,16 +86,15 @@ int Convolution_arm::load_model(const ModelBin& mb) { int num_input = weight_data_size / 9 / num_output; conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output); - } + } if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) { int num_input = weight_data_size / num_output; conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output); use_sgemm1x1 = true; - } -#endif // !__aarch64__ -#endif // __ARM_NEON + } + return 0; } @@ -233,7 +240,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option } const int kernel_size = kernel_w; - const int stride = stride_w; + //const int stride = stride_w; + int stride = stride_w; if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h) { @@ -293,43 +301,50 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option #if __ARM_NEON // kernel_size x stride - conv_int8_func conv_int8_func_table[5][5] = + conv_int8_func conv_int8_func_table[7][4] = { { conv1x1s1_int8_neon, conv1x1s2_int8_neon, 0, - 0, 0 }, // kernel_size = 1 { 0, 0, 0, - 0, 0 }, // kernel_size = 2 { conv3x3s1_int8_neon, conv3x3s2_int8_neon, 0, - 0, 0 }, // kernel_size = 3 { 0, 0, 0, - 0, 0 }, // kernel_size = 4 { + conv5x5s1_int8_neon, + conv5x5s2_int8_neon, 0, + 0 + }, // kernel_size = 5 + { 0, 0, 0, 0 - } // kernel_size = 5 + }, // kernel_size = 6 + { + conv7x7s1_int8_neon, + conv7x7s2_int8_neon, + 0, + 0 + } // kernel_size = 7 }; #endif // __ARM_NEON @@ -384,9 +399,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option opt_g.blob_allocator = bottom_blob_int8.allocator; quantize->forward(bottom_blob, bottom_blob_int8, opt_g); - } + } - bottom_blob_unbordered = bottom_blob_int8; + bottom_blob_unbordered = bottom_blob_int8; } Mat bottom_blob_bordered = bottom_blob_unbordered; @@ -423,34 +438,90 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option if (use_int8_inference) { -#if __ARM_NEON -#if !__aarch64__ - if (use_sgemm1x1) - { - conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt); - } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + if (use_int8_requantize == true) { - conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt); + Mat top_blob_tm; + top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_tm.empty()) + return -100; + + top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_sgemm1x1) + { + conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_1x1s1_sgemm_int8_data, opt); + } + else if (use_winograd3x3) + { + conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt); + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s1_int8_data, opt); + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt); + } + else + { + conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt); + } + + // requantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward(top_blob_tm_g, top_blob_g, opt_g); + } } - else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) - { - conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt); - } else -#endif // !__aarch64__ -#endif // __ARM_NEON { - conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); - } + top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; - // dequantize, reverse scale inplace - { - ncnn::Option opt_g = opt; - opt_g.blob_allocator = top_blob.allocator; + if (use_sgemm1x1) + { + conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt); + } + else if (use_winograd3x3) + { + conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt); + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1) + { + conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt); + } + else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2) + { + conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt); + } + else + { + conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); + } - dequantize->forward_inplace(top_blob, opt_g); - } + // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward_inplace(top_blob_g, opt_g); + } + } return 0; } diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h index 2a3c55433..7de63bb2b 100644 --- a/src/layer/arm/convolution_arm.h +++ b/src/layer/arm/convolution_arm.h @@ -40,6 +40,8 @@ public: Mat weight_3x3s1_int8_data; Mat weight_3x3s2_int8_data; Mat weight_1x1s1_sgemm_int8_data; + Mat weight_3x3_winograd23_data; + std::vector weight_3x3_winograd23_int8_data; }; } // namespace ncnn diff --git a/src/layer/arm/convolution_sgemm_int8.h b/src/layer/arm/convolution_sgemm_int8.h new file mode 100644 index 000000000..4ef2903f1 --- /dev/null +++ b/src/layer/arm/convolution_sgemm_int8.h @@ -0,0 +1,1598 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv_im2col_sgemm_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \ + const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + // im2col + Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator); + { + const int stride = kernel_h*kernel_w*outw*outh; + signed char* ret = (signed char*)bottom_im2col; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; p> 3; + int remain_size_start = nn_size << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii=0; ii> 3; + remain_outch_start = nn_outch << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2; + remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 3; + remain_outch_start = nn_outch << 3; +#endif + +#if __aarch64__ + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 3 + "cmp w4, #0 \n" + "beq 1f \n" + + "0: \n"// for (; k+7> 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 3 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%4, #128] \n" + "vld1.s8 {d8-d11}, [%4]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q7, d11 \n"// a30-a37 + "vmovl.s8 q6, d10 \n"// a20-a27 + "vmovl.s8 q5, d9 \n"// a10-a17 + "vmovl.s8 q4, d8 \n"// a00-a07 + + "pld [%5, #128] \n" + "vld1.s8 {d0-d3}, [%5]! \n"// kptr k00-k30,k01-k31, k02-k32,k03-k33, k04-k34,k05-k35, k06-k36,k07-k37 k(outch)(inch) + "vmovl.s8 q3, d3 \n"// k06-k36,k07-k37 + "vmovl.s8 q2, d2 \n"// k04-k34,k05-k35 + "vmovl.s8 q1, d1 \n"// k02-k32,k03-k33 + "vmovl.s8 q0, d0 \n"// k00-k30,k01-k31 + + "vmlal.s16 q8, d8, d0[0] \n"// sum0 = (a00-a07) * k00 + "vmlal.s16 q9, d9, d0[0] \n" + "vmlal.s16 q10, d8, d0[1] \n"// sum1 = (a00-a07) * k10 + "vmlal.s16 q11, d9, d0[1] \n" + "vmlal.s16 q12, d8, d0[2] \n"// sum2 = (a00-a07) * k20 + "vmlal.s16 q13, d9, d0[2] \n" + "vmlal.s16 q14, d8, d0[3] \n"// sum3 = (a00-a07) * k30 + "vmlal.s16 q15, d9, d0[3] \n" + + "vmlal.s16 q8, d10, d1[0] \n"// sum0 += (a10-a17) * k01 + "vmlal.s16 q9, d11, d1[0] \n" + "vmlal.s16 q10, d10, d1[1] \n"// sum1 += (a10-a17) * k11 + "vmlal.s16 q11, d11, d1[1] \n" + "vmlal.s16 q12, d10, d1[2] \n"// sum2 += (a10-a17) * k21 + "vmlal.s16 q13, d11, d1[2] \n" + "vmlal.s16 q14, d10, d1[3] \n"// sum3 += (a10-a17) * k31 + "vmlal.s16 q15, d11, d1[3] \n" + + "pld [%4, #128] \n" + "vld1.s8 {d8-d9}, [%4]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q5, d9 \n"// a10-a17 + "vmovl.s8 q4, d8 \n"// a00-a07 + + "vmlal.s16 q8, d12, d2[0] \n"// sum0 += (a20-a27) * k02 + "vmlal.s16 q9, d13, d2[0] \n" + "vmlal.s16 q10, d12, d2[1] \n"// sum1 += (a20-a27) * k12 + "vmlal.s16 q11, d13, d2[1] \n" + "vmlal.s16 q12, d12, d2[2] \n"// sum2 += (a20-a27) * k22 + "vmlal.s16 q13, d13, d2[2] \n" + "vmlal.s16 q14, d12, d2[3] \n"// sum3 += (a20-a27) * k32 + "vmlal.s16 q15, d13, d2[3] \n" + + "vmlal.s16 q8, d14, d3[0] \n"// sum0 += (a30-a37) * k03 + "vmlal.s16 q9, d15, d3[0] \n" + "vmlal.s16 q10, d14, d3[1] \n"// sum1 += (a30-a37) * k13 + "vmlal.s16 q11, d15, d3[1] \n" + "vmlal.s16 q12, d14, d3[2] \n"// sum2 += (a30-a37) * k23 + "vmlal.s16 q13, d15, d3[2] \n" + "vmlal.s16 q14, d14, d3[3] \n"// sum3 += (a30-a37) * k33 + "vmlal.s16 q15, d15, d3[3] \n" + + "pld [%4, #128] \n" + "vld1.s8 {d0-d1}, [%4]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q1, d1 \n"// a10-a17 + "vmovl.s8 q0, d0 \n"// a00-a07 + + "vmlal.s16 q8, d8, d4[0] \n"// sum0 += (a40-a47) * k04 + "vmlal.s16 q9, d9, d4[0] \n" + "vmlal.s16 q10, d8, d4[1] \n"// sum1 += (a40-a47) * k14 + "vmlal.s16 q11, d9, d4[1] \n" + "vmlal.s16 q12, d8, d4[2] \n"// sum2 += (a40-a47) * k24 + "vmlal.s16 q13, d9, d4[2] \n" + "vmlal.s16 q14, d8, d4[3] \n"// sum3 += (a40-a47) * k34 + "vmlal.s16 q15, d9, d4[3] \n" + + "vmlal.s16 q8, d10, d5[0] \n"// sum0 += (a50-a57) * k05 + "vmlal.s16 q9, d11, d5[0] \n" + "vmlal.s16 q10, d10, d5[1] \n"// sum1 += (a50-a57) * k15 + "vmlal.s16 q11, d11, d5[1] \n" + "vmlal.s16 q12, d10, d5[2] \n"// sum2 += (a50-a57) * k25 + "vmlal.s16 q13, d11, d5[2] \n" + "vmlal.s16 q14, d10, d5[3] \n"// sum3 += (a50-a57) * k35 + "vmlal.s16 q15, d11, d5[3] \n" + + "vmlal.s16 q8, d0, d6[0] \n"// sum0 += (a60-a67) * k06 + "vmlal.s16 q9, d1, d6[0] \n" + "vmlal.s16 q10, d0, d6[1] \n"// sum1 += (a60-a67) * k16 + "vmlal.s16 q11, d1, d6[1] \n" + "vmlal.s16 q12, d0, d6[2] \n"// sum2 += (a60-a67) * k26 + "vmlal.s16 q13, d1, d6[2] \n" + "vmlal.s16 q14, d0, d6[3] \n"// sum3 += (a60-a67) * k36 + "vmlal.s16 q15, d1, d6[3] \n" + + "vmlal.s16 q8, d2, d7[0] \n"// sum0 += (a70-a77) * k07 + "vmlal.s16 q9, d3, d7[0] \n" + "vmlal.s16 q10, d2, d7[1] \n"// sum1 += (a70-a77) * k17 + "vmlal.s16 q11, d3, d7[1] \n" + "vmlal.s16 q12, d2, d7[2] \n"// sum2 += (a70-a77) * k27 + "vmlal.s16 q13, d3, d7[2] \n" + "vmlal.s16 q14, d2, d7[3] \n"// sum3 += (a70-a77) * k37 + "vmlal.s16 q15, d3, d7[3] \n" + + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %12, #7 \n"// r4 = remain = inch & 7 + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%4]! \n"// tmpr a00-a70 a(inch)(data) + "vld1.s8 {d0}, [%5] \n"// kptr k00-k30 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %5, #4 \n" + + "vmlal.s16 q8, d2, d0[0] \n"// sum0 += (a00-a70) * k00 + "vmlal.s16 q9, d3, d0[0] \n" + "vmlal.s16 q10, d2, d0[1] \n"// sum1 += (a00-a70) * k10 + "vmlal.s16 q11, d3, d0[1] \n" + "vmlal.s16 q12, d2, d0[2] \n"// sum2 += (a00-a70) * k20 + "vmlal.s16 q13, d3, d0[2] \n" + "vmlal.s16 q14, d2, d0[3] \n"// sum3 += (a00-a70) * k30 + "vmlal.s16 q15, d3, d0[3] \n" + + "subs r4, r4, #1 \n" + "bne 2b \n" + + "3: \n"// store the result to memory + "vst1.s32 {d16-d19}, [%0] \n" + "vst1.s32 {d20-d23}, [%1] \n" + "vst1.s32 {d24-d27}, [%2] \n" + "vst1.s32 {d28-d31}, [%3] \n" + + : "=r"(output0), // %0 + "=r"(output1), // %1 + "=r"(output2), // %2 + "=r"(output3), // %3 + "=r"(vb), // %4 + "=r"(va) // %5 + : "0"(output0), + "1"(output1), + "2"(output2), + "3"(output3), + "4"(vb), + "5"(va), + "r"(L) // %12 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +#endif // __aarch64__ +#else + int sum0[8] = {0}; + int sum1[8] = {0}; + int sum2[8] = {0}; + int sum3[8] = {0}; + + int k=0; + for (; k+7> 2 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%4, #128] \n" + "vld1.s8 {d0}, [%4]! \n"// tmpr a00,a10,a20,a30 a(inch)(data) + "vmovl.s8 q0, d0 \n"// a00-a07 + + "pld [%5, #128] \n" + "vld1.s8 {d2-d5}, [%5]! \n"// kptr k00-k30,k01-k31, k02-k32,k03-k33, k04-k34,k05-k35, k06-k36,k07-k37 k(outch)(inch) + "vmovl.s8 q4, d5 \n"// k06-k36,k07-k37 + "vmovl.s8 q3, d4 \n"// k04-k34,k05-k35 + "vmovl.s8 q2, d3 \n"// k02-k32,k03-k33 + "vmovl.s8 q1, d2 \n"// k00-k30,k01-k31 + + "vmlal.s16 q6, d2, d0[0] \n"// (k00-k30) * a00 + "vmlal.s16 q7, d3, d0[1] \n"// (k01-k31) * a01 + "vmlal.s16 q8, d4, d0[2] \n"// (k02-k32) * a02 + "vmlal.s16 q9, d5, d0[3] \n"// (k03-k33) * a03 + "vmlal.s16 q10, d6, d1[0] \n"// (k04-k34) * a04 + "vmlal.s16 q11, d7, d1[1] \n"// (k05-k35) * a05 + "vmlal.s16 q12, d8, d1[2] \n"// (k06-k36) * a06 + "vmlal.s16 q13, d9, d1[3] \n"// (k07-k37) * a07 + + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "vadd.s32 q6, q6, q7 \n" + "vadd.s32 q9, q9, q8 \n" + "vadd.s32 q11, q11, q10 \n" + "vadd.s32 q13, q13, q12 \n" + + "vadd.s32 q9, q9, q6 \n" + "vadd.s32 q13, q13, q11 \n" + "vadd.s32 q14, q13, q9 \n" + + "1: \n" + // remain loop + "and r4, %12, #7 \n"// r4 = remain = inch & 3 + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%4] \n"// tmpr a00 a(inch)(data) + "vld1.s8 {d0}, [%5] \n"// kptr k00-k30 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %4, #1 \n" + "add %5, #4 \n" + + "vmlal.s16 q14, d0, d2[0] \n" + + "subs r4, r4, #1 \n" + "bne 2b \n" + + "3: \n"// store the result to memory + "vst1.s32 {d28[0]}, [%0] \n" + "vst1.s32 {d28[1]}, [%1] \n" + "vst1.s32 {d29[0]}, [%2] \n" + "vst1.s32 {d29[1]}, [%3] \n" + + : "=r"(output0), // %0 + "=r"(output1), // %1 + "=r"(output2), // %2 + "=r"(output3), // %3 + "=r"(vb), // %4 + "=r"(va) // %5 + : "0"(output0), + "1"(output1), + "2"(output2), + "3"(output3), + "4"(vb), + "5"(va), + "r"(L) // %12 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14" + ); +#endif // __aarch64__ +#else + int sum0 = 0; + int sum1 = 0; + int sum2 = 0; + int sum3 = 0; + + for (int k=0; k> 3 + "cmp r4, #0 \n" + "beq 1f \n" + + "0: \n"// for(; nn != 0; nn--) + "pld [%1, #128] \n" + "vld1.s8 {d4-d7}, [%1]! \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37 a(inch)(data) + "vmovl.s8 q5, d7 \n"// a30-a37 + "vmovl.s8 q4, d6 \n"// a20-a27 + "vmovl.s8 q3, d5 \n"// a10-a17 + "vmovl.s8 q2, d4 \n"// a00-a07 + + "pld [%2, #128] \n" + "vld1.s8 {d0}, [%2]! \n"// kptr k00-k07 k(outch)(inch) + "vmovl.s8 q1, d1 \n"// k04,k05,k06,k07 + "vmovl.s8 q0, d0 \n"// k00,k01,k02,k03 + + "vmlal.s16 q6, d4, d0[0] \n"// (a00-a07) * k00 + "vmlal.s16 q7, d5, d0[0] \n" + "vmlal.s16 q6, d6, d0[1] \n"// (a10-a17) * k01 + "vmlal.s16 q7, d7, d0[1] \n" + "vmlal.s16 q6, d8, d0[2] \n"// (a20-a27) * k02 + "vmlal.s16 q7, d9, d0[2] \n" + "vmlal.s16 q6, d10, d0[3] \n"// (a30-a37) * k03 + "vmlal.s16 q7, d11, d0[3] \n" + + "pld [%1, #128] \n" + "vld1.s8 {d4-d7}, [%1]! \n"// tmpr a40-a47,a50-a57,a60-a67,a70-a77 a(inch)(data) + "vmovl.s8 q5, d7 \n"// a70-a77 + "vmovl.s8 q4, d6 \n"// a60-a67 + "vmovl.s8 q3, d5 \n"// a50-a57 + "vmovl.s8 q2, d4 \n"// a40-a47 + + "vmlal.s16 q6, d4, d1[0] \n"// (a00-a07) * k00 + "vmlal.s16 q7, d5, d1[0] \n" + "vmlal.s16 q6, d6, d1[1] \n"// (a10-a17) * k01 + "vmlal.s16 q7, d7, d1[1] \n" + "vmlal.s16 q6, d8, d1[2] \n"// (a20-a27) * k02 + "vmlal.s16 q7, d9, d1[2] \n" + "vmlal.s16 q6, d10, d1[3] \n"// (a30-a37) * k03 + "vmlal.s16 q7, d11, d1[3] \n" + + "subs r4, r4, #1 \n" + "bne 0b \n"// end for + + "1: \n" + // remain loop + "and r4, %6, #7 \n"// r4 = remain = inch & 7 + "cmp r4, #0 \n" + "beq 3f \n" + + "2: \n"// for(; remain != 0; remain--) + "vld1.s8 {d2}, [%1]! \n"// tmpr a00-a07 a(inch)(data) + "vld1.s8 {d0}, [%2] \n"// kptr k00 k(outch)(inch) + "vmovl.s8 q1, d2 \n" + "vmovl.s8 q0, d0 \n" + "add %2, #1 \n" + + "vmlal.s16 q6, d2, d0[0] \n"// (a00-a07) * k00 + "vmlal.s16 q7, d3, d0[0] \n" + + "subs r4, r4, #1 \n" + "bne 2b \n" + + "3: \n"// store the result to memory + "vst1.s32 {d12-d15}, [%0] \n" + + : "=r"(output), // %0 + "=r"(vb), // %1 + "=r"(va) // %2 + : "0"(output), + "1"(vb), + "2"(va), + "r"(L) // %6 + : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +#endif // __aarch64__ +#else + int sum[8] = {0}; + + int k=0; + for (; k+7 #endif // __ARM_NEON -#if __aarch64__ -static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) -{ - int w = bottom_blob.w; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p = 0; p < outch; p++) - { - Mat out = top_blob.channel(p); - - const signed char* kernel = (const signed char *)_kernel + p*9; - - int* outptr0 = out; - int* outptr0n = outptr0 + outw; - - const signed char* img0 = bottom_blob.channel(p); - - const signed char* r0 = img0; - const signed char* r1 = img0 + w; - const signed char* r2 = img0 + w*2; - const signed char* r3 = img0 + w*3; - - int i = 0; - - int8x8_t _k0 = vdup_n_s8(kernel[0]); - int8x8_t _k1 = vdup_n_s8(kernel[1]); - int8x8_t _k2 = vdup_n_s8(kernel[2]); - - int8x8_t _k3 = vdup_n_s8(kernel[3]); - int8x8_t _k4 = vdup_n_s8(kernel[4]); - int8x8_t _k5 = vdup_n_s8(kernel[5]); - - int8x8_t _k6 = vdup_n_s8(kernel[6]); - int8x8_t _k7 = vdup_n_s8(kernel[7]); - int8x8_t _k8 = vdup_n_s8(kernel[8]); - - for (; i+1 < outh; i+=2) - { - int nn = outw >> 3; - int remain = outw & 7; - - for (; nn >0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - int8x8_t _r0n = vld1_s8(r0+8); - int8x8_t _r01 = vext_s8(_r0, _r0n, 1); - int8x8_t _r02 = vext_s8(_r0, _r0n, 2); - - int16x8_t _sum0 = vmull_s8(_r0, _k0); - _sum0 = vmlal_s8(_sum0, _r01, _k1); - _sum0 = vmlal_s8(_sum0, _r02, _k2); - - int8x8_t _r1 = vld1_s8(r1); - int8x8_t _r1n = vld1_s8(r1+8); - int8x8_t _r11 = vext_s8(_r1, _r1n, 1); - int8x8_t _r12 = vext_s8(_r1, _r1n, 2); - _sum0 = vmlal_s8(_sum0, _r1, _k3); - _sum0 = vmlal_s8(_sum0, _r11, _k4); - _sum0 = vmlal_s8(_sum0, _r12, _k5); - - int16x8_t _sum1 = vmull_s8(_r1, _k0); - _sum1 = vmlal_s8(_sum1, _r11, _k1); - _sum1 = vmlal_s8(_sum1, _r12, _k2); - - int8x8_t _r2 = vld1_s8(r2); - int8x8_t _r2n = vld1_s8(r2+8); - int8x8_t _r21 = vext_s8(_r2, _r2n, 1); - int8x8_t _r22 = vext_s8(_r2, _r2n, 2); - _sum0 = vmlal_s8(_sum0, _r2, _k6); - _sum0 = vmlal_s8(_sum0, _r21, _k7); - _sum0 = vmlal_s8(_sum0, _r22, _k8); - - _sum1 = vmlal_s8(_sum1, _r2, _k3); - _sum1 = vmlal_s8(_sum1, _r21, _k4); - _sum1 = vmlal_s8(_sum1, _r22, _k5); - - int8x8_t _r3 = vld1_s8(r3); - int8x8_t _r3n = vld1_s8(r3+8); - int8x8_t _r31 = vext_s8(_r3, _r3n, 1); - int8x8_t _r32 = vext_s8(_r3, _r3n, 2); - _sum1 = vmlal_s8(_sum1, _r3, _k6); - _sum1 = vmlal_s8(_sum1, _r31, _k7); - _sum1 = vmlal_s8(_sum1, _r32, _k8); - - int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0)); - int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0)); - - vst1q_s32(outptr0, sum0_s32); - vst1q_s32(outptr0+4, sum0n_s32); - - int32x4_t sum1_s32 = vmovl_s16(vget_low_s16(_sum1)); - int32x4_t sum1n_s32 = vmovl_s16(vget_high_s16(_sum1)); - - vst1q_s32(outptr0n, sum1_s32); - vst1q_s32(outptr0n+4, sum1n_s32); - - r0 += 8; - r1 += 8; - r2 += 8; - r3 += 8; - outptr0 += 8; - outptr0n += 8; - } - - for (; remain>0; remain--) - { - //Todo Neon - - int sum0 = 0; - int sum0n = 0; - - sum0 += (int)r0[0] * kernel[0]; - sum0 += (int)r0[1] * kernel[1]; - sum0 += (int)r0[2] * kernel[2]; - sum0 += (int)r1[0] * kernel[3]; - sum0 += (int)r1[1] * kernel[4]; - sum0 += (int)r1[2] * kernel[5]; - sum0 += (int)r2[0] * kernel[6]; - sum0 += (int)r2[1] * kernel[7]; - sum0 += (int)r2[2] * kernel[8]; - - sum0n += (int)r1[0] * kernel[0]; - sum0n += (int)r1[1] * kernel[1]; - sum0n += (int)r1[2] * kernel[2]; - sum0n += (int)r2[0] * kernel[3]; - sum0n += (int)r2[1] * kernel[4]; - sum0n += (int)r2[2] * kernel[5]; - sum0n += (int)r3[0] * kernel[6]; - sum0n += (int)r3[1] * kernel[7]; - sum0n += (int)r3[2] * kernel[8]; - - *outptr0 = sum0; - *outptr0n = sum0n; - - r0++; - r1++; - r2++; - r3++; - outptr0++; - outptr0n++; - } - - r0 += 2 + w; - r1 += 2 + w; - r2 += 2 + w; - r3 += 2 + w; - - outptr0 += outw; - outptr0n += outw; - } - - for (; i < outh; i++) - { - int nn = outw >> 3; - int remain = outw & 7; - - for (; nn >0; nn--) - { - int8x8_t _r0 = vld1_s8(r0); - int8x8_t _r0n = vld1_s8(r0+8); - int8x8_t _r01 = vext_s8(_r0, _r0n, 1); - int8x8_t _r02 = vext_s8(_r0, _r0n, 2); - - int16x8_t _sum0 = vmull_s8(_r0, _k0); - _sum0 = vmlal_s8(_sum0, _r01, _k1); - _sum0 = vmlal_s8(_sum0, _r02, _k2); - - int8x8_t _r1 = vld1_s8(r1); - int8x8_t _r1n = vld1_s8(r1+8); - int8x8_t _r11 = vext_s8(_r1, _r1n, 1); - int8x8_t _r12 = vext_s8(_r1, _r1n, 2); - _sum0 = vmlal_s8(_sum0, _r1, _k3); - _sum0 = vmlal_s8(_sum0, _r11, _k4); - _sum0 = vmlal_s8(_sum0, _r12, _k5); - - int8x8_t _r2 = vld1_s8(r2); - int8x8_t _r2n = vld1_s8(r2+8); - int8x8_t _r21 = vext_s8(_r2, _r2n, 1); - int8x8_t _r22 = vext_s8(_r2, _r2n, 2); - _sum0 = vmlal_s8(_sum0, _r2, _k6); - _sum0 = vmlal_s8(_sum0, _r21, _k7); - _sum0 = vmlal_s8(_sum0, _r22, _k8); - - int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0)); - int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0)); - - vst1q_s32(outptr0, sum0_s32); - vst1q_s32(outptr0+4, sum0n_s32); - - r0 += 8; - r1 += 8; - r2 += 8; - outptr0 += 8; - } - - for (; remain>0; remain--) - { - int sum = 0; - - sum += (int)r0[0] * kernel[0]; - sum += (int)r0[1] * kernel[1]; - sum += (int)r0[2] * kernel[2]; - sum += (int)r1[0] * kernel[3]; - sum += (int)r1[1] * kernel[4]; - sum += (int)r1[2] * kernel[5]; - sum += (int)r2[0] * kernel[6]; - sum += (int)r2[1] * kernel[7]; - sum += (int)r2[2] * kernel[8]; - - *outptr0 = sum; - - r0++; - r1++; - r2++; - outptr0++; - } - - r0 += 2; - r1 += 2; - r2 += 2; - } - } -} - -static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) -{ - int w = bottom_blob.w; - - int outw = top_blob.w; - int outh = top_blob.h; - int outch = top_blob.c; - - const int tailstep = w - 2*outw + w; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int p=0; p> 3; - int remain = outw & 7; - - for (; nn > 0; nn--) - { - int8x8x2_t _r0 = vld2_s8(r0); - int8x8x2_t _r0n = vld2_s8(r0+16); - int8x8_t _r00 = _r0.val[0]; - int8x8_t _r01 = _r0.val[1]; - int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1); - - int16x8_t _sum = vmull_s8(_r00, _k0); - _sum = vmlal_s8(_sum, _r01, _k1); - _sum = vmlal_s8(_sum, _r02, _k2); - - int8x8x2_t _r1 = vld2_s8(r1); - int8x8x2_t _r1n = vld2_s8(r1+16); - int8x8_t _r10 = _r1.val[0]; - int8x8_t _r11 = _r1.val[1]; - int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1); - _sum = vmlal_s8(_sum, _r10, _k3); - _sum = vmlal_s8(_sum, _r11, _k4); - _sum = vmlal_s8(_sum, _r12, _k5); - - int8x8x2_t _r2 = vld2_s8(r2); - int8x8x2_t _r2n = vld2_s8(r2+16); - int8x8_t _r20 = _r2.val[0]; - int8x8_t _r21 = _r2.val[1]; - int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1); - _sum = vmlal_s8(_sum, _r20, _k6); - _sum = vmlal_s8(_sum, _r21, _k7); - _sum = vmlal_s8(_sum, _r22, _k8); - - int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum)); - int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum)); - - vst1q_s32(outptr, sum0_s32); - vst1q_s32(outptr+4, sum0n_s32); - - r0 += 16; - r1 += 16; - r2 += 16; - outptr += 8; - } - - for (; remain>0; remain--) - { - int sum = 0; - - sum += (int)r0[0] * kernel[0]; - sum += (int)r0[1] * kernel[1]; - sum += (int)r0[2] * kernel[2]; - sum += (int)r1[0] * kernel[3]; - sum += (int)r1[1] * kernel[4]; - sum += (int)r1[2] * kernel[5]; - sum += (int)r2[0] * kernel[6]; - sum += (int)r2[1] * kernel[7]; - sum += (int)r2[2] * kernel[8]; - - *outptr = sum; - - r0 += 2; - r1 += 2; - r2 += 2; - outptr++; - } - - r0 += tailstep; - r1 += tailstep; - r2 += tailstep; - } - } -} -#else // __aarch64__ static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) { int w = bottom_blob.w; @@ -824,5 +483,3 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M } } } - -#endif diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp index a8b2c76d7..2a76f1333 100644 --- a/src/layer/arm/convolutiondepthwise_arm.cpp +++ b/src/layer/arm/convolutiondepthwise_arm.cpp @@ -13,7 +13,7 @@ // specific language governing permissions and limitations under the License. #include "convolutiondepthwise_arm.h" - +#include "benchmark.h" #ifdef _OPENMP #include #endif @@ -147,6 +147,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con Mat bottom_blob_unbordered = bottom_blob; if (use_int8_inference && elemsize != 1) { + // start = ncnn::get_current_time(); + Mat bottom_blob_int8; bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator); if (bottom_blob_int8.empty()) @@ -167,8 +169,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g); } - bottom_blob_unbordered = bottom_blob_int8; - } + bottom_blob_unbordered = bottom_blob_int8; + } Mat bottom_blob_bordered = bottom_blob_unbordered; if (pad_w > 0 || pad_h > 0) @@ -211,25 +213,67 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con { if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) { - if (stride_w == 1 && stride_h == 1) + if (use_int8_requantize) { - convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); + Mat top_blob_tm; + top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_tm.empty()) + return -100; + + top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt); + } + + // requantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward(top_blob_tm_g, top_blob_g, opt_g); + } } - else if (stride_w == 2 && stride_h == 2) + else { - convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); - } - - // dequantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) - for (int g=0; gforward_inplace(top_blob_g, opt_g); + // start = ncnn::get_current_time(); + + top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt); + } + + // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward_inplace(top_blob_g, opt_g); + } } return 0; diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp index be7fb00da..5cbd3f782 100644 --- a/src/layer/arm/quantize_arm.cpp +++ b/src/layer/arm/quantize_arm.cpp @@ -31,19 +31,6 @@ static inline signed char float2int8(float v) int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { -#if !__aarch64__ && __ARM_NEON - int FPSCR_value = 0; - - asm volatile( - "vmrs %0, FPSCR \n" - "bic r10, %0, #0x00c00000 \n" - "vmsr FPSCR, r10 \n" - : "=r"(FPSCR_value) - : - : "memory", "r10" - ); -#endif - int dims = bottom_blob.dims; if (dims == 1) @@ -200,15 +187,6 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } } -#if !__aarch64__ && __ARM_NEON - asm volatile( - "vmsr FPSCR, %0 \n" - : - : "r"(FPSCR_value) - : "memory" - ); -#endif - return 0; } diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp index 9f3d541bc..f0fa4f80d 100644 --- a/src/layer/arm/relu_arm.cpp +++ b/src/layer/arm/relu_arm.cpp @@ -22,8 +22,92 @@ namespace ncnn { DEFINE_LAYER_CREATOR(ReLU_arm) +int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (slope == 0.f) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 4; + int remain = size - (nn << 4); +#else + int remain = size; +#endif // __ARM_NEON + +#if __ARM_NEON +#if __aarch64__ + int8x16_t _zero = vdupq_n_s8(0); + for (; nn>0; nn--) + { + int8x16_t _p = vld1q_s8(ptr); + _p = vmaxq_s8(_p, _zero); + vst1q_s8(ptr, _p); + + ptr += 16; + } +#else + if (nn > 0) + { + asm volatile( + "veor q1, q0, q0 \n" + "0: \n" + "pld [%1, #128] \n" + "vld1.s8 {d0-d1}, [%1 :128] \n" + "vmax.s8 q0, q0, q1 \n" + "subs %0, #1 \n" + "vst1.s8 {d0-d1}, [%1 :128]! \n" + "bne 0b \n" + : "=r"(nn), // %0 + "=r"(ptr) // %1 + : "0"(nn), + "1"(ptr) + : "cc", "memory", "q0", "q1" + ); + } +#endif // __aarch64__ +#endif // __ARM_NEON + for (; remain>0; remain--) + { + if (*ptr < 0) + *ptr = 0; + + ptr++; + } + } + } + else + { + // TODO + // #pragma omp parallel for num_threads(opt.num_threads) + // for (int q=0; q + +#if __ARM_NEON +#include +#endif // __ARM_NEON + +namespace ncnn { + +DEFINE_LAYER_CREATOR(Requantize_arm) + +static inline signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -128) return -128; + return (signed char)int32; +} + +int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + int w = bottom_blob.w; + + const int* intptr = bottom_blob; + signed char * ptr = top_blob; + + if (bias_term) + { + if (bias_data_size > 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i=0; i(i); + signed char* ptr = top_blob.row(i); + + float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0]; + + for (int j=0; j(i); + signed char* ptr = top_blob.row(i); + + for (int j=0; j 1 ? bias_data[q] : bias_data[0]; + +#if __ARM_NEON + int nn = size >> 3; + int remain = size & 7; + +#if __aarch64__ + for (; nn>0; nn--) + { + ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out); + ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out); + ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out); + ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out); + ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out); + ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out); + ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out); + ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out); + + ptr += 8; + intptr += 8; + } +#else + if (nn > 0) + { + asm volatile( + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data + "vdup.f32 q10, %6 \n" //q10 scale_in + "vdup.f32 q11, %7 \n" //q11 scale_out + "vdup.f32 q12, %8 \n" //q12 bias + "0: \n" + // top_s32 -> top_f32 + "vcvt.f32.s32 q0, q0 \n" + "vcvt.f32.s32 q1, q1 \n" + // top_f32 = top_f32 * scale_int + "vmul.f32 q0, q0, q10 \n" + "vmul.f32 q1, q1, q10 \n" + // top_f32 = top_f32 + bias + "vadd.f32 q0, q0, q12 \n" + "vadd.f32 q1, q1, q12 \n" + // top_f32 = top_f32 * scale_out + "vmul.f32 q0, q0, q11 \n" + "vmul.f32 q1, q1, q11 \n" + // top_f32 -> top_s32 + "vcvtr.s32.f32 s0, s0 \n" + "vcvtr.s32.f32 s1, s1 \n" + "vcvtr.s32.f32 s2, s2 \n" + "vcvtr.s32.f32 s3, s3 \n" + "vcvtr.s32.f32 s4, s4 \n" + "vcvtr.s32.f32 s5, s5 \n" + "vcvtr.s32.f32 s6, s6 \n" + "vcvtr.s32.f32 s7, s7 \n" + // top_s32 -> top_s16 + "vqmovn.s32 d4, q0 \n" + "vqmovn.s32 d5, q1 \n" + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data + // top_s16 -> top_s8 + "vqmovn.s16 d4, q2 \n" + // save top_s8 + "vst1.8 {d4}, [%2:64]! \n" + "subs %0, #1 \n" + "bne 0b \n" + "sub %1, #32 \n" + : "=r"(nn), // %0 + "=r"(intptr), // %1 + "=r"(ptr) // %2 + : "0"(nn), + "1"(intptr), + "2"(ptr), + "r"(scale_in), // %6 + "r"(scale_out), // %7 + "r"(bias) // %8 + : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12" + ); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out); + + intptr++; + ptr ++; + } + } + } + else + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 3; + int remain = size & 7; + +#if __aarch64__ + //TODO + for (; nn>0; nn--) + { + ptr[0] = float2int8(intptr[0] * scale_in * scale_out); + ptr[1] = float2int8(intptr[1] * scale_in * scale_out); + ptr[2] = float2int8(intptr[2] * scale_in * scale_out); + ptr[3] = float2int8(intptr[3] * scale_in * scale_out); + ptr[4] = float2int8(intptr[4] * scale_in * scale_out); + ptr[5] = float2int8(intptr[5] * scale_in * scale_out); + ptr[6] = float2int8(intptr[6] * scale_in * scale_out); + ptr[7] = float2int8(intptr[7] * scale_in * scale_out); + + ptr += 8; + intptr += 8; + } +#else + if (nn > 0) + { + asm volatile( + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data + "vdup.f32 q10, %6 \n" //q10 scale_in + "vdup.f32 q11, %7 \n" //q11 scale_out + "0: \n" + // top_s32 -> top_f32 + "vcvt.f32.s32 q0, q0 \n" + "vcvt.f32.s32 q1, q1 \n" + // top_f32 = top_f32 * scale_int + "vmul.f32 q0, q0, q10 \n" + "vmul.f32 q1, q1, q10 \n" + // top_f32 = top_f32 * scale_out + "vmul.f32 q0, q0, q11 \n" + "vmul.f32 q1, q1, q11 \n" + // top_f32 -> top_s32 + "vcvtr.s32.f32 s0, s0 \n" + "vcvtr.s32.f32 s1, s1 \n" + "vcvtr.s32.f32 s2, s2 \n" + "vcvtr.s32.f32 s3, s3 \n" + "vcvtr.s32.f32 s4, s4 \n" + "vcvtr.s32.f32 s5, s5 \n" + "vcvtr.s32.f32 s6, s6 \n" + "vcvtr.s32.f32 s7, s7 \n" + // top_s32 -> top_s16 + "vqmovn.s32 d4, q0 \n" + "vqmovn.s32 d5, q1 \n" + "pld [%1, #256] \n" + "vld1.s32 {d0-d3}, [%1:128]! \n" //q0-q1 data + // top_s16 -> top_s8 + "vqmovn.s16 d4, q2 \n" + // save top_s8 + "vst1.8 {d4}, [%2:64]! \n" + "subs %0, #1 \n" + "bne 0b \n" + "sub %1, #32 \n" + : "=r"(nn), // %0 + "=r"(intptr), // %1 + "=r"(ptr) // %2 + : "0"(nn), + "1"(intptr), + "2"(ptr), + "r"(scale_in), // %6 + "r"(scale_out) // %7 + : "cc", "memory", "q0", "q1", "q2", "q10", "q11" + ); + } +#endif // __aarch64__ +#else + int remain = size; +#endif // __ARM_NEON + + for (; remain > 0; remain--) + { + *ptr = float2int8(*intptr * scale_in * scale_out); + + intptr++; + ptr ++; + } + } + } + } + + return 0; +} + +} // namespace ncnn \ No newline at end of file diff --git a/src/layer/arm/requantize_arm.h b/src/layer/arm/requantize_arm.h new file mode 100644 index 000000000..1bfd40068 --- /dev/null +++ b/src/layer/arm/requantize_arm.h @@ -0,0 +1,30 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#ifndef LAYER_REQUANTIZE_ARM_H +#define LAYER_REQUANTIZE_ARM_H + +#include "requantize.h" + +namespace ncnn { + +class Requantize_arm : public Requantize +{ +public: + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +}; + +} // namespace ncnn + +#endif // LAYER_REQUANTIZE_ARM_H \ No newline at end of file diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp index 29e031427..b1632a477 100644 --- a/src/layer/convolution.cpp +++ b/src/layer/convolution.cpp @@ -25,6 +25,7 @@ Convolution::Convolution() one_blob_only = true; support_inplace = false; support_vulkan = true; + use_int8_requantize = false; #if NCNN_VULKAN padding = 0; @@ -42,7 +43,6 @@ Convolution::Convolution() #endif // NCNN_VULKAN quantize = 0; - dequantize = 0; } Convolution::~Convolution() @@ -52,7 +52,14 @@ Convolution::~Convolution() #endif // NCNN_VULKAN delete quantize; - delete dequantize; + + for (int i=0; i<(int)dequantize_ops.size(); i++) + delete dequantize_ops[i]; + dequantize_ops.clear(); + + for (int i=0; i<(int)requantize_ops.size(); i++) + delete requantize_ops[i]; + requantize_ops.clear(); } int Convolution::load_param(const ParamDict& pd) @@ -113,10 +120,18 @@ int Convolution::load_model(const ModelBin& mb) if (int8_scale_term) { - weight_data_int8_scale = mb.load(1, 1)[0]; + weight_data_int8_scales = mb.load(num_output, 1); bottom_blob_int8_scale = mb.load(1, 1)[0]; } + for (int i=0; i<(int)dequantize_ops.size(); i++) + delete dequantize_ops[i]; + dequantize_ops.clear(); + + for (int i=0; i<(int)requantize_ops.size(); i++) + delete requantize_ops[i]; + requantize_ops.clear(); + bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u); bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); @@ -126,27 +141,39 @@ int Convolution::load_model(const ModelBin& mb) return -1; } + // runtime quantize the weight data if (weight_data_is_float32 && use_int8_inference) { // quantize weight to int8 - Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize); + Mat int8_weight_data(weight_data_size, (size_t)1u); + if (int8_weight_data.empty()) + return -100; - ncnn::ParamDict pd; - pd.set(0, weight_data_int8_scale);// scale + const int weight_data_size_output = weight_data_size / num_output; + + for (int n=0; nload_param(pd); + ncnn::ParamDict pd; + pd.set(0, weight_data_int8_scales[n]);// scale - Mat int8_weight_data; - op->forward(weight_data, int8_weight_data); + op->load_param(pd); - delete op; + ncnn::Option opt = ncnn::get_default_option(); + opt.blob_allocator = int8_weight_data.allocator; - if (int8_weight_data.empty()) - return -100; + const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output); + Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output); + op->forward(weight_data_n, int8_weight_data_n, opt); + + delete op; + } weight_data = int8_weight_data; } + // initial the quantize,dequantize op layer if (use_int8_inference) { quantize = ncnn::create_layer(ncnn::LayerType::Quantize); @@ -157,22 +184,74 @@ int Convolution::load_model(const ModelBin& mb) quantize->load_param(pd); } - dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize); + dequantize_ops.resize(num_output); + for (int n=0; nload_param(pd); + dequantize_ops[n]->load_param(pd); ncnn::Mat weights[1]; - weights[0] = bias_data; + weights[0] = bias_data.range(n, 1); - dequantize->load_model(ModelBinFromMatArray(weights)); + dequantize_ops[n]->load_model(ModelBinFromMatArray(weights)); + } + } + + return 0; +} + +int Convolution::create_requantize_op(void) +{ + if (!use_int8_requantize) + { + fprintf(stderr, "requantized op set but use_int8_requantize disabled\n"); + return -1; + } + + requantize_ops.resize(num_output); + for (int n=0; nload_param(pd); + + ncnn::Mat weights[1]; + weights[0] = bias_data.range(n, 1); + + requantize_ops[n]->load_model(ModelBinFromMatArray(weights)); } return 0; @@ -210,7 +289,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op if (int8_scale_term) { - weights[2] = Mat(1, (size_t)4u, (void*)&weight_data_int8_scale); + weights[2] = weight_data_int8_scales; weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale); } @@ -309,50 +388,118 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op if (use_int8_inference) { - // num_output - #pragma omp parallel for num_threads(opt.num_threads) - for (int p=0; p(i*stride_h) + j*stride_w; + int sum = 0; + + const signed char* kptr = (const signed char*)weight_data + maxk * channels * p; - for (int k = 0; k < maxk; k++) + // channels + for (int q=0; q(i*stride_h) + j*stride_w; + + for (int k = 0; k < maxk; k++) + { + int val = sptr[ space_ofs[k] ]; + int w = kptr[k]; + sum += val * w; + } + + kptr += maxk; } - kptr += maxk; + outptr[j] = sum; } - outptr[j] = sum; + outptr += outw; } - outptr += outw; + // requantize, reverse scale inplace + { + ncnn::Option opt_g = opt; + opt_g.num_threads = 1; + opt_g.blob_allocator = top_blob.allocator; + + Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1); + Mat top_blob_g = top_blob.channel_range(p, 1); + requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g); + } } } - - // dequantize, reverse scale inplace + else { - ncnn::Option opt_g = opt; - opt_g.blob_allocator = top_blob.allocator; + top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + // num_output + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward_inplace(top_blob, opt_g); - } + for (int i = 0; i < outh; i++) + { + for (int j = 0; j < outw; j++) + { + int sum = 0; + + const signed char* kptr = (const signed char*)weight_data + maxk * channels * p; + + // channels + for (int q=0; q(i*stride_h) + j*stride_w; + + for (int k = 0; k < maxk; k++) + { + int val = sptr[ space_ofs[k] ]; + int w = kptr[k]; + sum += val * w; + } + + kptr += maxk; + } + + outptr[j] = sum; + } + + outptr += outw; + } + + // dequantize, reverse scale inplace + { + ncnn::Option opt_g = opt; + opt_g.num_threads = 1; + opt_g.blob_allocator = top_blob.allocator; + + Mat top_blob_g = top_blob.channel_range(p, 1); + dequantize_ops[p]->forward_inplace(top_blob_g, opt_g); + } + } + } return 0; } diff --git a/src/layer/convolution.h b/src/layer/convolution.h index 6c4566797..1c244daf1 100644 --- a/src/layer/convolution.h +++ b/src/layer/convolution.h @@ -29,6 +29,8 @@ public: virtual int load_model(const ModelBin& mb); + virtual int create_requantize_op(void); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #if NCNN_VULKAN @@ -91,13 +93,16 @@ public: Pipeline* pipeline_innerproduct_pack4to1; #endif // NCNN_VULKAN - float weight_data_int8_scale; + Mat weight_data_int8_scales; float bottom_blob_int8_scale; + float top_blob_int8_scale; bool use_int8_inference; + bool use_int8_requantize; ncnn::Layer* quantize; - ncnn::Layer* dequantize; + std::vector dequantize_ops; + std::vector requantize_ops; }; } // namespace ncnn diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp index fe1ed1191..e01916eae 100644 --- a/src/layer/convolutiondepthwise.cpp +++ b/src/layer/convolutiondepthwise.cpp @@ -25,6 +25,7 @@ ConvolutionDepthWise::ConvolutionDepthWise() one_blob_only = true; support_inplace = false; support_vulkan = true; + use_int8_requantize = false; #if NCNN_VULKAN padding = 0; @@ -58,6 +59,11 @@ ConvolutionDepthWise::~ConvolutionDepthWise() delete dequantize_ops[i]; dequantize_ops.clear(); + + for (int i=0; i<(int)requantize_ops.size(); i++) + delete requantize_ops[i]; + + requantize_ops.clear(); } int ConvolutionDepthWise::load_param(const ParamDict& pd) @@ -150,7 +156,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) if (int8_scale_term == 1) { weight_data_int8_scales = mb.load(group, 1); - bottom_blob_int8_scales = mb.load(group, 1); + bottom_blob_int8_scales = mb.load(1, 1); + + float bottom_blob_int8_scale = bottom_blob_int8_scales[0]; + bottom_blob_int8_scales = Mat(group); + bottom_blob_int8_scales.fill(bottom_blob_int8_scale); } else if (int8_scale_term == 2) { @@ -177,6 +187,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) dequantize_ops.clear(); + for (int i=0; i<(int)requantize_ops.size(); i++) + delete requantize_ops[i]; + + requantize_ops.clear(); + bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u); bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u); @@ -236,7 +251,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) { dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize); - float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); + float top_rescale = 1.f; + if (weight_data_int8_scales[g] == 0) + top_rescale = 0; + else + top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]); ncnn::ParamDict pd; pd.set(0, top_rescale);// scale @@ -255,6 +274,50 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb) return 0; } +int ConvolutionDepthWise::create_requantize_op(void) +{ + if (!use_int8_requantize) + { + fprintf(stderr, "requantized op set but use_int8_requantize disabled\n"); + return -1; + } + + requantize_ops.resize(group); + for (int g=0; gload_param(pd); + + ncnn::Mat weights[1]; + weights[0] = bias_data.range(g, 1); + + requantize_ops[g]->load_model(ModelBinFromMatArray(weights)); + } + + return 0; +} + int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { // convolv with NxN kernel diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h index b936cdc72..c8a90e606 100644 --- a/src/layer/convolutiondepthwise.h +++ b/src/layer/convolutiondepthwise.h @@ -29,6 +29,8 @@ public: virtual int load_model(const ModelBin& mb); + virtual int create_requantize_op(void); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #if NCNN_VULKAN @@ -92,11 +94,14 @@ public: Mat weight_data_int8_scales; Mat bottom_blob_int8_scales; + float top_blob_int8_scale; bool use_int8_inference; + bool use_int8_requantize; std::vector quantize_ops; std::vector dequantize_ops; + std::vector requantize_ops; }; } // namespace ncnn diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp index 44adf09cd..4ec945dfc 100644 --- a/src/layer/innerproduct.cpp +++ b/src/layer/innerproduct.cpp @@ -36,7 +36,6 @@ InnerProduct::InnerProduct() #endif // NCNN_VULKAN quantize = 0; - dequantize = 0; } InnerProduct::~InnerProduct() @@ -46,7 +45,11 @@ InnerProduct::~InnerProduct() #endif // NCNN_VULKAN delete quantize; - delete dequantize; + + for (int i=0; i<(int)dequantize_ops.size(); i++) + delete dequantize_ops[i]; + + dequantize_ops.clear(); } int InnerProduct::load_param(const ParamDict& pd) @@ -92,7 +95,7 @@ int InnerProduct::load_model(const ModelBin& mb) if (int8_scale_term) { - weight_data_int8_scale = mb.load(1, 1)[0]; + weight_data_int8_scales = mb.load(num_output, 1); bottom_blob_int8_scale = mb.load(1, 1)[0]; } @@ -105,25 +108,71 @@ int InnerProduct::load_model(const ModelBin& mb) return -1; } + // initial the quantize,dequantize op layer if (use_int8_inference) { quantize = ncnn::create_layer(ncnn::LayerType::Quantize); - dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize); + { + ncnn::ParamDict pd; + pd.set(0, bottom_blob_int8_scale);// scale + + quantize->load_param(pd); + } + + dequantize_ops.resize(num_output); + for (int n=0; nload_param(pd); + + ncnn::Mat weights[1]; + weights[0] = bias_data.range(n, 1); + + dequantize_ops[n]->load_model(ModelBinFromMatArray(weights)); + } } + // runtime quantize the weight data if (weight_data_is_float32 && use_int8_inference) { // quantize weight to int8 - ncnn::ParamDict pd; - pd.set(0, weight_data_int8_scale);// scale + Mat int8_weight_data(weight_data_size, (size_t)1u); + if (int8_weight_data.empty()) + return -100; - quantize->load_param(pd); + const int weight_data_size_output = weight_data_size / num_output; - Mat int8_weight_data; - quantize->forward(weight_data, int8_weight_data); + for (int n=0; nload_param(pd); + + ncnn::Option opt = ncnn::get_default_option(); + opt.blob_allocator = int8_weight_data.allocator; + + const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output); + Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output); + op->forward(weight_data_n, int8_weight_data_n, opt); + + delete op; + } weight_data = int8_weight_data; } @@ -152,12 +201,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o // quantize, scale and round to nearest { - ncnn::ParamDict pd; - pd.set(0, bottom_blob_int8_scale);// scale + ncnn::Option opt_g = opt; + opt_g.blob_allocator = bottom_blob_int8.allocator; - quantize->load_param(pd); - - quantize->forward(bottom_blob, bottom_blob_int8, opt); + quantize->forward(bottom_blob, bottom_blob_int8, opt_g); } // num_output @@ -179,26 +226,24 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o } } - out[p] = sum; + out[p] = sum; } - // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pload_param(pd); - - ncnn::Mat weights[1]; - weights[0] = bias_data; - - dequantize->load_model(ModelBinFromMatArray(weights)); - - dequantize->forward_inplace(top_blob, opt); + int* out_s32 = top_blob; + float* out_f32 = top_blob; + float top_rescale = 1.f; + if (weight_data_int8_scales[p] == 0) + top_rescale = 0; + else + top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]); + + if (bias_term) + out_f32[p] = out_s32[p] * top_rescale + bias_data[p]; + else + out_f32[p] = out_s32[p] * top_rescale; } return 0; diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h index 0f9b151af..8ac451f43 100644 --- a/src/layer/innerproduct.h +++ b/src/layer/innerproduct.h @@ -76,13 +76,13 @@ public: Pipeline* pipeline_innerproduct_pack4to1; #endif // NCNN_VULKAN - float weight_data_int8_scale; + Mat weight_data_int8_scales; float bottom_blob_int8_scale; bool use_int8_inference; ncnn::Layer* quantize; - ncnn::Layer* dequantize; + std::vector dequantize_ops; }; } // namespace ncnn diff --git a/src/layer/relu.cpp b/src/layer/relu.cpp index e1c6bd2ae..dfffd8918 100644 --- a/src/layer/relu.cpp +++ b/src/layer/relu.cpp @@ -38,8 +38,51 @@ int ReLU::load_param(const ParamDict& pd) return 0; } +int ReLU::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const +{ + int w = bottom_top_blob.w; + int h = bottom_top_blob.h; + int channels = bottom_top_blob.c; + int size = w * h; + + if (slope == 0.f) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q + +namespace ncnn { + +DEFINE_LAYER_CREATOR(Requantize) + +Requantize::Requantize() +{ + one_blob_only = true; + support_inplace = false; + fusion_relu = false; +} + +static inline signed char float2int8(float v) +{ + int int32 = round(v); + if (int32 > 127) return 127; + if (int32 < -128) return -128; + return (signed char)int32; +} + +int Requantize::load_param(const ParamDict& pd) +{ + scale_in = pd.get(0, 1.f); // bottom_blob_scale * weight_scale + scale_out = pd.get(1, 1.f); // top_blob_scale + bias_term = pd.get(2, 0); + bias_data_size = pd.get(3, 0); + fusion_relu = pd.get(4, 0); + + return 0; +} + +int Requantize::load_model(const ModelBin& mb) +{ + if (bias_term) + { + bias_data = mb.load(bias_data_size, 1); + if (bias_data.empty()) + return -100; + } + + return 0; +} + +int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ + int dims = bottom_blob.dims; + + if (dims == 1) + { + int w = bottom_blob.w; + + const int* intptr = bottom_blob; + signed char * ptr = top_blob; + + if (bias_term) + { + if (bias_data_size > 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i=0; i(i); + signed char* ptr = top_blob.row(i); + + float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0]; + + for (int j=0; j(i); + signed char* ptr = top_blob.row(i); + + for (int j=0; j 1 ? bias_data[q] : bias_data[0]; + + for (int i=0; i> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp 0; remain--) + { + float sum = 0; + + sum += r0[0] * k0[0]; + sum += r0[1] * k0[1]; + sum += r0[2] * k0[2]; + sum += r1[0] * k1[0]; + sum += r1[1] * k1[1]; + sum += r1[2] * k1[2]; + sum += r2[0] * k2[0]; + sum += r2[1] * k2[1]; + sum += r2[2] * k2[2]; + + *outptr += sum; + + r0 += 2; + r1 += 2; + r2 += 2; + outptr++; + } + + r0 += tailstep; + r1 += tailstep; + r2 += tailstep; + } + } + } +} \ No newline at end of file diff --git a/src/layer/x86/convolution_3x3_int8.h b/src/layer/x86/convolution_3x3_int8.h index 4fd8f0ec8..4f1fc6407 100644 --- a/src/layer/x86/convolution_3x3_int8.h +++ b/src/layer/x86/convolution_3x3_int8.h @@ -11,12 +11,6 @@ // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR // CONDITIONS OF ANY KIND, either express or implied. See the License for the // specific language governing permissions and limitations under the License. -static inline short saturate2int16(int v) -{ - if (v > 32767) return 32767; - if (v < -32768) return -32768; - return (short)v; -} static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) { @@ -84,6 +78,424 @@ static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat } } +static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch) +{ + kernel_tm.create(4*4, inch, outch, 2ul); + + // G + const short ktm[4][3] = { + { 2, 0, 0}, + { 1, 1, 1}, + { 1, -1, 1}, + { 0, 0, 2} + }; + + #pragma omp parallel for + for (int p = 0; p(q); + + // transform kernel + const signed char* k0 = kernel0; + const signed char* k1 = kernel0 + 3; + const signed char* k2 = kernel0 + 6; + + // h + short tmp[4][3]; + for (int i=0; i<4; i++) + { + tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2]; + tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2]; + tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2]; + } + + // U + for (int j=0; j<4; j++) + { + short* tmpp = &tmp[j][0]; + + for (int i=0; i<4; i++) + { + kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2]; + } + } + } + } +} + +static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt) +{ + int w = bottom_blob.w; + int h = bottom_blob.h; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + // pad to 2n+2, winograd F(2,3) + Mat bottom_blob_bordered = bottom_blob; + + outw = (outw + 1) / 2 * 2; + outh = (outh + 1) / 2 * 2; + + w = outw + 2; + h = outh + 2; + copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads); + + // BEGIN transform input + Mat bottom_blob_tm; + { + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; + + int nColBlocks = h_tm/4; // may be the block num in Feathercnn + int nRowBlocks = w_tm/4; + + const int tiles = nColBlocks * nRowBlocks; + + bottom_blob_tm.create(4*4, tiles, inch, 2u, opt.workspace_allocator); + + // BT + // const float itm[4][4] = { + // {1.0f, 0.0f, -1.0f, 0.0f}, + // {0.0f, 1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 1.00f, 0.0f}, + // {0.0f, -1.0f, 0.00f, 1.0f} + // }; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int q=0; q> 2; + int remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp(i); + int* output1_tm = out1_tm.row(i); + int* output2_tm = out2_tm.row(i); + int* output3_tm = out3_tm.row(i); + + int sum0[16] = {0}; + int sum1[16] = {0}; + int sum2[16] = {0}; + int sum3[16] = {0}; + + int q = 0; + for (; q+3(i); + const short* r1 = bottom_blob_tm.channel(q+1).row(i); + const short* r2 = bottom_blob_tm.channel(q+2).row(i); + const short* r3 = bottom_blob_tm.channel(q+3).row(i); + + const short* k0 = kernel0_tm.row(q); + const short* k1 = kernel1_tm.row(q); + const short* k2 = kernel2_tm.row(q); + const short* k3 = kernel3_tm.row(q); + + for (int n=0; n<16; n++) + { + sum0[n] += (int)r0[n] * k0[n]; + k0 += 16; + sum0[n] += (int)r1[n] * k0[n]; + k0 += 16; + sum0[n] += (int)r2[n] * k0[n]; + k0 += 16; + sum0[n] += (int)r3[n] * k0[n]; + k0 -= 16 * 3; + + sum1[n] += (int)r0[n] * k1[n]; + k1 += 16; + sum1[n] += (int)r1[n] * k1[n]; + k1 += 16; + sum1[n] += (int)r2[n] * k1[n]; + k1 += 16; + sum1[n] += (int)r3[n] * k1[n]; + k1 -= 16 * 3; + + sum2[n] += (int)r0[n] * k2[n]; + k2 += 16; + sum2[n] += (int)r1[n] * k2[n]; + k2 += 16; + sum2[n] += (int)r2[n] * k2[n]; + k2 += 16; + sum2[n] += (int)r3[n] * k2[n]; + k2 -= 16 * 3; + + sum3[n] += (int)r0[n] * k3[n]; + k3 += 16; + sum3[n] += (int)r1[n] * k3[n]; + k3 += 16; + sum3[n] += (int)r2[n] * k3[n]; + k3 += 16; + sum3[n] += (int)r3[n] * k3[n]; + k3 -= 16 * 3; + } + } + + for (; q(i); + + const short* k0 = kernel0_tm.row(q); + const short* k1 = kernel1_tm.row(q); + const short* k2 = kernel2_tm.row(q); + const short* k3 = kernel3_tm.row(q); + + for (int n=0; n<16; n++) + { + sum0[n] += (int)r0[n] * k0[n]; + sum1[n] += (int)r0[n] * k1[n]; + sum2[n] += (int)r0[n] * k2[n]; + sum3[n] += (int)r0[n] * k3[n]; + } + } + + for (int n=0; n<16; n++) + { + output0_tm[n] = sum0[n]; + output1_tm[n] = sum1[n]; + output2_tm[n] = sum2[n]; + output3_tm[n] = sum3[n]; + } + } + } + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=remain_outch_start; p(i); + + int sum0[16] = {0}; + + int q = 0; + for (; q+3(i); + const short* r1 = bottom_blob_tm.channel(q+1).row(i); + const short* r2 = bottom_blob_tm.channel(q+2).row(i); + const short* r3 = bottom_blob_tm.channel(q+3).row(i); + + const short* k0 = kernel0_tm.row(q); + const short* k1 = kernel0_tm.row(q+1); + const short* k2 = kernel0_tm.row(q+2); + const short* k3 = kernel0_tm.row(q+3); + + for (int n=0; n<16; n++) + { + sum0[n] += (int)r0[n] * k0[n]; + sum0[n] += (int)r1[n] * k1[n]; + sum0[n] += (int)r2[n] * k2[n]; + sum0[n] += (int)r3[n] * k3[n]; + } + } + + for (; q(i); + const short* k0 = kernel0_tm.row(q); + + for (int n=0; n<16; n++) + { + sum0[n] += (int)r0[n] * k0[n]; + } + } + + for (int n=0; n<16; n++) + { + output0_tm[n] = sum0[n]; + } + } + } + } + bottom_blob_tm = Mat(); + // END dot + + // BEGIN transform output + Mat top_blob_bordered; + top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator); + { + // AT + // const float itm[2][4] = { + // {1.0f, 1.0f, 1.0f, 0.0f}, + // {0.0f, 1.0f, -1.0f, 1.0f} + // }; + + int w_tm = outw / 2 * 4; + int h_tm = outh / 2 * 4; + + int nColBlocks = h_tm/4; // may be the block num in Feathercnn + int nRowBlocks = w_tm/4; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; p(j*2); + int* outRow1 = out.row(j*2+1); + + for(int i=0; i(j*nRowBlocks + i); + + int s0[4],s1[4],s2[4],s3[4]; + int w0[4],w1[4]; + int d0[2],d1[2],d2[2],d3[2]; + int o0[2],o1[2]; + // load + for (int n = 0; n < 4; n++) + { + s0[n] = out_tile[n]; + s1[n] = out_tile[n+ 4]; + s2[n] = out_tile[n+ 8]; + s3[n] = out_tile[n+12]; + } + // w = A_T * W + for (int n = 0; n < 4; n++) + { + w0[n] = s0[n] + s1[n] + s2[n]; + w1[n] = s1[n] - s2[n] + s3[n]; + } + // transpose w to w_t + { + d0[0] = w0[0]; d0[1] = w1[0]; + d1[0] = w0[1]; d1[1] = w1[1]; + d2[0] = w0[2]; d2[1] = w1[2]; + d3[0] = w0[3]; d3[1] = w1[3]; + } + // Y = A_T * w_t + for (int n = 0; n < 2; n++) + { + o0[n] = d0[n] + d1[n] + d2[n]; + o1[n] = d1[n] - d2[n] + d3[n]; + } + // save to top blob tm,why right 2,because the G' = G*2 + outRow0[0] = o0[0] >> 2; + outRow0[1] = o0[1] >> 2; + outRow1[0] = o1[0] >> 2; + outRow1[1] = o1[1] >> 2; + + outRow0 += 2; + outRow1 += 2; + } + } + } + } + // END transform output + + // cut result pad + copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads); +} + static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) { int w = bottom_blob.w; @@ -122,23 +534,19 @@ static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat for (; remain > 0; remain--) { - short sum0 = 0; - short sum1 = 0; - short sum2 = 0; - - sum0 += (short)r0[0] * kernel0[0]; - sum0 += (short)r0[1] * kernel0[1]; - sum0 += (short)r0[2] * kernel0[2]; - sum1 += (short)r1[0] * kernel0[3]; - sum1 += (short)r1[1] * kernel0[4]; - sum1 += (short)r1[2] * kernel0[5]; - sum2 += (short)r2[0] * kernel0[6]; - sum2 += (short)r2[1] * kernel0[7]; - sum2 += (short)r2[2] * kernel0[8]; - - *outptr0 = saturate2int16(*outptr0 + sum0); - *outptr0 = saturate2int16(*outptr0 + sum1); - *outptr0 = saturate2int16(*outptr0 + sum2); + int sum0 = 0; + + sum0 += (int)r0[0] * kernel0[0]; + sum0 += (int)r0[1] * kernel0[1]; + sum0 += (int)r0[2] * kernel0[2]; + sum0 += (int)r1[0] * kernel0[3]; + sum0 += (int)r1[1] * kernel0[4]; + sum0 += (int)r1[2] * kernel0[5]; + sum0 += (int)r2[0] * kernel0[6]; + sum0 += (int)r2[1] * kernel0[7]; + sum0 += (int)r2[2] * kernel0[8]; + + *outptr0 += sum0; r0 += 2; r1 += 2; diff --git a/src/layer/x86/convolution_5x5_int8.h b/src/layer/x86/convolution_5x5_int8.h new file mode 100644 index 000000000..662034faf --- /dev/null +++ b/src/layer/x86/convolution_5x5_int8.h @@ -0,0 +1,35 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv5x5s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 5; + int kernel_h = 5; + + int stride_w = 1; + int stride_h = 1; + + conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} + +static void conv5x5s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 5; + int kernel_h = 5; + + int stride_w = 2; + int stride_h = 2; + + conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} diff --git a/src/layer/x86/convolution_7x7_int8.h b/src/layer/x86/convolution_7x7_int8.h new file mode 100644 index 000000000..1704c41f3 --- /dev/null +++ b/src/layer/x86/convolution_7x7_int8.h @@ -0,0 +1,35 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv7x7s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 7; + int kernel_h = 7; + + int stride_w = 1; + int stride_h = 1; + + conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} + +static void conv7x7s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt) +{ + int kernel_w = 7; + int kernel_h = 7; + + int stride_w = 2; + int stride_h = 2; + + conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); +} \ No newline at end of file diff --git a/src/layer/x86/convolution_sgemm_int8.h b/src/layer/x86/convolution_sgemm_int8.h new file mode 100644 index 000000000..753a08f54 --- /dev/null +++ b/src/layer/x86/convolution_sgemm_int8.h @@ -0,0 +1,381 @@ +// SenseNets is pleased to support the open source community by supporting ncnn available. +// +// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +static void conv_im2col_sgemm_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \ + const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt) +{ + int w = bottom_blob.w; + int inch = bottom_blob.c; + + int outw = top_blob.w; + int outh = top_blob.h; + int outch = top_blob.c; + + const signed char *kernel = _kernel; + + // im2col + Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator); + { + const int stride = kernel_h*kernel_w*outw*outh; + signed char* ret = (signed char*)bottom_im2col; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; p> 3; + int remain_size_start = nn_size << 3; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int ii=0; ii> 2; + remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp> 2; + remain_outch_start = nn_outch << 2; + + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp=0; pp= 16 && num_output >= 16) + use_winograd3x3 = true; + } + + return 0; +} + +int Convolution_x86::load_model(const ModelBin& mb) +{ + int ret = Convolution::load_model(mb); + if (ret != 0) + return ret; + + if (use_winograd3x3) + { + int num_input = weight_data_size / 9 / num_output; + + if (use_int8_inference) + conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output); + else + conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output); + } + + return 0; +} + int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const { int w = bottom_blob.w; @@ -147,7 +191,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option const int kernel_size = kernel_w; const int stride = stride_w; - if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h) + if (kernel_size > 7 || stride > 7 || dilation_w != dilation_h) { return Convolution::forward(bottom_blob, top_blob, opt); } @@ -155,26 +199,23 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&); // kernel_size x stride - conv_func conv_func_table[5][5] = + conv_func conv_func_table[7][4] = { { conv1x1s1_sse, conv1x1s2_sse, 0, - 0, 0 }, // kernel_size = 1 { 0, 0, 0, - 0, 0 }, // kernel_size = 2 { conv3x3s1_sse, - 0, - 0, + conv3x3s2_sse, 0, 0 }, // kernel_size = 3 @@ -182,35 +223,43 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option 0, 0, 0, - 0, 0 }, // kernel_size = 4 { conv5x5s1_sse, 0, + 0, + 0 + }, // kernel_size = 5 + { 0, 0, + 0, + 0 + }, // kernel_size = 6 + { + 0, + 0, + 0, 0 - } // kernel_size = 5 + } // kernel_size = 7 }; typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&); // kernel_size x stride - conv_int8_func conv_int8_func_table[5][5] = + conv_int8_func conv_int8_func_table[7][4] = { { conv1x1s1_int8_sse, conv1x1s2_int8_sse, 0, - 0, 0 }, // kernel_size = 1 { 0, 0, 0, - 0, 0 }, // kernel_size = 2 { @@ -218,22 +267,31 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option conv3x3s2_int8_sse, 0, 0, - 0 }, // kernel_size = 3 { 0, 0, 0, - 0, 0 }, // kernel_size = 4 + { + conv5x5s1_int8_sse, + conv5x5s2_int8_sse, + 0, + 0 + }, // kernel_size = 5 { 0, 0, 0, + 0 + }, // kernel_size = 6 + { + conv7x7s1_int8_sse, + conv7x7s2_int8_sse, 0, 0 - } // kernel_size = 5 + } // kernel_size = 7 }; conv_func conv = 0; @@ -322,21 +380,69 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option if (use_int8_inference) { - conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); - - // dequantize, reverse scale inplace + if (use_int8_requantize == true) { - ncnn::Option opt_g = opt; - opt_g.blob_allocator = top_blob.allocator; + Mat top_blob_tm; + top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_tm.empty()) + return -100; + + top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_winograd3x3) + conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt); + else + conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt); - dequantize->forward_inplace(top_blob, opt_g); + // requantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward(top_blob_tm_g, top_blob_g, opt_g); + } } + else + { + top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + if (use_winograd3x3) + conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt); + else + conv_int8(bottom_blob_bordered, top_blob, weight_data, opt); + + // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int p=0; pforward_inplace(top_blob_g, opt_g); + } + } + return 0; } - conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); - + if (use_winograd3x3) + { + conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt); + } + else + conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt); + return 0; } diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h index e72c14aca..0062548af 100644 --- a/src/layer/x86/convolution_x86.h +++ b/src/layer/x86/convolution_x86.h @@ -24,8 +24,16 @@ typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option class Convolution_x86 : public Convolution { public: + virtual int load_param(const ParamDict& pd); + + virtual int load_model(const ModelBin& mb); + virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const; + +public: + bool use_winograd3x3; + Mat weight_3x3_winograd23_data; }; } // namespace ncnn diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp index a440c1fac..d1911e8c7 100644 --- a/src/layer/x86/convolutiondepthwise_x86.cpp +++ b/src/layer/x86/convolutiondepthwise_x86.cpp @@ -134,7 +134,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con } const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; - const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; Mat bottom_blob_unbordered = bottom_blob; if (use_int8_inference && elemsize != 1) @@ -159,8 +159,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g); } - bottom_blob_unbordered = bottom_blob_int8; - } + bottom_blob_unbordered = bottom_blob_int8; + } Mat bottom_blob_bordered = bottom_blob_unbordered; if (pad_w > 0 || pad_h > 0) @@ -203,25 +203,65 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con { if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)) { - if (stride_w == 1 && stride_h == 1) + if (use_int8_requantize) { - convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); + Mat top_blob_tm; + top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator); + if (top_blob_tm.empty()) + return -100; + + top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt); + } + + // requantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward(top_blob_tm_g, top_blob_g, opt_g); + } } - else if (stride_w == 2 && stride_h == 2) + else { - convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); - } - - // dequantize, reverse scale inplace - #pragma omp parallel for num_threads(opt.num_threads) - for (int g=0; gforward_inplace(top_blob_g, opt_g); + top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (stride_w == 1 && stride_h == 1) + { + convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); + } + else if (stride_w == 2 && stride_h == 2) + { + convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt); + } + + // dequantize, reverse scale inplace + #pragma omp parallel for num_threads(opt.num_threads) + for (int g=0; gforward_inplace(top_blob_g, opt_g); + } } return 0; diff --git a/src/net.cpp b/src/net.cpp index fc3565734..0c129d12e 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -16,6 +16,9 @@ #include "layer_type.h" #include "modelbin.h" #include "paramdict.h" +#include "convolution.h" +#include "convolutiondepthwise.h" +#include "relu.h" #include #include @@ -679,6 +682,8 @@ int Net::load_model(FILE* fp) } #endif // NCNN_VULKAN + fuse_network(); + return ret; } @@ -898,6 +903,110 @@ int Net::load_model(const unsigned char* _mem) return mem - _mem; } +void Net::fuse_network() +{ + // set the int8 op fusion:requantize +#if NCNN_STRING && NCNN_REQUANT + // fprintf(stderr, "Test op fusion to int8 implement:\n"); + for (size_t i=0; itype == "Convolution" || layer->type == "ConvolutionDepthWise") + { + if (((Convolution*)layer)->use_int8_inference == false) + continue; + + for (size_t n=0; ntops[0]].consumers.size(); n++) + { + int layer_next_index = blobs[layer->tops[0]].consumers[n]; + Layer* layer_next = layers[layer_next_index]; + + if (layer_next->type == "ReLU") + { + int layer_next_2_index = blobs[layer_next->tops[0]].consumers[0]; + Layer* layer_next_2 = layers[layer_next_2_index]; + + if (layer_next_2->type == "Convolution" || layer_next_2->type == "ConvolutionDepthWise") + { + // fprintf(stderr, "%s, %s, %s\n", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); + if (layer->type == "Convolution" && layer_next_2->type == "Convolution") + { + ((Convolution*)layer)->use_int8_requantize = true; + ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; + ((Convolution*)layer)->create_requantize_op(); + } + else if (layer->type == "ConvolutionDepthWise" && layer_next_2->type == "Convolution") + { + ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; + ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_2)->bottom_blob_int8_scale; + ((ConvolutionDepthWise*)layer)->create_requantize_op(); + } + else if (layer->type == "Convolution" && layer_next_2->type == "ConvolutionDepthWise") + { + ((Convolution*)layer)->use_int8_requantize = true; + ((Convolution*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; + ((Convolution*)layer)->create_requantize_op(); + } + else + { + ((ConvolutionDepthWise*)layer)->use_int8_requantize = true; + ((ConvolutionDepthWise*)layer)->top_blob_int8_scale = ((ConvolutionDepthWise*)layer_next_2)->bottom_blob_int8_scales[0]; + ((ConvolutionDepthWise*)layer)->create_requantize_op(); + } + } + else if (layer_next_2->type == "Split") + { + bool all_conv = true; + for (size_t i=0; itops.size(); i++) + { + int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; + if (layers[layer_next_3_index]->type != "Convolution" && layers[layer_next_3_index]->type != "ConvolutionDepthWise" && layers[layer_next_3_index]->type != "PriorBox" ) + { + // fprintf(stderr, "%s, %s, %s, %s\n", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str(), layers[layer_next_3_index]->name.c_str()); + all_conv = false; + } + } + + if (all_conv == true && layer_next_2->tops.size() >= size_t(2)) + { + // fprintf(stderr, "%s, %s, %s, ", layer->name.c_str(), layer_next->name.c_str(), layer_next_2->name.c_str()); + for (size_t i=0; itops.size(); i++) + { + int layer_next_3_index = blobs[layer_next_2->tops[i]].consumers[0]; + Layer* layer_next_3 = layers[layer_next_3_index]; + + // fprintf(stderr, "%s, ", layer_next_3->name.c_str()); + if (layer_next_3->type == "Convolution") + { + ((Convolution*)layer)->top_blob_int8_scale = ((Convolution*)layer_next_3)->bottom_blob_int8_scale; + } + } + + ((Convolution*)layer)->use_int8_requantize = true; + ((Convolution*)layer)->create_requantize_op(); + // fprintf(stderr, "\n"); + } + } + else + { + // fprintf(stderr, "%s, %s\n", layer->name.c_str(), layer_next->name.c_str()); + } + } + else if (layer_next->type == "Pooling") + { + // ToDo + } + else + { + // fprintf(stderr, "%s\n", layer->name.c_str()); + } + } + } + } +#endif +} + void Net::clear() { blobs.clear(); diff --git a/src/net.h b/src/net.h index 6c877e4e9..f957aa1e7 100644 --- a/src/net.h +++ b/src/net.h @@ -76,6 +76,10 @@ public: // return bytes consumed int load_model(const unsigned char* mem); + // parse the structure of network + // fuse int8 op dequantize and quantize by requantize + void fuse_network(); + // unload network structure and weight data void clear(); diff --git a/src/platform.h.in b/src/platform.h.in index d94484068..505f141cc 100644 --- a/src/platform.h.in +++ b/src/platform.h.in @@ -22,5 +22,7 @@ #cmakedefine01 NCNN_PIXEL #cmakedefine01 NCNN_PIXEL_ROTATE #cmakedefine01 NCNN_VULKAN +#cmakedefine01 NCNN_REQUANT +#cmakedefine01 NCNN_IM2COL_SGEMM #endif // NCNN_PLATFORM_H diff --git a/tools/caffe/caffe2ncnn.cpp b/tools/caffe/caffe2ncnn.cpp index 2451b5367..0d0b15c76 100644 --- a/tools/caffe/caffe2ncnn.cpp +++ b/tools/caffe/caffe2ncnn.cpp @@ -685,7 +685,7 @@ int main(int argc, char** argv) if (int8_scale_term) { - if ((int)weight_int8scale.size() == num_group && (int)blob_int8scale.size() == num_group) + if ((int)weight_int8scale.size() == num_group) { fprintf(pp, " 8=1"); }