New int8 implementation, better accuracy (#749)

* add the armv7a conv3x3s1 implementation without overflow, remove old code * fix the bug of conv3x3s2 packed int8 * new int8 implementation, weights quantized per channel, better accuracy * fix the bug of conv3x3s1 packed int8 neon * add the naive C fp32 and int8 winograd F(2,3) * add the neon intrinsic int8 winograd F(2,3) * optimize the armv7a int8 winograd F(2,3) with neon assembly * optimize the armv7a int8 winograd F(2,3) input transform with assembly * add the requantize layer and int8 relu implementation * add graph optimization conv1x1s2 -> conv1x1s1, begin optimizing int8 aarch64 * fix int8 bugs * add the naive C im2col with sgemm * add aarch64 int8 winograd f23, conv3x3s2 naive implementation * add the int8 sgemm conv7x7s2 on x86/armv7a platforms * optimize the int8 sgemm with neon intrinsics and packed kernel * optimize the int8 sgemm with packed data * optimize the int8 sgemm with armv7a neon assembly * add the int8 sgemm on arm64-v8a platform * prepare to merge latest code from master * add the int8 param files * in class Net, add the fuse_network method
7 years ago · df3d224484
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,8 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
 option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" OFF)
 option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
 option(NCNN_VULKAN "vulkan compute support" OFF)
 option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF)
 option(NCNN_IM2COL_SGEMM "im2col sgemm support" OFF)

 if(NCNN_OPENMP)
    find_package(OpenMP)
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -202,7 +202,7 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const

    time_avg /= g_loop_count;

    fprintf(stderr, "%16s  min = %7.2f  max = %7.2f  avg = %7.2f\n", comment, time_min, time_max, time_avg);
    fprintf(stderr, "%-20s  min = %7.2f  max = %7.2f  avg = %7.2f\n", comment, time_min, time_max, time_avg);
 }

 void squeezenet_init(ncnn::Net& net)
@@ -210,6 +210,11 @@ void squeezenet_init(ncnn::Net& net)
    net.load_param("squeezenet.param");
 }

 // Init callback for the "squeezenet-int8" benchmark entry: loads the network
 // description file squeezenet_int8.param into `net`. The return value of
 // load_param() is not inspected here.
 void squeezenet_int8_init(ncnn::Net& net)
 {
    const char* param_path = "squeezenet_int8.param";
    net.load_param(param_path);
 }

 void squeezenet_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -226,6 +231,11 @@ void mobilenet_init(ncnn::Net& net)
    net.load_param("mobilenet.param");
 }

 // Init callback for the "mobilenet-int8" benchmark entry: loads the network
 // description file mobilenet_int8.param into `net`. The return value of
 // load_param() is not inspected here.
 void mobilenet_int8_init(ncnn::Net& net)
 {
    const char* param_path = "mobilenet_int8.param";
    net.load_param(param_path);
 }

 void mobilenet_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -306,6 +316,11 @@ void googlenet_init(ncnn::Net& net)
    net.load_param("googlenet.param");
 }

 // Init callback for the "googlenet-int8" benchmark entry: loads the network
 // description file googlenet_int8.param into `net`. The return value of
 // load_param() is not inspected here.
 void googlenet_int8_init(ncnn::Net& net)
 {
    const char* param_path = "googlenet_int8.param";
    net.load_param(param_path);
 }

 void googlenet_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -322,6 +337,11 @@ void resnet18_init(ncnn::Net& net)
    net.load_param("resnet18.param");
 }

 // Init callback for the "resnet18-int8" benchmark entry: loads the network
 // description file resnet18_int8.param into `net`. The return value of
 // load_param() is not inspected here.
 void resnet18_int8_init(ncnn::Net& net)
 {
    const char* param_path = "resnet18_int8.param";
    net.load_param(param_path);
 }

 void resnet18_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -354,6 +374,11 @@ void vgg16_init(ncnn::Net& net)
    net.load_param("vgg16.param");
 }

 // Loads the network description file vgg16_int8.param into `net`.
 // The return value of load_param() is not inspected here.
 // NOTE(review): no benchmark("vgg16-int8", ...) call is visible in the main()
 // hunks of this diff -- confirm whether this initializer is actually used.
 void vgg16_int8_init(ncnn::Net& net)
 {
    const char* param_path = "vgg16_int8.param";
    net.load_param(param_path);
 }

 void vgg16_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -365,11 +390,37 @@ void vgg16_run(const ncnn::Net& net)
    ex.extract("prob", out);
 }

 // Init callback for the "resnet50" benchmark entry: loads the network
 // description file resnet50.param into `net`. The return value of
 // load_param() is not inspected here.
 void resnet50_init(ncnn::Net& net)
 {
    const char* param_path = "resnet50.param";
    net.load_param(param_path);
 }

 // Init callback for the "resnet50-int8" benchmark entry: loads the network
 // description file resnet50_int8.param into `net`. The return value of
 // load_param() is not inspected here.
 void resnet50_int8_init(ncnn::Net& net)
 {
    const char* param_path = "resnet50_int8.param";
    net.load_param(param_path);
 }

 // Run callback shared by the "resnet50" and "resnet50-int8" benchmark entries.
 // Creates an extractor from `net`, binds a Mat constructed as (224, 224, 3) to
 // the blob named "data", and extracts the blob named "prob". The input Mat's
 // contents are never written here and the extracted result is discarded.
 void resnet50_run(const ncnn::Net& net)
 {
    const int input_w = 224;
    const int input_h = 224;
    const int input_c = 3;

    ncnn::Extractor extractor = net.create_extractor();

    ncnn::Mat input_blob(input_w, input_h, input_c);
    extractor.input("data", input_blob);

    ncnn::Mat output_blob;
    extractor.extract("prob", output_blob);
 }

 // Init callback for the "squeezenet-ssd" benchmark entry: loads the network
 // description file squeezenet_ssd.param into `net`. The return value of
 // load_param() is not inspected here.
 void squeezenet_ssd_init(ncnn::Net& net)
 {
    const char* param_path = "squeezenet_ssd.param";
    net.load_param(param_path);
 }

 // Init callback for the "squeezenet-ssd-int8" benchmark entry: loads the
 // network description file squeezenet_ssd_int8.param into `net`. The return
 // value of load_param() is not inspected here.
 void squeezenet_ssd_int8_init(ncnn::Net& net)
 {
    const char* param_path = "squeezenet_ssd_int8.param";
    net.load_param(param_path);
 }

 void squeezenet_ssd_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -386,6 +437,11 @@ void mobilenet_ssd_init(ncnn::Net& net)
    net.load_param("mobilenet_ssd.param");
 }

 // Init callback for the "mobilenet-ssd-int8" benchmark entry: loads the
 // network description file mobilenet_ssd_int8.param into `net`. The return
 // value of load_param() is not inspected here.
 void mobilenet_ssd_int8_init(ncnn::Net& net)
 {
    const char* param_path = "mobilenet_ssd_int8.param";
    net.load_param(param_path);
 }

 void mobilenet_ssd_run(const ncnn::Net& net)
 {
    ncnn::Extractor ex = net.create_extractor();
@@ -497,8 +553,12 @@ int main(int argc, char** argv)
    // run
    benchmark("squeezenet", squeezenet_init, squeezenet_run);

    benchmark("squeezenet-int8", squeezenet_int8_init, squeezenet_run);

    benchmark("mobilenet", mobilenet_init, mobilenet_run);

    benchmark("mobilenet-int8", mobilenet_int8_init, mobilenet_run);

    benchmark("mobilenet_v2", mobilenet_v2_init, mobilenet_v2_run);

    benchmark("shufflenet", shufflenet_init, shufflenet_run);
@@ -509,16 +569,28 @@ int main(int argc, char** argv)

    benchmark("googlenet", googlenet_init, googlenet_run);

    benchmark("googlenet-int8", googlenet_int8_init, googlenet_run);

    benchmark("resnet18", resnet18_init, resnet18_run);

    benchmark("resnet18-int8", resnet18_int8_init, resnet18_run);

    benchmark("alexnet", alexnet_init, alexnet_run);

    benchmark("vgg16", vgg16_init, vgg16_run);

    benchmark("resnet50", resnet50_init, resnet50_run);

    benchmark("resnet50-int8", resnet50_int8_init, resnet50_run);

    benchmark("squeezenet-ssd", squeezenet_ssd_init, squeezenet_ssd_run);

    benchmark("squeezenet-ssd-int8", squeezenet_ssd_int8_init, squeezenet_ssd_run);

    benchmark("mobilenet-ssd", mobilenet_ssd_init, mobilenet_ssd_run);

    benchmark("mobilenet-ssd-int8", mobilenet_ssd_int8_init, mobilenet_ssd_run);

    benchmark("mobilenet-yolo", mobilenet_yolo_init, mobilenet_yolo_run);

    benchmark("mobilenet-yolov3", mobilenet_yolov3_init, mobilenet_yolov3_run);
--- a/benchmark/googlenet_int8.param
+++ b/benchmark/googlenet_int8.param
@@ -0,0 +1,154 @@
 7767517
 152 179
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1/7x7_s2     1 1 data conv1/7x7_s2 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
 ReLU             conv1/relu_7x7   1 1 conv1/7x7_s2 conv1/7x7_s2_conv1/relu_7x7
 Pooling          pool1/3x3_s2     1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 0=0 1=3 2=2 3=0 4=0
 LRN              pool1/norm1      1 1 pool1/3x3_s2 pool1/norm1 0=0 1=5 2=0.000100 3=0.750000
 Convolution      conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce 0=64 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             conv2/relu_3x3_reduce 1 1 conv2/3x3_reduce conv2/3x3_reduce_conv2/relu_3x3_reduce
 Convolution      conv2/3x3        1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
 ReLU             conv2/relu_3x3   1 1 conv2/3x3 conv2/3x3_conv2/relu_3x3
 LRN              conv2/norm2      1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 0=0 1=5 2=0.000100 3=0.750000
 Pooling          pool2/3x3_s2     1 1 conv2/norm2 pool2/3x3_s2 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_0      1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3
 Convolution      inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 ReLU             inception_3a/relu_1x1 1 1 inception_3a/1x1 inception_3a/1x1_inception_3a/relu_1x1
 Convolution      inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
 ReLU             inception_3a/relu_3x3_reduce 1 1 inception_3a/3x3_reduce inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce
 Convolution      inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
 ReLU             inception_3a/relu_3x3 1 1 inception_3a/3x3 inception_3a/3x3_inception_3a/relu_3x3
 Convolution      inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
 ReLU             inception_3a/relu_5x5_reduce 1 1 inception_3a/5x5_reduce inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce
 Convolution      inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5 0=32 1=5 2=1 3=1 4=2 5=1 6=12800 8=2
 ReLU             inception_3a/relu_5x5 1 1 inception_3a/5x5 inception_3a/5x5_inception_3a/relu_5x5
 Pooling          inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj 0=32 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
 ReLU             inception_3a/relu_pool_proj 1 1 inception_3a/pool_proj inception_3a/pool_proj_inception_3a/relu_pool_proj
 Concat           inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output 0=0
 Split            splitncnn_1      1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3
 Convolution      inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             inception_3b/relu_1x1 1 1 inception_3b/1x1 inception_3b/1x1_inception_3b/relu_1x1
 Convolution      inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             inception_3b/relu_3x3_reduce 1 1 inception_3b/3x3_reduce inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce
 Convolution      inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=221184 8=2
 ReLU             inception_3b/relu_3x3 1 1 inception_3b/3x3 inception_3b/3x3_inception_3b/relu_3x3
 Convolution      inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
 ReLU             inception_3b/relu_5x5_reduce 1 1 inception_3b/5x5_reduce inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce
 Convolution      inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5 0=96 1=5 2=1 3=1 4=2 5=1 6=76800 8=2
 ReLU             inception_3b/relu_5x5 1 1 inception_3b/5x5 inception_3b/5x5_inception_3b/relu_5x5
 Pooling          inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             inception_3b/relu_pool_proj 1 1 inception_3b/pool_proj inception_3b/pool_proj_inception_3b/relu_pool_proj
 Concat           inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output 0=0
 Pooling          pool3/3x3_s2     1 1 inception_3b/output pool3/3x3_s2 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_2      1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3
 Convolution      inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=92160 8=2
 ReLU             inception_4a/relu_1x1 1 1 inception_4a/1x1 inception_4a/1x1_inception_4a/relu_1x1
 Convolution      inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=46080 8=2
 ReLU             inception_4a/relu_3x3_reduce 1 1 inception_4a/3x3_reduce inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce
 Convolution      inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3 0=208 1=3 2=1 3=1 4=1 5=1 6=179712 8=2
 ReLU             inception_4a/relu_3x3 1 1 inception_4a/3x3 inception_4a/3x3_inception_4a/relu_3x3
 Convolution      inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=7680 8=2
 ReLU             inception_4a/relu_5x5_reduce 1 1 inception_4a/5x5_reduce inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce
 Convolution      inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5 0=48 1=5 2=1 3=1 4=2 5=1 6=19200 8=2
 ReLU             inception_4a/relu_5x5 1 1 inception_4a/5x5 inception_4a/5x5_inception_4a/relu_5x5
 Pooling          inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=30720 8=2
 ReLU             inception_4a/relu_pool_proj 1 1 inception_4a/pool_proj inception_4a/pool_proj_inception_4a/relu_pool_proj
 Concat           inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output 0=0
 Split            splitncnn_3      1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3
 Convolution      inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1 0=160 1=1 2=1 3=1 4=0 5=1 6=81920 8=2
 ReLU             inception_4b/relu_1x1 1 1 inception_4b/1x1 inception_4b/1x1_inception_4b/relu_1x1
 Convolution      inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
 ReLU             inception_4b/relu_3x3_reduce 1 1 inception_4b/3x3_reduce inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce
 Convolution      inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3 0=224 1=3 2=1 3=1 4=1 5=1 6=225792 8=2
 ReLU             inception_4b/relu_3x3 1 1 inception_4b/3x3 inception_4b/3x3_inception_4b/relu_3x3
 Convolution      inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 ReLU             inception_4b/relu_5x5_reduce 1 1 inception_4b/5x5_reduce inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce
 Convolution      inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
 ReLU             inception_4b/relu_5x5 1 1 inception_4b/5x5 inception_4b/5x5_inception_4b/relu_5x5
 Pooling          inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             inception_4b/relu_pool_proj 1 1 inception_4b/pool_proj inception_4b/pool_proj_inception_4b/relu_pool_proj
 Concat           inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output 0=0
 Split            splitncnn_4      1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3
 Convolution      inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
 ReLU             inception_4c/relu_1x1 1 1 inception_4c/1x1 inception_4c/1x1_inception_4c/relu_1x1
 Convolution      inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
 ReLU             inception_4c/relu_3x3_reduce 1 1 inception_4c/3x3_reduce inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce
 Convolution      inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
 ReLU             inception_4c/relu_3x3 1 1 inception_4c/3x3 inception_4c/3x3_inception_4c/relu_3x3
 Convolution      inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 ReLU             inception_4c/relu_5x5_reduce 1 1 inception_4c/5x5_reduce inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce
 Convolution      inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
 ReLU             inception_4c/relu_5x5 1 1 inception_4c/5x5 inception_4c/5x5_inception_4c/relu_5x5
 Pooling          inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             inception_4c/relu_pool_proj 1 1 inception_4c/pool_proj inception_4c/pool_proj_inception_4c/relu_pool_proj
 Concat           inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output 0=0
 Split            splitncnn_5      1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3
 Convolution      inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
 ReLU             inception_4d/relu_1x1 1 1 inception_4d/1x1 inception_4d/1x1_inception_4d/relu_1x1
 Convolution      inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce 0=144 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
 ReLU             inception_4d/relu_3x3_reduce 1 1 inception_4d/3x3_reduce inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce
 Convolution      inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3 0=288 1=3 2=1 3=1 4=1 5=1 6=373248 8=2
 ReLU             inception_4d/relu_3x3 1 1 inception_4d/3x3 inception_4d/3x3_inception_4d/relu_3x3
 Convolution      inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             inception_4d/relu_5x5_reduce 1 1 inception_4d/5x5_reduce inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce
 Convolution      inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=51200 8=2
 ReLU             inception_4d/relu_5x5 1 1 inception_4d/5x5 inception_4d/5x5_inception_4d/relu_5x5
 Pooling          inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             inception_4d/relu_pool_proj 1 1 inception_4d/pool_proj inception_4d/pool_proj_inception_4d/relu_pool_proj
 Concat           inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output 0=0
 Split            splitncnn_6      1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3
 Convolution      inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=135168 8=2
 ReLU             inception_4e/relu_1x1 1 1 inception_4e/1x1 inception_4e/1x1_inception_4e/relu_1x1
 Convolution      inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=84480 8=2
 ReLU             inception_4e/relu_3x3_reduce 1 1 inception_4e/3x3_reduce inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce
 Convolution      inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
 ReLU             inception_4e/relu_3x3 1 1 inception_4e/3x3 inception_4e/3x3_inception_4e/relu_3x3
 Convolution      inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16896 8=2
 ReLU             inception_4e/relu_5x5_reduce 1 1 inception_4e/5x5_reduce inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce
 Convolution      inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
 ReLU             inception_4e/relu_5x5 1 1 inception_4e/5x5 inception_4e/5x5_inception_4e/relu_5x5
 Pooling          inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=67584 8=2
 ReLU             inception_4e/relu_pool_proj 1 1 inception_4e/pool_proj inception_4e/pool_proj_inception_4e/relu_pool_proj
 Concat           inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output 0=0
 Pooling          pool4/3x3_s2     1 1 inception_4e/output pool4/3x3_s2 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_7      1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3
 Convolution      inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=212992 8=2
 ReLU             inception_5a/relu_1x1 1 1 inception_5a/1x1 inception_5a/1x1_inception_5a/relu_1x1
 Convolution      inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=133120 8=2
 ReLU             inception_5a/relu_3x3_reduce 1 1 inception_5a/3x3_reduce inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce
 Convolution      inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
 ReLU             inception_5a/relu_3x3 1 1 inception_5a/3x3 inception_5a/3x3_inception_5a/relu_3x3
 Convolution      inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=26624 8=2
 ReLU             inception_5a/relu_5x5_reduce 1 1 inception_5a/5x5_reduce inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce
 Convolution      inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
 ReLU             inception_5a/relu_5x5 1 1 inception_5a/5x5 inception_5a/5x5_inception_5a/relu_5x5
 Pooling          inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
 ReLU             inception_5a/relu_pool_proj 1 1 inception_5a/pool_proj inception_5a/pool_proj_inception_5a/relu_pool_proj
 Concat           inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output 0=0
 Split            splitncnn_8      1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3
 Convolution      inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=319488 8=2
 ReLU             inception_5b/relu_1x1 1 1 inception_5b/1x1 inception_5b/1x1_inception_5b/relu_1x1
 Convolution      inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce 0=192 1=1 2=1 3=1 4=0 5=1 6=159744 8=2
 ReLU             inception_5b/relu_3x3_reduce 1 1 inception_5b/3x3_reduce inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce
 Convolution      inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=663552 8=2
 ReLU             inception_5b/relu_3x3 1 1 inception_5b/3x3 inception_5b/3x3_inception_5b/relu_3x3
 Convolution      inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce 0=48 1=1 2=1 3=1 4=0 5=1 6=39936 8=2
 ReLU             inception_5b/relu_5x5_reduce 1 1 inception_5b/5x5_reduce inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce
 Convolution      inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=153600 8=2
 ReLU             inception_5b/relu_5x5 1 1 inception_5b/5x5 inception_5b/5x5_inception_5b/relu_5x5
 Pooling          inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 0=0 1=3 2=1 3=1 4=0
 Convolution      inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
 ReLU             inception_5b/relu_pool_proj 1 1 inception_5b/pool_proj inception_5b/pool_proj_inception_5b/relu_pool_proj
 Concat           inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output 0=0
 Pooling          pool5/7x7_s1     1 1 inception_5b/output pool5/7x7_s1 0=1 1=7 2=1 3=0 4=0
 Dropout          pool5/drop_7x7_s1 1 1 pool5/7x7_s1 pool5/7x7_s1_pool5/drop_7x7_s1
 InnerProduct     loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000
 Softmax          prob             1 1 loss3/classifier prob 0=0
--- a/benchmark/mobilenet_int8.param
+++ b/benchmark/mobilenet_int8.param
@@ -0,0 +1,114 @@
 7767517
 112 112
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1            1 1 data conv1 0=32 1=3 2=1 3=2 4=1 5=0 6=864 8=2
 BatchNorm        conv1/bn         1 1 conv1 conv1_conv1/bn 0=32
 Scale            conv1/scale      1 1 conv1_conv1/bn conv1_conv1/scale 0=32 1=1
 ReLU             relu1            1 1 conv1_conv1/scale conv1_relu1
 ConvolutionDepthWise conv2_1/dw       1 1 conv1_relu1 conv2_1/dw 0=32 1=3 2=1 3=1 4=1 5=0 6=288 7=32 8=1
 BatchNorm        conv2_1/dw/bn    1 1 conv2_1/dw conv2_1/dw_conv2_1/dw/bn 0=32
 Scale            conv2_1/dw/scale 1 1 conv2_1/dw_conv2_1/dw/bn conv2_1/dw_conv2_1/dw/scale 0=32 1=1
 ReLU             relu2_1/dw       1 1 conv2_1/dw_conv2_1/dw/scale conv2_1/dw_relu2_1/dw
 Convolution      conv2_1/sep      1 1 conv2_1/dw_relu2_1/dw conv2_1/sep 0=64 1=1 2=1 3=1 4=0 5=0 6=2048 8=2
 BatchNorm        conv2_1/sep/bn   1 1 conv2_1/sep conv2_1/sep_conv2_1/sep/bn 0=64
 Scale            conv2_1/sep/scale 1 1 conv2_1/sep_conv2_1/sep/bn conv2_1/sep_conv2_1/sep/scale 0=64 1=1
 ReLU             relu2_1/sep      1 1 conv2_1/sep_conv2_1/sep/scale conv2_1/sep_relu2_1/sep
 ConvolutionDepthWise conv2_2/dw       1 1 conv2_1/sep_relu2_1/sep conv2_2/dw 0=64 1=3 2=1 3=2 4=1 5=0 6=576 7=64 8=1
 BatchNorm        conv2_2/dw/bn    1 1 conv2_2/dw conv2_2/dw_conv2_2/dw/bn 0=64
 Scale            conv2_2/dw/scale 1 1 conv2_2/dw_conv2_2/dw/bn conv2_2/dw_conv2_2/dw/scale 0=64 1=1
 ReLU             relu2_2/dw       1 1 conv2_2/dw_conv2_2/dw/scale conv2_2/dw_relu2_2/dw
 Convolution      conv2_2/sep      1 1 conv2_2/dw_relu2_2/dw conv2_2/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=8192 8=2
 BatchNorm        conv2_2/sep/bn   1 1 conv2_2/sep conv2_2/sep_conv2_2/sep/bn 0=128
 Scale            conv2_2/sep/scale 1 1 conv2_2/sep_conv2_2/sep/bn conv2_2/sep_conv2_2/sep/scale 0=128 1=1
 ReLU             relu2_2/sep      1 1 conv2_2/sep_conv2_2/sep/scale conv2_2/sep_relu2_2/sep
 ConvolutionDepthWise conv3_1/dw       1 1 conv2_2/sep_relu2_2/sep conv3_1/dw 0=128 1=3 2=1 3=1 4=1 5=0 6=1152 7=128 8=1
 BatchNorm        conv3_1/dw/bn    1 1 conv3_1/dw conv3_1/dw_conv3_1/dw/bn 0=128
 Scale            conv3_1/dw/scale 1 1 conv3_1/dw_conv3_1/dw/bn conv3_1/dw_conv3_1/dw/scale 0=128 1=1
 ReLU             relu3_1/dw       1 1 conv3_1/dw_conv3_1/dw/scale conv3_1/dw_relu3_1/dw
 Convolution      conv3_1/sep      1 1 conv3_1/dw_relu3_1/dw conv3_1/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        conv3_1/sep/bn   1 1 conv3_1/sep conv3_1/sep_conv3_1/sep/bn 0=128
 Scale            conv3_1/sep/scale 1 1 conv3_1/sep_conv3_1/sep/bn conv3_1/sep_conv3_1/sep/scale 0=128 1=1
 ReLU             relu3_1/sep      1 1 conv3_1/sep_conv3_1/sep/scale conv3_1/sep_relu3_1/sep
 ConvolutionDepthWise conv3_2/dw       1 1 conv3_1/sep_relu3_1/sep conv3_2/dw 0=128 1=3 2=1 3=2 4=1 5=0 6=1152 7=128 8=1
 BatchNorm        conv3_2/dw/bn    1 1 conv3_2/dw conv3_2/dw_conv3_2/dw/bn 0=128
 Scale            conv3_2/dw/scale 1 1 conv3_2/dw_conv3_2/dw/bn conv3_2/dw_conv3_2/dw/scale 0=128 1=1
 ReLU             relu3_2/dw       1 1 conv3_2/dw_conv3_2/dw/scale conv3_2/dw_relu3_2/dw
 Convolution      conv3_2/sep      1 1 conv3_2/dw_relu3_2/dw conv3_2/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=32768 8=2
 BatchNorm        conv3_2/sep/bn   1 1 conv3_2/sep conv3_2/sep_conv3_2/sep/bn 0=256
 Scale            conv3_2/sep/scale 1 1 conv3_2/sep_conv3_2/sep/bn conv3_2/sep_conv3_2/sep/scale 0=256 1=1
 ReLU             relu3_2/sep      1 1 conv3_2/sep_conv3_2/sep/scale conv3_2/sep_relu3_2/sep
 ConvolutionDepthWise conv4_1/dw       1 1 conv3_2/sep_relu3_2/sep conv4_1/dw 0=256 1=3 2=1 3=1 4=1 5=0 6=2304 7=256 8=1
 BatchNorm        conv4_1/dw/bn    1 1 conv4_1/dw conv4_1/dw_conv4_1/dw/bn 0=256
 Scale            conv4_1/dw/scale 1 1 conv4_1/dw_conv4_1/dw/bn conv4_1/dw_conv4_1/dw/scale 0=256 1=1
 ReLU             relu4_1/dw       1 1 conv4_1/dw_conv4_1/dw/scale conv4_1/dw_relu4_1/dw
 Convolution      conv4_1/sep      1 1 conv4_1/dw_relu4_1/dw conv4_1/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        conv4_1/sep/bn   1 1 conv4_1/sep conv4_1/sep_conv4_1/sep/bn 0=256
 Scale            conv4_1/sep/scale 1 1 conv4_1/sep_conv4_1/sep/bn conv4_1/sep_conv4_1/sep/scale 0=256 1=1
 ReLU             relu4_1/sep      1 1 conv4_1/sep_conv4_1/sep/scale conv4_1/sep_relu4_1/sep
 ConvolutionDepthWise conv4_2/dw       1 1 conv4_1/sep_relu4_1/sep conv4_2/dw 0=256 1=3 2=1 3=2 4=1 5=0 6=2304 7=256 8=1
 BatchNorm        conv4_2/dw/bn    1 1 conv4_2/dw conv4_2/dw_conv4_2/dw/bn 0=256
 Scale            conv4_2/dw/scale 1 1 conv4_2/dw_conv4_2/dw/bn conv4_2/dw_conv4_2/dw/scale 0=256 1=1
 ReLU             relu4_2/dw       1 1 conv4_2/dw_conv4_2/dw/scale conv4_2/dw_relu4_2/dw
 Convolution      conv4_2/sep      1 1 conv4_2/dw_relu4_2/dw conv4_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=131072 8=2
 BatchNorm        conv4_2/sep/bn   1 1 conv4_2/sep conv4_2/sep_conv4_2/sep/bn 0=512
 Scale            conv4_2/sep/scale 1 1 conv4_2/sep_conv4_2/sep/bn conv4_2/sep_conv4_2/sep/scale 0=512 1=1
 ReLU             relu4_2/sep      1 1 conv4_2/sep_conv4_2/sep/scale conv4_2/sep_relu4_2/sep
 ConvolutionDepthWise conv5_1/dw       1 1 conv4_2/sep_relu4_2/sep conv5_1/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_1/dw/bn    1 1 conv5_1/dw conv5_1/dw_conv5_1/dw/bn 0=512
 Scale            conv5_1/dw/scale 1 1 conv5_1/dw_conv5_1/dw/bn conv5_1/dw_conv5_1/dw/scale 0=512 1=1
 ReLU             relu5_1/dw       1 1 conv5_1/dw_conv5_1/dw/scale conv5_1/dw_relu5_1/dw
 Convolution      conv5_1/sep      1 1 conv5_1/dw_relu5_1/dw conv5_1/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        conv5_1/sep/bn   1 1 conv5_1/sep conv5_1/sep_conv5_1/sep/bn 0=512
 Scale            conv5_1/sep/scale 1 1 conv5_1/sep_conv5_1/sep/bn conv5_1/sep_conv5_1/sep/scale 0=512 1=1
 ReLU             relu5_1/sep      1 1 conv5_1/sep_conv5_1/sep/scale conv5_1/sep_relu5_1/sep
 ConvolutionDepthWise conv5_2/dw       1 1 conv5_1/sep_relu5_1/sep conv5_2/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_2/dw/bn    1 1 conv5_2/dw conv5_2/dw_conv5_2/dw/bn 0=512
 Scale            conv5_2/dw/scale 1 1 conv5_2/dw_conv5_2/dw/bn conv5_2/dw_conv5_2/dw/scale 0=512 1=1
 ReLU             relu5_2/dw       1 1 conv5_2/dw_conv5_2/dw/scale conv5_2/dw_relu5_2/dw
 Convolution      conv5_2/sep      1 1 conv5_2/dw_relu5_2/dw conv5_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        conv5_2/sep/bn   1 1 conv5_2/sep conv5_2/sep_conv5_2/sep/bn 0=512
 Scale            conv5_2/sep/scale 1 1 conv5_2/sep_conv5_2/sep/bn conv5_2/sep_conv5_2/sep/scale 0=512 1=1
 ReLU             relu5_2/sep      1 1 conv5_2/sep_conv5_2/sep/scale conv5_2/sep_relu5_2/sep
 ConvolutionDepthWise conv5_3/dw       1 1 conv5_2/sep_relu5_2/sep conv5_3/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_3/dw/bn    1 1 conv5_3/dw conv5_3/dw_conv5_3/dw/bn 0=512
 Scale            conv5_3/dw/scale 1 1 conv5_3/dw_conv5_3/dw/bn conv5_3/dw_conv5_3/dw/scale 0=512 1=1
 ReLU             relu5_3/dw       1 1 conv5_3/dw_conv5_3/dw/scale conv5_3/dw_relu5_3/dw
 Convolution      conv5_3/sep      1 1 conv5_3/dw_relu5_3/dw conv5_3/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        conv5_3/sep/bn   1 1 conv5_3/sep conv5_3/sep_conv5_3/sep/bn 0=512
 Scale            conv5_3/sep/scale 1 1 conv5_3/sep_conv5_3/sep/bn conv5_3/sep_conv5_3/sep/scale 0=512 1=1
 ReLU             relu5_3/sep      1 1 conv5_3/sep_conv5_3/sep/scale conv5_3/sep_relu5_3/sep
 ConvolutionDepthWise conv5_4/dw       1 1 conv5_3/sep_relu5_3/sep conv5_4/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_4/dw/bn    1 1 conv5_4/dw conv5_4/dw_conv5_4/dw/bn 0=512
 Scale            conv5_4/dw/scale 1 1 conv5_4/dw_conv5_4/dw/bn conv5_4/dw_conv5_4/dw/scale 0=512 1=1
 ReLU             relu5_4/dw       1 1 conv5_4/dw_conv5_4/dw/scale conv5_4/dw_relu5_4/dw
 Convolution      conv5_4/sep      1 1 conv5_4/dw_relu5_4/dw conv5_4/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        conv5_4/sep/bn   1 1 conv5_4/sep conv5_4/sep_conv5_4/sep/bn 0=512
 Scale            conv5_4/sep/scale 1 1 conv5_4/sep_conv5_4/sep/bn conv5_4/sep_conv5_4/sep/scale 0=512 1=1
 ReLU             relu5_4/sep      1 1 conv5_4/sep_conv5_4/sep/scale conv5_4/sep_relu5_4/sep
 ConvolutionDepthWise conv5_5/dw       1 1 conv5_4/sep_relu5_4/sep conv5_5/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_5/dw/bn    1 1 conv5_5/dw conv5_5/dw_conv5_5/dw/bn 0=512
 Scale            conv5_5/dw/scale 1 1 conv5_5/dw_conv5_5/dw/bn conv5_5/dw_conv5_5/dw/scale 0=512 1=1
 ReLU             relu5_5/dw       1 1 conv5_5/dw_conv5_5/dw/scale conv5_5/dw_relu5_5/dw
 Convolution      conv5_5/sep      1 1 conv5_5/dw_relu5_5/dw conv5_5/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        conv5_5/sep/bn   1 1 conv5_5/sep conv5_5/sep_conv5_5/sep/bn 0=512
 Scale            conv5_5/sep/scale 1 1 conv5_5/sep_conv5_5/sep/bn conv5_5/sep_conv5_5/sep/scale 0=512 1=1
 ReLU             relu5_5/sep      1 1 conv5_5/sep_conv5_5/sep/scale conv5_5/sep_relu5_5/sep
 ConvolutionDepthWise conv5_6/dw       1 1 conv5_5/sep_relu5_5/sep conv5_6/dw 0=512 1=3 2=1 3=2 4=1 5=0 6=4608 7=512 8=1
 BatchNorm        conv5_6/dw/bn    1 1 conv5_6/dw conv5_6/dw_conv5_6/dw/bn 0=512
 Scale            conv5_6/dw/scale 1 1 conv5_6/dw_conv5_6/dw/bn conv5_6/dw_conv5_6/dw/scale 0=512 1=1
 ReLU             relu5_6/dw       1 1 conv5_6/dw_conv5_6/dw/scale conv5_6/dw_relu5_6/dw
 Convolution      conv5_6/sep      1 1 conv5_6/dw_relu5_6/dw conv5_6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=524288 8=2
 BatchNorm        conv5_6/sep/bn   1 1 conv5_6/sep conv5_6/sep_conv5_6/sep/bn 0=1024
 Scale            conv5_6/sep/scale 1 1 conv5_6/sep_conv5_6/sep/bn conv5_6/sep_conv5_6/sep/scale 0=1024 1=1
 ReLU             relu5_6/sep      1 1 conv5_6/sep_conv5_6/sep/scale conv5_6/sep_relu5_6/sep
 ConvolutionDepthWise conv6/dw         1 1 conv5_6/sep_relu5_6/sep conv6/dw 0=1024 1=3 2=1 3=1 4=1 5=0 6=9216 7=1024 8=1
 BatchNorm        conv6/dw/bn      1 1 conv6/dw conv6/dw_conv6/dw/bn 0=1024
 Scale            conv6/dw/scale   1 1 conv6/dw_conv6/dw/bn conv6/dw_conv6/dw/scale 0=1024 1=1
 ReLU             relu6/dw         1 1 conv6/dw_conv6/dw/scale conv6/dw_relu6/dw
 Convolution      conv6/sep        1 1 conv6/dw_relu6/dw conv6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        conv6/sep/bn     1 1 conv6/sep conv6/sep_conv6/sep/bn 0=1024
 Scale            conv6/sep/scale  1 1 conv6/sep_conv6/sep/bn conv6/sep_conv6/sep/scale 0=1024 1=1
 ReLU             relu6/sep        1 1 conv6/sep_conv6/sep/scale conv6/sep_relu6/sep
 Pooling          pool6            1 1 conv6/sep_relu6/sep pool6 0=1 1=0 2=1 3=0 4=1
 Convolution      fc7              1 1 pool6 fc7 0=1000 1=1 2=1 3=1 4=0 5=1 6=1024000 8=2
 Softmax          prob             1 1 fc7 prob 0=0
--- a/benchmark/mobilenet_ssd_int8.param
+++ b/benchmark/mobilenet_ssd_int8.param
@@ -0,0 +1,129 @@
 7767517
 127 150
 Input            data             0 1 data 0=300 1=300 2=3
 Split            splitncnn_0      1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
 Convolution      conv0            1 1 data_splitncnn_6 conv0 0=32 1=3 2=1 3=2 4=1 5=1 6=864 8=2
 ReLU             conv0/relu       1 1 conv0 conv0_conv0/relu
 ConvolutionDepthWise conv1/dw         1 1 conv0_conv0/relu conv1/dw 0=32 1=3 2=1 3=1 4=1 5=1 6=288 7=32 8=1
 ReLU             conv1/dw/relu    1 1 conv1/dw conv1/dw_conv1/dw/relu
 Convolution      conv1            1 1 conv1/dw_conv1/dw/relu conv1 0=64 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
 ReLU             conv1/relu       1 1 conv1 conv1_conv1/relu
 ConvolutionDepthWise conv2/dw         1 1 conv1_conv1/relu conv2/dw 0=64 1=3 2=1 3=2 4=1 5=1 6=576 7=64 8=1
 ReLU             conv2/dw/relu    1 1 conv2/dw conv2/dw_conv2/dw/relu
 Convolution      conv2            1 1 conv2/dw_conv2/dw/relu conv2 0=128 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
 ReLU             conv2/relu       1 1 conv2 conv2_conv2/relu
 ConvolutionDepthWise conv3/dw         1 1 conv2_conv2/relu conv3/dw 0=128 1=3 2=1 3=1 4=1 5=1 6=1152 7=128 8=1
 ReLU             conv3/dw/relu    1 1 conv3/dw conv3/dw_conv3/dw/relu
 Convolution      conv3            1 1 conv3/dw_conv3/dw/relu conv3 0=128 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             conv3/relu       1 1 conv3 conv3_conv3/relu
 ConvolutionDepthWise conv4/dw         1 1 conv3_conv3/relu conv4/dw 0=128 1=3 2=1 3=2 4=1 5=1 6=1152 7=128 8=1
 ReLU             conv4/dw/relu    1 1 conv4/dw conv4/dw_conv4/dw/relu
 Convolution      conv4            1 1 conv4/dw_conv4/dw/relu conv4 0=256 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             conv4/relu       1 1 conv4 conv4_conv4/relu
 ConvolutionDepthWise conv5/dw         1 1 conv4_conv4/relu conv5/dw 0=256 1=3 2=1 3=1 4=1 5=1 6=2304 7=256 8=1
 ReLU             conv5/dw/relu    1 1 conv5/dw conv5/dw_conv5/dw/relu
 Convolution      conv5            1 1 conv5/dw_conv5/dw/relu conv5 0=256 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
 ReLU             conv5/relu       1 1 conv5 conv5_conv5/relu
 ConvolutionDepthWise conv6/dw         1 1 conv5_conv5/relu conv6/dw 0=256 1=3 2=1 3=2 4=1 5=1 6=2304 7=256 8=1
 ReLU             conv6/dw/relu    1 1 conv6/dw conv6/dw_conv6/dw/relu
 Convolution      conv6            1 1 conv6/dw_conv6/dw/relu conv6 0=512 1=1 2=1 3=1 4=0 5=1 6=131072 8=2
 ReLU             conv6/relu       1 1 conv6 conv6_conv6/relu
 ConvolutionDepthWise conv7/dw         1 1 conv6_conv6/relu conv7/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv7/dw/relu    1 1 conv7/dw conv7/dw_conv7/dw/relu
 Convolution      conv7            1 1 conv7/dw_conv7/dw/relu conv7 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv7/relu       1 1 conv7 conv7_conv7/relu
 ConvolutionDepthWise conv8/dw         1 1 conv7_conv7/relu conv8/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv8/dw/relu    1 1 conv8/dw conv8/dw_conv8/dw/relu
 Convolution      conv8            1 1 conv8/dw_conv8/dw/relu conv8 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv8/relu       1 1 conv8 conv8_conv8/relu
 ConvolutionDepthWise conv9/dw         1 1 conv8_conv8/relu conv9/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv9/dw/relu    1 1 conv9/dw conv9/dw_conv9/dw/relu
 Convolution      conv9            1 1 conv9/dw_conv9/dw/relu conv9 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv9/relu       1 1 conv9 conv9_conv9/relu
 ConvolutionDepthWise conv10/dw        1 1 conv9_conv9/relu conv10/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv10/dw/relu   1 1 conv10/dw conv10/dw_conv10/dw/relu
 Convolution      conv10           1 1 conv10/dw_conv10/dw/relu conv10 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv10/relu      1 1 conv10 conv10_conv10/relu
 ConvolutionDepthWise conv11/dw        1 1 conv10_conv10/relu conv11/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv11/dw/relu   1 1 conv11/dw conv11/dw_conv11/dw/relu
 Convolution      conv11           1 1 conv11/dw_conv11/dw/relu conv11 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv11/relu      1 1 conv11 conv11_conv11/relu
 Split            splitncnn_1      1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3
 ConvolutionDepthWise conv12/dw        1 1 conv11_conv11/relu_splitncnn_3 conv12/dw 0=512 1=3 2=1 3=2 4=1 5=1 6=4608 7=512 8=1
 ReLU             conv12/dw/relu   1 1 conv12/dw conv12/dw_conv12/dw/relu
 Convolution      conv12           1 1 conv12/dw_conv12/dw/relu conv12 0=1024 1=1 2=1 3=1 4=0 5=1 6=524288 8=2
 ReLU             conv12/relu      1 1 conv12 conv12_conv12/relu
 ConvolutionDepthWise conv13/dw        1 1 conv12_conv12/relu conv13/dw 0=1024 1=3 2=1 3=1 4=1 5=1 6=9216 7=1024 8=1
 ReLU             conv13/dw/relu   1 1 conv13/dw conv13/dw_conv13/dw/relu
 Convolution      conv13           1 1 conv13/dw_conv13/dw/relu conv13 0=1024 1=1 2=1 3=1 4=0 5=1 6=1048576 8=2
 ReLU             conv13/relu      1 1 conv13 conv13_conv13/relu
 Split            splitncnn_2      1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3
 Convolution      conv14_1         1 1 conv13_conv13/relu_splitncnn_3 conv14_1 0=256 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
 ReLU             conv14_1/relu    1 1 conv14_1 conv14_1_conv14_1/relu
 Convolution      conv14_2         1 1 conv14_1_conv14_1/relu conv14_2 0=512 1=3 2=1 3=2 4=1 5=1 6=1179648 8=2
 ReLU             conv14_2/relu    1 1 conv14_2 conv14_2_conv14_2/relu
 Split            splitncnn_3      1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3
 Convolution      conv15_1         1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
 ReLU             conv15_1/relu    1 1 conv15_1 conv15_1_conv15_1/relu
 Convolution      conv15_2         1 1 conv15_1_conv15_1/relu conv15_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
 ReLU             conv15_2/relu    1 1 conv15_2 conv15_2_conv15_2/relu
 Split            splitncnn_4      1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3
 Convolution      conv16_1         1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             conv16_1/relu    1 1 conv16_1 conv16_1_conv16_1/relu
 Convolution      conv16_2         1 1 conv16_1_conv16_1/relu conv16_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
 ReLU             conv16_2/relu    1 1 conv16_2 conv16_2_conv16_2/relu
 Split            splitncnn_5      1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3
 Convolution      conv17_1         1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             conv17_1/relu    1 1 conv17_1 conv17_1_conv17_1/relu
 Convolution      conv17_2         1 1 conv17_1_conv17_1/relu conv17_2 0=128 1=3 2=1 3=2 4=1 5=1 6=73728 8=2
 ReLU             conv17_2/relu    1 1 conv17_2 conv17_2_conv17_2/relu
 Split            splitncnn_6      1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2
 Convolution      conv11_mbox_loc  1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
 Permute          conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3
 Flatten          conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat
 Convolution      conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
 Permute          conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3
 Flatten          conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat
 PriorBox         conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23301=0 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Convolution      conv13_mbox_loc  1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
 Permute          conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3
 Flatten          conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat
 Convolution      conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=129024 8=2
 Permute          conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3
 Flatten          conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat
 PriorBox         conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Convolution      conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 Permute          conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3
 Flatten          conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat
 Convolution      conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=64512 8=2
 Permute          conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3
 Flatten          conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat
 PriorBox         conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Convolution      conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
 Permute          conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3
 Flatten          conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat
 Convolution      conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
 Permute          conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3
 Flatten          conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat
 PriorBox         conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Convolution      conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
 Permute          conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3
 Flatten          conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat
 Convolution      conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
 Permute          conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3
 Flatten          conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat
 PriorBox         conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Convolution      conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
 Permute          conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3
 Flatten          conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat
 Convolution      conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=16128 8=2
 Permute          conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3
 Flatten          conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat
 PriorBox         conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
 Concat           mbox_loc         6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc 0=0
 Concat           mbox_conf        6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf 0=0
 Concat           mbox_priorbox    6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1
 Reshape          mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
 Softmax          mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
 Flatten          mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
 DetectionOutput  detection_out    3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.250000
--- a/benchmark/resnet18_int8.param
+++ b/benchmark/resnet18_int8.param
@@ -0,0 +1,103 @@
 7767517
 101 109
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=0 6=9408 8=2
 BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
 Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
 ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
 Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
 Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
 BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=64
 Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=64 1=1
 Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
 Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
 ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
 Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
 Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
 Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -23301=0
 ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
 Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
 Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
 Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
 ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
 Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
 Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
 Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -23301=0
 ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
 Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
 Convolution      res3a_branch1    1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1 0=128 1=1 2=1 3=2 4=0 5=0 6=8192 8=2
 BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=128
 Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=128 1=1
 Convolution      res3a_branch2a   1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
 BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
 Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
 ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
 Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
 Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
 Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -23301=0
 ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
 Split            splitncnn_3      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
 Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
 Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
 ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
 Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
 Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
 Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -23301=0
 ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
 Split            splitncnn_4      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
 Convolution      res4a_branch1    1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1 0=256 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
 BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=256
 Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=256 1=1
 Convolution      res4a_branch2a   1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
 BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
 Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
 ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
 Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
 Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
 Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -23301=0
 ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
 Split            splitncnn_5      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
 Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
 Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
 ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
 Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
 Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
 Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -23301=0
 ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
 Split            splitncnn_6      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
 Convolution      res5a_branch1    1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
 BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=512
 Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=512 1=1
 Convolution      res5a_branch2a   1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a 0=512 1=3 2=1 3=2 4=1 5=0 6=1179648 8=2
 BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
 Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
 ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
 Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
 Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
 Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -23301=0
 ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
 Split            splitncnn_7      1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
 Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
 Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
 ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
 Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
 Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
 Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -23301=0
 ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
 Pooling          pool5            1 1 res5b_res5b_relu pool5 0=1 1=7 2=1 3=0 4=0
 InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=512000
 Softmax          prob             1 1 fc1000 prob 0=0
--- a/benchmark/resnet50.param
+++ b/benchmark/resnet50.param
@@ -0,0 +1,247 @@
 7767517
 245 261
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408
 BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
 Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
 ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
 Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
 Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
 Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
 Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096
 BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
 Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
 ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
 Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
 BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
 Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
 ReLU             res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
 Convolution      res2a_branch2c   1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2a_branch2c    1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
 Scale            scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
 Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
 ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
 Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
 Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
 Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
 ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
 Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
 BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
 Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
 ReLU             res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
 Convolution      res2b_branch2c   1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2b_branch2c    1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
 Scale            scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
 Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
 ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
 Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
 Convolution      res2c_branch2a   1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2c_branch2a    1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
 Scale            scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
 ReLU             res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
 Convolution      res2c_branch2b   1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
 BatchNorm        bn2c_branch2b    1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
 Scale            scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
 ReLU             res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
 Convolution      res2c_branch2c   1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
 BatchNorm        bn2c_branch2c    1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
 Scale            scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
 Eltwise          res2c            2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
 ReLU             res2c_relu       1 1 res2c res2c_res2c_relu
 Split            splitncnn_3      1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
 Convolution      res3a_branch1    1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072
 BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
 Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
 Convolution      res3a_branch2a   1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768
 BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
 Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
 ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
 Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
 BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
 Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
 ReLU             res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
 Convolution      res3a_branch2c   1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3a_branch2c    1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
 Scale            scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
 Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
 ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
 Split            splitncnn_4      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
 Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
 Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
 ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
 Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
 BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
 Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
 ReLU             res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
 Convolution      res3b_branch2c   1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3b_branch2c    1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
 Scale            scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
 Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
 ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
 Split            splitncnn_5      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
 Convolution      res3c_branch2a   1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3c_branch2a    1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
 Scale            scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
 ReLU             res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
 Convolution      res3c_branch2b   1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
 BatchNorm        bn3c_branch2b    1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
 Scale            scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
 ReLU             res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
 Convolution      res3c_branch2c   1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3c_branch2c    1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
 Scale            scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
 Eltwise          res3c            2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
 ReLU             res3c_relu       1 1 res3c res3c_res3c_relu
 Split            splitncnn_6      1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
 Convolution      res3d_branch2a   1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3d_branch2a    1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
 Scale            scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
 ReLU             res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
 Convolution      res3d_branch2b   1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
 BatchNorm        bn3d_branch2b    1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
 Scale            scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
 ReLU             res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
 Convolution      res3d_branch2c   1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
 BatchNorm        bn3d_branch2c    1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
 Scale            scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
 Eltwise          res3d            2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
 ReLU             res3d_relu       1 1 res3d res3d_res3d_relu
 Split            splitncnn_7      1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
 Convolution      res4a_branch1    1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288
 BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
 Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
 Convolution      res4a_branch2a   1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072
 BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
 Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
 ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
 Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
 Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
 ReLU             res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
 Convolution      res4a_branch2c   1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4a_branch2c    1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
 Scale            scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
 Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
 ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
 Split            splitncnn_8      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
 Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
 Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
 ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
 Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
 Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
 ReLU             res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
 Convolution      res4b_branch2c   1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4b_branch2c    1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
 Scale            scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
 Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
 ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
 Split            splitncnn_9      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
 Convolution      res4c_branch2a   1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4c_branch2a    1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
 Scale            scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
 ReLU             res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
 Convolution      res4c_branch2b   1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4c_branch2b    1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
 Scale            scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
 ReLU             res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
 Convolution      res4c_branch2c   1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4c_branch2c    1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
 Scale            scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
 Eltwise          res4c            2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
 ReLU             res4c_relu       1 1 res4c res4c_res4c_relu
 Split            splitncnn_10     1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
 Convolution      res4d_branch2a   1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4d_branch2a    1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
 Scale            scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
 ReLU             res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
 Convolution      res4d_branch2b   1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4d_branch2b    1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
 Scale            scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
 ReLU             res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
 Convolution      res4d_branch2c   1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4d_branch2c    1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
 Scale            scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
 Eltwise          res4d            2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
 ReLU             res4d_relu       1 1 res4d res4d_res4d_relu
 Split            splitncnn_11     1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
 Convolution      res4e_branch2a   1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4e_branch2a    1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
 Scale            scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
 ReLU             res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
 Convolution      res4e_branch2b   1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4e_branch2b    1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
 Scale            scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
 ReLU             res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
 Convolution      res4e_branch2c   1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4e_branch2c    1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
 Scale            scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
 Eltwise          res4e            2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
 ReLU             res4e_relu       1 1 res4e res4e_res4e_relu
 Split            splitncnn_12     1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
 Convolution      res4f_branch2a   1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4f_branch2a    1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
 Scale            scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
 ReLU             res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
 Convolution      res4f_branch2b   1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
 BatchNorm        bn4f_branch2b    1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
 Scale            scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
 ReLU             res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
 Convolution      res4f_branch2c   1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
 BatchNorm        bn4f_branch2c    1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
 Scale            scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
 Eltwise          res4f            2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
 ReLU             res4f_relu       1 1 res4f res4f_res4f_relu
 Split            splitncnn_13     1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
 Convolution      res5a_branch1    1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152
 BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
 Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
 Convolution      res5a_branch2a   1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288
 BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
 Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
 ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
 Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
 BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
 Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
 ReLU             res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
 Convolution      res5a_branch2c   1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
 BatchNorm        bn5a_branch2c    1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
 Scale            scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
 Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
 ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
 Split            splitncnn_14     1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
 Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
 BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
 Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
 ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
 Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
 BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
 Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
 ReLU             res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
 Convolution      res5b_branch2c   1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
 BatchNorm        bn5b_branch2c    1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
 Scale            scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
 Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
 ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
 Split            splitncnn_15     1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
 Convolution      res5c_branch2a   1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
 BatchNorm        bn5c_branch2a    1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
 Scale            scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
 ReLU             res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
 Convolution      res5c_branch2b   1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
 BatchNorm        bn5c_branch2b    1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
 Scale            scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
 ReLU             res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
 Convolution      res5c_branch2c   1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
 BatchNorm        bn5c_branch2c    1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
 Scale            scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
 Eltwise          res5c            2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
 ReLU             res5c_relu       1 1 res5c res5c_res5c_relu
 Pooling          pool5            1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
 InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=2048000
 Softmax          prob             1 1 fc1000 prob 0=0
--- a/benchmark/resnet50_int8.param
+++ b/benchmark/resnet50_int8.param
@@ -0,0 +1,247 @@
 7767517
 245 261
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
 BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
 Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
 ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
 Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
 Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
 Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
 Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
 Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
 BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
 Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
 ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
 Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
 Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
 ReLU             res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
 Convolution      res2a_branch2c   1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2a_branch2c    1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
 Scale            scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
 Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
 ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
 Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
 Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
 Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
 ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
 Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
 Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
 ReLU             res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
 Convolution      res2b_branch2c   1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2b_branch2c    1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
 Scale            scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
 Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
 ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
 Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
 Convolution      res2c_branch2a   1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2c_branch2a    1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
 Scale            scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
 ReLU             res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
 Convolution      res2c_branch2b   1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
 BatchNorm        bn2c_branch2b    1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
 Scale            scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
 ReLU             res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
 Convolution      res2c_branch2c   1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        bn2c_branch2c    1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
 Scale            scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
 Eltwise          res2c            2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
 ReLU             res2c_relu       1 1 res2c res2c_res2c_relu
 Split            splitncnn_3      1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
 Convolution      res3a_branch1    1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
 BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
 Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
 Convolution      res3a_branch2a   1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
 BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
 Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
 ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
 Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
 Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
 ReLU             res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
 Convolution      res3a_branch2c   1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3a_branch2c    1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
 Scale            scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
 Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
 ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
 Split            splitncnn_4      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
 Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
 Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
 ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
 Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
 Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
 ReLU             res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
 Convolution      res3b_branch2c   1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3b_branch2c    1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
 Scale            scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
 Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
 ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
 Split            splitncnn_5      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
 Convolution      res3c_branch2a   1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3c_branch2a    1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
 Scale            scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
 ReLU             res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
 Convolution      res3c_branch2b   1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3c_branch2b    1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
 Scale            scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
 ReLU             res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
 Convolution      res3c_branch2c   1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3c_branch2c    1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
 Scale            scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
 Eltwise          res3c            2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
 ReLU             res3c_relu       1 1 res3c res3c_res3c_relu
 Split            splitncnn_6      1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
 Convolution      res3d_branch2a   1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3d_branch2a    1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
 Scale            scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
 ReLU             res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
 Convolution      res3d_branch2b   1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
 BatchNorm        bn3d_branch2b    1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
 Scale            scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
 ReLU             res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
 Convolution      res3d_branch2c   1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
 BatchNorm        bn3d_branch2c    1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
 Scale            scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
 Eltwise          res3d            2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
 ReLU             res3d_relu       1 1 res3d res3d_res3d_relu
 Split            splitncnn_7      1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
 Convolution      res4a_branch1    1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
 BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
 Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
 Convolution      res4a_branch2a   1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
 BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
 Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
 ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
 Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
 Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
 ReLU             res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
 Convolution      res4a_branch2c   1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4a_branch2c    1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
 Scale            scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
 Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
 ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
 Split            splitncnn_8      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
 Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
 Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
 ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
 Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
 Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
 ReLU             res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
 Convolution      res4b_branch2c   1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4b_branch2c    1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
 Scale            scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
 Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
 ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
 Split            splitncnn_9      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
 Convolution      res4c_branch2a   1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4c_branch2a    1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
 Scale            scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
 ReLU             res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
 Convolution      res4c_branch2b   1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4c_branch2b    1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
 Scale            scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
 ReLU             res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
 Convolution      res4c_branch2c   1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4c_branch2c    1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
 Scale            scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
 Eltwise          res4c            2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
 ReLU             res4c_relu       1 1 res4c res4c_res4c_relu
 Split            splitncnn_10     1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
 Convolution      res4d_branch2a   1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4d_branch2a    1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
 Scale            scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
 ReLU             res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
 Convolution      res4d_branch2b   1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4d_branch2b    1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
 Scale            scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
 ReLU             res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
 Convolution      res4d_branch2c   1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4d_branch2c    1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
 Scale            scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
 Eltwise          res4d            2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
 ReLU             res4d_relu       1 1 res4d res4d_res4d_relu
 Split            splitncnn_11     1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
 Convolution      res4e_branch2a   1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4e_branch2a    1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
 Scale            scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
 ReLU             res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
 Convolution      res4e_branch2b   1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4e_branch2b    1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
 Scale            scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
 ReLU             res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
 Convolution      res4e_branch2c   1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4e_branch2c    1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
 Scale            scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
 Eltwise          res4e            2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
 ReLU             res4e_relu       1 1 res4e res4e_res4e_relu
 Split            splitncnn_12     1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
 Convolution      res4f_branch2a   1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4f_branch2a    1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
 Scale            scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
 ReLU             res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
 Convolution      res4f_branch2b   1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
 BatchNorm        bn4f_branch2b    1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
 Scale            scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
 ReLU             res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
 Convolution      res4f_branch2c   1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
 BatchNorm        bn4f_branch2c    1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
 Scale            scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
 Eltwise          res4f            2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
 ReLU             res4f_relu       1 1 res4f res4f_res4f_relu
 Split            splitncnn_13     1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
 Convolution      res5a_branch1    1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 8=2
 BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
 Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
 Convolution      res5a_branch2a   1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
 BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
 Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
 ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
 Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
 Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
 ReLU             res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
 Convolution      res5a_branch2c   1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        bn5a_branch2c    1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
 Scale            scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
 Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
 ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
 Split            splitncnn_14     1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
 Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
 Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
 ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
 Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
 Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
 ReLU             res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
 Convolution      res5b_branch2c   1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        bn5b_branch2c    1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
 Scale            scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
 Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
 ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
 Split            splitncnn_15     1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
 Convolution      res5c_branch2a   1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        bn5c_branch2a    1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
 Scale            scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
 ReLU             res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
 Convolution      res5c_branch2b   1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
 BatchNorm        bn5c_branch2b    1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
 Scale            scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
 ReLU             res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
 Convolution      res5c_branch2c   1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
 BatchNorm        bn5c_branch2c    1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
 Scale            scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
 Eltwise          res5c            2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
 ReLU             res5c_relu       1 1 res5c res5c_res5c_relu
 Pooling          pool5            1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
 InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=2048000
 Softmax          prob             1 1 fc1000 prob 0=0
--- a/benchmark/squeezenet_int8.param
+++ b/benchmark/squeezenet_int8.param
@@ -0,0 +1,77 @@
 7767517
 75 83
 Input            data             0 1 data 0=227 1=227 2=3
 Convolution      conv1            1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
 ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1
 Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
 Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
 Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
 Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
 Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
 ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
 Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
 Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
 ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
 Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
 Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
 Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
 ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
 Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
 Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
 Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
 Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
 Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
 Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
 Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
 Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
 ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
 Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
 Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
 Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
 Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
 Pooling          pool5            1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
 Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
 Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
 Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
 ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
 Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
 ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
 Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
 Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
 ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
 Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
 Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
 ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
 Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
 ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
 Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
 Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
 ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
 Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
 Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
 Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
 ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
 Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
 Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
 Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
 Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1
 Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
 ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3
 Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
 Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
 Convolution      conv10           1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 8=2
 ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10
 Pooling          pool10           1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
 Softmax          prob             1 1 pool10 prob 0=0
--- a/benchmark/squeezenet_ssd_int8.param
+++ b/benchmark/squeezenet_ssd_int8.param
@@ -0,0 +1,181 @@
 7767517
 179 212
 Input            data             0 1 data 0=300 1=300 2=3
 Split            splitncnn_0      1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
 Convolution      conv1            1 1 data_splitncnn_6 conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
 ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1
 Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
 Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
 Split            splitncnn_1      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
 Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
 Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
 ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
 Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
 Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
 ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
 Split            splitncnn_2      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
 Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
 ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
 Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
 ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
 Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
 Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
 Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
 Split            splitncnn_3      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
 Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
 Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
 Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
 Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
 ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
 Split            splitncnn_4      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
 Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
 ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
 Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
 Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
 Split            splitncnn_5      1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1
 Pooling          pool5            1 1 fire5/concat_splitncnn_1 pool5 0=0 1=3 2=2 3=0 4=0
 Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
 ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
 Split            splitncnn_6      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
 Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
 ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
 Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
 ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
 Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
 Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
 ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
 Split            splitncnn_7      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
 Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
 ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
 Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
 ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
 Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
 Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
 ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
 Split            splitncnn_8      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
 Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
 Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
 ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
 Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
 Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
 ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
 Split            splitncnn_9      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
 Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
 BatchNorm        fire9/expand1x1/bn 1 1 fire9/expand1x1 fire9/expand1x1_fire9/expand1x1/bn 0=256
 Scale            fire9/expand1x1/scale 1 1 fire9/expand1x1_fire9/expand1x1/bn fire9/expand1x1_fire9/expand1x1/scale 0=256 1=1
 ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1_fire9/expand1x1/scale fire9/expand1x1_fire9/relu_expand1x1
 Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
 BatchNorm        fire9/expand3x3/bn 1 1 fire9/expand3x3 fire9/expand3x3_fire9/expand3x3/bn 0=256
 Scale            fire9/expand3x3/scale 1 1 fire9/expand3x3_fire9/expand3x3/bn fire9/expand3x3_fire9/expand3x3/scale 0=256 1=1
 ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3_fire9/expand3x3/scale fire9/expand3x3_fire9/relu_expand3x3
 Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
 Split            splitncnn_10     1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3
 Pooling          pool9            1 1 fire9/concat_splitncnn_3 pool9 0=0 1=3 2=2 3=0 4=0
 Convolution      fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=49152 8=2
 BatchNorm        fire10/squeeze1x1/bn 1 1 fire10/squeeze1x1 fire10/squeeze1x1_fire10/squeeze1x1/bn 0=96
 Scale            fire10/squeeze1x1/scale 1 1 fire10/squeeze1x1_fire10/squeeze1x1/bn fire10/squeeze1x1_fire10/squeeze1x1/scale 0=96 1=1
 ReLU             fire10/relu_squeeze1x1 1 1 fire10/squeeze1x1_fire10/squeeze1x1/scale fire10/squeeze1x1_fire10/relu_squeeze1x1
 Split            splitncnn_11     1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1
 Convolution      fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
 BatchNorm        fire10/expand1x1/bn 1 1 fire10/expand1x1 fire10/expand1x1_fire10/expand1x1/bn 0=384
 Scale            fire10/expand1x1/scale 1 1 fire10/expand1x1_fire10/expand1x1/bn fire10/expand1x1_fire10/expand1x1/scale 0=384 1=1
 ReLU             fire10/relu_expand1x1 1 1 fire10/expand1x1_fire10/expand1x1/scale fire10/expand1x1_fire10/relu_expand1x1
 Convolution      fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
 BatchNorm        fire10/expand3x3/bn 1 1 fire10/expand3x3 fire10/expand3x3_fire10/expand3x3/bn 0=384
 Scale            fire10/expand3x3/scale 1 1 fire10/expand3x3_fire10/expand3x3/bn fire10/expand3x3_fire10/expand3x3/scale 0=384 1=1
 ReLU             fire10/relu_expand3x3 1 1 fire10/expand3x3_fire10/expand3x3/scale fire10/expand3x3_fire10/relu_expand3x3
 Concat           fire10/concat    2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat 0=0
 Split            splitncnn_12     1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3
 Pooling          pool10           1 1 fire10/concat_splitncnn_3 pool10 0=0 1=3 2=2 3=0 4=0
 Convolution      fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
 BatchNorm        fire11/squeeze1x1/bn 1 1 fire11/squeeze1x1 fire11/squeeze1x1_fire11/squeeze1x1/bn 0=96
 Scale            fire11/squeeze1x1/scale 1 1 fire11/squeeze1x1_fire11/squeeze1x1/bn fire11/squeeze1x1_fire11/squeeze1x1/scale 0=96 1=1
 ReLU             fire11/relu_squeeze1x1 1 1 fire11/squeeze1x1_fire11/squeeze1x1/scale fire11/squeeze1x1_fire11/relu_squeeze1x1
 Split            splitncnn_13     1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1
 Convolution      fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
 BatchNorm        fire11/expand1x1/bn 1 1 fire11/expand1x1 fire11/expand1x1_fire11/expand1x1/bn 0=384
 Scale            fire11/expand1x1/scale 1 1 fire11/expand1x1_fire11/expand1x1/bn fire11/expand1x1_fire11/expand1x1/scale 0=384 1=1
 ReLU             fire11/relu_expand1x1 1 1 fire11/expand1x1_fire11/expand1x1/scale fire11/expand1x1_fire11/relu_expand1x1
 Convolution      fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
 BatchNorm        fire11/expand3x3/bn 1 1 fire11/expand3x3 fire11/expand3x3_fire11/expand3x3/bn 0=384
 Scale            fire11/expand3x3/scale 1 1 fire11/expand3x3_fire11/expand3x3/bn fire11/expand3x3_fire11/expand3x3/scale 0=384 1=1
 ReLU             fire11/relu_expand3x3 1 1 fire11/expand3x3_fire11/expand3x3/scale fire11/expand3x3_fire11/relu_expand3x3
 Concat           fire11/concat    2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat 0=0
 Split            splitncnn_14     1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3
 Convolution      conv12_1         1 1 fire11/concat_splitncnn_3 conv12_1 0=128 1=1 2=1 3=1 4=0 5=0 6=98304 8=2
 BatchNorm        conv12_1/bn      1 1 conv12_1 conv12_1_conv12_1/bn 0=128
 Scale            conv12_1/scale   1 1 conv12_1_conv12_1/bn conv12_1_conv12_1/scale 0=128 1=1
 ReLU             conv12_1/relu    1 1 conv12_1_conv12_1/scale conv12_1_conv12_1/relu
 Convolution      conv12_2         1 1 conv12_1_conv12_1/relu conv12_2 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
 BatchNorm        conv12_2/bn      1 1 conv12_2 conv12_2_conv12_2/bn 0=256
 Scale            conv12_2/scale   1 1 conv12_2_conv12_2/bn conv12_2_conv12_2/scale 0=256 1=1
 ReLU             conv12_2/relu    1 1 conv12_2_conv12_2/scale conv12_2_conv12_2/relu
 Split            splitncnn_15     1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3
 Convolution      conv13_1         1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
 BatchNorm        conv13_1/bn      1 1 conv13_1 conv13_1_conv13_1/bn 0=64
 Scale            conv13_1/scale   1 1 conv13_1_conv13_1/bn conv13_1_conv13_1/scale 0=64 1=1
 ReLU             conv13_1/relu    1 1 conv13_1_conv13_1/scale conv13_1_conv13_1/relu
 Convolution      conv13_2         1 1 conv13_1_conv13_1/relu conv13_2 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
 BatchNorm        conv13_2/bn      1 1 conv13_2 conv13_2_conv13_2/bn 0=128
 Scale            conv13_2/scale   1 1 conv13_2_conv13_2/bn conv13_2_conv13_2/scale 0=128 1=1
 ReLU             conv13_2/relu    1 1 conv13_2_conv13_2/scale conv13_2_conv13_2/relu
 Split            splitncnn_16     1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2
 BatchNorm        fire5/bn         1 1 fire5/concat_splitncnn_0 fire5/normal 0=256
 Scale            fire5/scale      1 1 fire5/normal fire5/normal_fire5/scale 0=256 1=1
 Split            splitncnn_17     1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2
 Convolution      fire5_mbox_loc   1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 Permute          fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3
 Flatten          fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat
 Convolution      fire5_mbox_conf  1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=193536 8=2
 Permute          fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3
 Flatten          fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat
 PriorBox         fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
 Convolution      fire9_mbox_loc   1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
 Permute          fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3
 Flatten          fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat
 Convolution      fire9_mbox_conf  1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=580608 8=2
 Permute          fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3
 Flatten          fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat
 PriorBox         fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
 Convolution      fire10_mbox_loc  1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
 Permute          fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3
 Flatten          fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat
 Convolution      fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
 Permute          fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3
 Flatten          fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat
 PriorBox         fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
 Convolution      fire11_mbox_loc  1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
 Permute          fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3
 Flatten          fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat
 Convolution      fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
 Permute          fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3
 Flatten          fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat
 PriorBox         fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
 Convolution      conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=55296 8=2
 Permute          conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3
 Flatten          conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat
 Convolution      conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=290304 8=2
 Permute          conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3
 Flatten          conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat
 PriorBox         conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
 Convolution      conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=18432 8=2
 Permute          conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3
 Flatten          conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat
 Convolution      conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=96768 8=2
 Permute          conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3
 Flatten          conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat
 PriorBox         conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
 Concat           mbox_loc         6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc 0=0
 Concat           mbox_conf        6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf 0=0
 Concat           mbox_priorbox    6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1
 Reshape          mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
 Softmax          mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
 Flatten          mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
 DetectionOutput  detection_out    3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.050000
--- a/benchmark/vgg16_int8.param
+++ b/benchmark/vgg16_int8.param
@@ -0,0 +1,42 @@
 7767517
 40 40
 Input            data             0 1 data 0=224 1=224 2=3
 Convolution      conv1_1          1 1 data conv1_1 0=64 1=3 2=1 3=1 4=1 5=1 6=1728 8=2
 ReLU             relu1_1          1 1 conv1_1 conv1_1_relu1_1
 Convolution      conv1_2          1 1 conv1_1_relu1_1 conv1_2 0=64 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
 ReLU             relu1_2          1 1 conv1_2 conv1_2_relu1_2
 Pooling          pool1            1 1 conv1_2_relu1_2 pool1 0=0 1=2 2=2 3=0 4=0
 Convolution      conv2_1          1 1 pool1 conv2_1 0=128 1=3 2=1 3=1 4=1 5=1 6=73728 8=2
 ReLU             relu2_1          1 1 conv2_1 conv2_1_relu2_1
 Convolution      conv2_2          1 1 conv2_1_relu2_1 conv2_2 0=128 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
 ReLU             relu2_2          1 1 conv2_2 conv2_2_relu2_2
 Pooling          pool2            1 1 conv2_2_relu2_2 pool2 0=0 1=2 2=2 3=0 4=0
 Convolution      conv3_1          1 1 pool2 conv3_1 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
 ReLU             relu3_1          1 1 conv3_1 conv3_1_relu3_1
 Convolution      conv3_2          1 1 conv3_1_relu3_1 conv3_2 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
 ReLU             relu3_2          1 1 conv3_2 conv3_2_relu3_2
 Convolution      conv3_3          1 1 conv3_2_relu3_2 conv3_3 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
 ReLU             relu3_3          1 1 conv3_3 conv3_3_relu3_3
 Pooling          pool3            1 1 conv3_3_relu3_3 pool3 0=0 1=2 2=2 3=0 4=0
 Convolution      conv4_1          1 1 pool3 conv4_1 0=512 1=3 2=1 3=1 4=1 5=1 6=1179648 8=2
 ReLU             relu4_1          1 1 conv4_1 conv4_1_relu4_1
 Convolution      conv4_2          1 1 conv4_1_relu4_1 conv4_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
 ReLU             relu4_2          1 1 conv4_2 conv4_2_relu4_2
 Convolution      conv4_3          1 1 conv4_2_relu4_2 conv4_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
 ReLU             relu4_3          1 1 conv4_3 conv4_3_relu4_3
 Pooling          pool4            1 1 conv4_3_relu4_3 pool4 0=0 1=2 2=2 3=0 4=0
 Convolution      conv5_1          1 1 pool4 conv5_1 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
 ReLU             relu5_1          1 1 conv5_1 conv5_1_relu5_1
 Convolution      conv5_2          1 1 conv5_1_relu5_1 conv5_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
 ReLU             relu5_2          1 1 conv5_2 conv5_2_relu5_2
 Convolution      conv5_3          1 1 conv5_2_relu5_2 conv5_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
 ReLU             relu5_3          1 1 conv5_3 conv5_3_relu5_3
 Pooling          pool5            1 1 conv5_3_relu5_3 pool5 0=0 1=2 2=2 3=0 4=0
 InnerProduct     fc6              1 1 pool5 fc6 0=4096 1=1 2=102760448
 ReLU             relu6            1 1 fc6 fc6_relu6
 Dropout          drop6            1 1 fc6_relu6 fc6_drop6
 InnerProduct     fc7              1 1 fc6_drop6 fc7 0=4096 1=1 2=16777216
 ReLU             relu7            1 1 fc7 fc7_relu7
 Dropout          drop7            1 1 fc7_relu7 fc7_drop7
 InnerProduct     fc8              1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000
 Softmax          prob             1 1 fc8 prob 0=0
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -183,6 +183,7 @@ ncnn_add_layer(Yolov3DetectionOutput)
 ncnn_add_layer(PSROIPooling)
 ncnn_add_layer(ROIAlign OFF)
 ncnn_add_layer(Packing)
 ncnn_add_layer(Requantize)

 # message("SHADER_SPV_HEX_FILES = ${SHADER_SPV_HEX_FILES}")
 add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES})
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -55,14 +55,14 @@ double get_current_time()

 // Writes one timing row for `layer` to stderr: the layer type, the layer name,
 // and the elapsed time computed as `end - start` (printed with an "ms" suffix),
 // followed by a "    |" separator and a newline.
 // NOTE(review): the two consecutive fprintf calls below differ only in the
 // width of the name column (24 vs 30). This looks like the pre- and post-change
 // lines of a diff hunk shown together -- confirm which one is intended.
 void benchmark(const Layer* layer, double start, double end)
 {
    fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
    fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
    // No feature-map details are printed by this overload; close the row.
    fprintf(stderr, "    |");
    fprintf(stderr, "\n");
 }

 void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end)
 {
    fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
    fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
    fprintf(stderr, "    |    feature_map: %4d x %-4d    inch: %4d    outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c);
    if (layer->type == "Convolution")
    {
--- a/src/layer/arm/convolution_1x1_int8.h
+++ b/src/layer/arm/convolution_1x1_int8.h
--- a/src/layer/arm/convolution_3x3_int8.h
+++ b/src/layer/arm/convolution_3x3_int8.h
--- a/src/layer/arm/convolution_5x5_int8.h
+++ b/src/layer/arm/convolution_5x5_int8.h
@@ -0,0 +1,35 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // int8 convolution entry point for a 5x5 kernel with stride 1.
 // The blobs, the kernel Mat and the options are forwarded untouched to
 // conv_im2col_sgemm_int8_neon; this wrapper only pins the kernel and stride sizes.
 static void conv5x5s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    // Fixed geometry for this specialization (width first, then height).
    int ksize_x = 5;
    int ksize_y = 5;
    int step_x = 1;
    int step_y = 1;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
                                ksize_x, ksize_y, step_x, step_y, opt);
 }

 // int8 convolution entry point for a 5x5 kernel with stride 2.
 // Delegates all work to conv_im2col_sgemm_int8_neon, supplying the fixed
 // kernel and stride dimensions alongside the caller's arguments.
 static void conv5x5s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    // Fixed geometry for this specialization (width first, then height).
    int ksize_x = 5;
    int ksize_y = 5;
    int step_x = 2;
    int step_y = 2;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
                                ksize_x, ksize_y, step_x, step_y, opt);
 }
--- a/src/layer/arm/convolution_7x7_int8.h
+++ b/src/layer/arm/convolution_7x7_int8.h
@@ -0,0 +1,35 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // int8 convolution entry point for a 7x7 kernel with stride 1.
 // The blobs, the kernel Mat and the options are passed straight through to
 // conv_im2col_sgemm_int8_neon; only the kernel/stride sizes are chosen here.
 static void conv7x7s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    // Fixed geometry for this specialization (width first, then height).
    int ksize_x = 7;
    int ksize_y = 7;
    int step_x = 1;
    int step_y = 1;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
                                ksize_x, ksize_y, step_x, step_y, opt);
 }

 // int8 convolution entry point for a 7x7 kernel with stride 2.
 // Delegates all work to conv_im2col_sgemm_int8_neon, supplying the fixed
 // kernel and stride dimensions alongside the caller's arguments.
 static void conv7x7s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    // Fixed geometry for this specialization (width first, then height).
    int ksize_x = 7;
    int ksize_y = 7;
    int step_x = 2;
    int step_y = 2;

    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
                                ksize_x, ksize_y, step_x, step_y, opt);
 }
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -14,6 +14,8 @@

 #include "convolution_arm.h"

 #include "benchmark.h"

 namespace ncnn {

 #include "convolution_1x1.h"
@@ -24,8 +26,11 @@ namespace ncnn {
 #include "convolution_7x7.h"

 #if __ARM_NEON
 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
 #include "convolution_5x5_int8.h"
 #include "convolution_7x7_int8.h"
 #endif // __ARM_NEON

 DEFINE_LAYER_CREATOR(Convolution_arm)
@@ -66,9 +71,12 @@ int Convolution_arm::load_model(const ModelBin& mb)

    if (use_int8_inference)
    {
 #if __ARM_NEON
 #if !__aarch64__
        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        if (use_winograd3x3)
        {
            int num_input = weight_data_size / 9 / num_output;
            conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            int num_input = weight_data_size / 9 / num_output;
            conv3x3s1_transform_kernel_int8_neon(weight_data, weight_3x3s1_int8_data, num_input, num_output);
@@ -78,16 +86,15 @@ int Convolution_arm::load_model(const ModelBin& mb)
        {
            int num_input = weight_data_size / 9 / num_output;
            conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output);
        }   
        }

        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            int num_input = weight_data_size / num_output;
            conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output);
            use_sgemm1x1 = true;
        }        
 #endif // !__aarch64__
 #endif // __ARM_NEON
        }
        
        return 0;
    }

@@ -233,7 +240,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    }

    const int kernel_size = kernel_w;
    const int stride = stride_w;
    //const int stride = stride_w;
    int stride = stride_w;

    if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
    {
@@ -293,43 +301,50 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

 #if __ARM_NEON
    // kernel_size x stride
    conv_int8_func conv_int8_func_table[5][5] =
    conv_int8_func conv_int8_func_table[7][4] =
    {
        {
            conv1x1s1_int8_neon,
            conv1x1s2_int8_neon,
            0,
            0,
            0
        }, // kernel_size = 1
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 2
        {
            conv3x3s1_int8_neon,
            conv3x3s2_int8_neon,
            0,
            0,
            0
        }, // kernel_size = 3
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 4
        {
            conv5x5s1_int8_neon,
            conv5x5s2_int8_neon,
            0,
            0
        }, // kernel_size = 5
        {
            0,
            0,
            0,
            0
        }  // kernel_size = 5
        }, // kernel_size = 6
        {            
            conv7x7s1_int8_neon,           
            conv7x7s2_int8_neon,
            0,
            0
        }  // kernel_size = 7                
    };
 #endif // __ARM_NEON

@@ -384,9 +399,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
            opt_g.blob_allocator = bottom_blob_int8.allocator;

            quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
        }
        }       

        bottom_blob_unbordered = bottom_blob_int8;
        bottom_blob_unbordered = bottom_blob_int8;       
    }

    Mat bottom_blob_bordered = bottom_blob_unbordered;
@@ -423,34 +438,90 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option

    if (use_int8_inference)
    {
 #if __ARM_NEON
 #if !__aarch64__
        if (use_sgemm1x1)
        {
            conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        if (use_int8_requantize == true)
        {
            conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
            Mat top_blob_tm;
            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
            if (top_blob_tm.empty())
                return -100;
            
            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
            if (top_blob.empty())
                return -100; 

            if (use_sgemm1x1)
            {
                conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_1x1s1_sgemm_int8_data, opt);
            }
            else if (use_winograd3x3)
            {
                conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt);
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s1_int8_data, opt);
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt);
            }        
            else
            {
                conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);
            }

            // requantize, reverse scale inplace
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = top_blob.allocator;

                Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
                Mat top_blob_g = top_blob.channel_range(p, 1);
                requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
            }          
        }
        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
        }        
        else
 #endif // !__aarch64__
 #endif // __ARM_NEON
        {
            conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
        }
            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
            if (top_blob.empty())
                return -100; 

        // dequantize, reverse scale inplace
        {
            ncnn::Option opt_g = opt;
            opt_g.blob_allocator = top_blob.allocator;
            if (use_sgemm1x1)
            {
                conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
            }
            else if (use_winograd3x3)
            {
                conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt);
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
            }        
            else
            {
                conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
            }          

            dequantize->forward_inplace(top_blob, opt_g);
        }
            // dequantize, reverse scale inplace
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = top_blob.allocator;

                Mat top_blob_g = top_blob.channel_range(p, 1);
                dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
            }           
        } 

        return 0;
    }
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -40,6 +40,8 @@ public:
    Mat weight_3x3s1_int8_data;
    Mat weight_3x3s2_int8_data;
    Mat weight_1x1s1_sgemm_int8_data;
    Mat weight_3x3_winograd23_data;
    std::vector<Mat> weight_3x3_winograd23_int8_data;
 };

 } // namespace ncnn
--- a/src/layer/arm/convolution_sgemm_int8.h
+++ b/src/layer/arm/convolution_sgemm_int8.h
--- a/src/layer/arm/convolutiondepthwise_3x3_int8.h
+++ b/src/layer/arm/convolutiondepthwise_3x3_int8.h
@@ -16,347 +16,6 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON

 #if __aarch64__
 // Multiply-accumulate eight int8 lanes into a pair of int32x4 accumulators.
 // A single int8*int8 product always fits in int16 (|x*k| <= 16384), so each
 // product is widened on its own and the running sum is kept in 32 bits.
 // Summing the nine taps of a 3x3 window in int16 can exceed 32767 and wrap.
 static inline void convdw3x3s1_int8_mac_s32(int32x4_t& acc_lo, int32x4_t& acc_hi, int8x8_t x, int8x8_t k)
 {
    int16x8_t prod = vmull_s8(x, k);
    acc_lo = vaddw_s16(acc_lo, vget_low_s16(prod));
    acc_hi = vaddw_s16(acc_hi, vget_high_s16(prod));
 }

 // Depthwise 3x3 convolution, stride 1, int8 input -> int32 output (aarch64 NEON).
 //
 // bottom_blob : signed char input, read one channel per output channel.
 //               The row stepping (r += 2 + w after each pair of output rows)
 //               assumes w == outw + 2, i.e. the input is already padded -- TODO confirm with caller.
 // top_blob    : written through int*; must already be allocated as
 //               outw x outh x outch with 4-byte elements.
 // _kernel     : 9 signed char weights per channel, laid out contiguously.
 // opt         : only num_threads is used (OpenMP over channels).
 //
 // Two output rows are produced per iteration while rows remain, then a
 // single-row loop handles an odd trailing row. Eight outputs per row are
 // computed with NEON; the remaining (outw & 7) columns use scalar code.
 // Both paths accumulate in 32 bits so they always agree.
 //
 // NOTE(review): each vector step loads 16 bytes per input row although only
 // 10 are consumed, so up to 6 bytes past the window are read -- presumably
 // covered by the Mat allocation padding, confirm.
 static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const signed char* kernel = (const signed char *)_kernel + p*9;

        int* outptr0 = out;
        int* outptr0n = outptr0 + outw;

        const signed char* img0 = bottom_blob.channel(p);

        const signed char* r0 = img0;
        const signed char* r1 = img0 + w;
        const signed char* r2 = img0 + w*2;
        const signed char* r3 = img0 + w*3;

        int i = 0;

        // broadcast each of the nine weights across a vector
        int8x8_t _k0 = vdup_n_s8(kernel[0]);
        int8x8_t _k1 = vdup_n_s8(kernel[1]);
        int8x8_t _k2 = vdup_n_s8(kernel[2]);

        int8x8_t _k3 = vdup_n_s8(kernel[3]);
        int8x8_t _k4 = vdup_n_s8(kernel[4]);
        int8x8_t _k5 = vdup_n_s8(kernel[5]);

        int8x8_t _k6 = vdup_n_s8(kernel[6]);
        int8x8_t _k7 = vdup_n_s8(kernel[7]);
        int8x8_t _k8 = vdup_n_s8(kernel[8]);

        // two output rows at a time: rows r0..r2 feed row i, rows r1..r3 feed row i+1
        for (; i+1 < outh; i+=2)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn >0; nn--)
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum0n = vdupq_n_s32(0);
                int32x4_t _sum1 = vdupq_n_s32(0);
                int32x4_t _sum1n = vdupq_n_s32(0);

                // input row 0 -> kernel row 0 of output row i
                int8x8_t _r0 = vld1_s8(r0);
                int8x8_t _r0n = vld1_s8(r0+8);
                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r0, _k0);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r01, _k1);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r02, _k2);

                // input row 1 -> kernel row 1 of output row i, kernel row 0 of output row i+1
                int8x8_t _r1 = vld1_s8(r1);
                int8x8_t _r1n = vld1_s8(r1+8);
                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r1, _k3);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r11, _k4);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r12, _k5);

                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r1, _k0);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r11, _k1);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r12, _k2);

                // input row 2 -> kernel row 2 of output row i, kernel row 1 of output row i+1
                int8x8_t _r2 = vld1_s8(r2);
                int8x8_t _r2n = vld1_s8(r2+8);
                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r2, _k6);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r21, _k7);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r22, _k8);

                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r2, _k3);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r21, _k4);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r22, _k5);

                // input row 3 -> kernel row 2 of output row i+1
                int8x8_t _r3 = vld1_s8(r3);
                int8x8_t _r3n = vld1_s8(r3+8);
                int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
                int8x8_t _r32 = vext_s8(_r3, _r3n, 2);

                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r3, _k6);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r31, _k7);
                convdw3x3s1_int8_mac_s32(_sum1, _sum1n, _r32, _k8);

                vst1q_s32(outptr0, _sum0);
                vst1q_s32(outptr0+4, _sum0n);

                vst1q_s32(outptr0n, _sum1);
                vst1q_s32(outptr0n+4, _sum1n);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                r3 += 8;
                outptr0 += 8;
                outptr0n += 8;
            }

            // scalar tail for the last (outw & 7) columns
            for (; remain>0; remain--)
            {
                int sum0 = 0;
                int sum0n = 0;

                sum0 += (int)r0[0] * kernel[0];
                sum0 += (int)r0[1] * kernel[1];
                sum0 += (int)r0[2] * kernel[2];
                sum0 += (int)r1[0] * kernel[3];
                sum0 += (int)r1[1] * kernel[4];
                sum0 += (int)r1[2] * kernel[5];
                sum0 += (int)r2[0] * kernel[6];
                sum0 += (int)r2[1] * kernel[7];
                sum0 += (int)r2[2] * kernel[8];

                sum0n += (int)r1[0] * kernel[0];
                sum0n += (int)r1[1] * kernel[1];
                sum0n += (int)r1[2] * kernel[2];
                sum0n += (int)r2[0] * kernel[3];
                sum0n += (int)r2[1] * kernel[4];
                sum0n += (int)r2[2] * kernel[5];
                sum0n += (int)r3[0] * kernel[6];
                sum0n += (int)r3[1] * kernel[7];
                sum0n += (int)r3[2] * kernel[8];

                *outptr0 = sum0;
                *outptr0n = sum0n;

                r0++;
                r1++;
                r2++;
                r3++;
                outptr0++;
                outptr0n++;
            }

            // skip the 2 trailing border columns plus one full input row,
            // because two output rows were produced in this iteration
            r0 += 2 + w;
            r1 += 2 + w;
            r2 += 2 + w;
            r3 += 2 + w;

            outptr0 += outw;
            outptr0n += outw;
        }

        // odd trailing output row
        for (; i < outh; i++)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn >0; nn--)
            {
                int32x4_t _sum0 = vdupq_n_s32(0);
                int32x4_t _sum0n = vdupq_n_s32(0);

                int8x8_t _r0 = vld1_s8(r0);
                int8x8_t _r0n = vld1_s8(r0+8);
                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r0, _k0);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r01, _k1);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r02, _k2);

                int8x8_t _r1 = vld1_s8(r1);
                int8x8_t _r1n = vld1_s8(r1+8);
                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r1, _k3);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r11, _k4);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r12, _k5);

                int8x8_t _r2 = vld1_s8(r2);
                int8x8_t _r2n = vld1_s8(r2+8);
                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);

                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r2, _k6);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r21, _k7);
                convdw3x3s1_int8_mac_s32(_sum0, _sum0n, _r22, _k8);

                vst1q_s32(outptr0, _sum0);
                vst1q_s32(outptr0+4, _sum0n);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr0 += 8;
            }

            for (; remain>0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr0 = sum;

                r0++;
                r1++;
                r2++;
                outptr0++;
            }

            // skip the 2 trailing border columns to reach the next input row
            r0 += 2;
            r1 += 2;
            r2 += 2;
        }
    }
 }

 // Multiply-accumulate eight int8 lanes into a pair of int32x4 accumulators.
 // A single int8*int8 product always fits in int16 (|x*k| <= 16384), so each
 // product is widened on its own and the running sum is kept in 32 bits.
 // Summing the nine taps of a 3x3 window in int16 can exceed 32767 and wrap.
 static inline void convdw3x3s2_int8_mac_s32(int32x4_t& acc_lo, int32x4_t& acc_hi, int8x8_t x, int8x8_t k)
 {
    int16x8_t prod = vmull_s8(x, k);
    acc_lo = vaddw_s16(acc_lo, vget_low_s16(prod));
    acc_hi = vaddw_s16(acc_hi, vget_high_s16(prod));
 }

 // Depthwise 3x3 convolution, stride 2, int8 input -> int32 output (aarch64 NEON).
 //
 // bottom_blob : signed char input, read one channel per output channel
 //               (presumably already padded by the caller -- confirm).
 // top_blob    : written through int*; must already be allocated as
 //               outw x outh x outch with 4-byte elements.
 // _kernel     : 9 signed char weights per channel, laid out contiguously.
 // opt         : only num_threads is used (OpenMP over channels).
 //
 // Eight outputs per row are computed with NEON using de-interleaving loads
 // (even columns / odd columns); the remaining (outw & 7) columns use scalar
 // code. Both paths accumulate in 32 bits so they always agree.
 //
 // NOTE(review): each vector step loads 32 bytes per input row although only
 // 17 are consumed -- presumably covered by the Mat allocation padding, confirm.
 static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // after consuming 2*outw columns: finish the current input row and skip the next one
    const int tailstep = w - 2*outw + w;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p=0; p<outch; p++)
    {
        Mat out = top_blob.channel(p);

        const signed char* kernel = (const signed char*)_kernel + p*9;

        int* outptr = out;

        const signed char* img = bottom_blob.channel(p);

        const signed char* r0 = img;
        const signed char* r1 = img + w;
        const signed char* r2 = img + w*2;

        int i = 0;

        // broadcast each of the nine weights across a vector
        int8x8_t _k0 = vdup_n_s8(kernel[0]);
        int8x8_t _k1 = vdup_n_s8(kernel[1]);
        int8x8_t _k2 = vdup_n_s8(kernel[2]);
        int8x8_t _k3 = vdup_n_s8(kernel[3]);
        int8x8_t _k4 = vdup_n_s8(kernel[4]);
        int8x8_t _k5 = vdup_n_s8(kernel[5]);
        int8x8_t _k6 = vdup_n_s8(kernel[6]);
        int8x8_t _k7 = vdup_n_s8(kernel[7]);
        int8x8_t _k8 = vdup_n_s8(kernel[8]);

        for (; i < outh; i++)
        {
            int nn = outw >> 3;
            int remain = outw & 7;

            for (; nn > 0; nn--)
            {
                int32x4_t _sum = vdupq_n_s32(0);
                int32x4_t _sumn = vdupq_n_s32(0);

                // val[0] holds columns 0,2,..,14 and val[1] columns 1,3,..,15;
                // shifting the even lanes by one gives columns 2,4,..,16
                int8x8x2_t _r0 = vld2_s8(r0);
                int8x8x2_t _r0n = vld2_s8(r0+16);
                int8x8_t _r00 = _r0.val[0];
                int8x8_t _r01 = _r0.val[1];
                int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1);

                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r00, _k0);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r01, _k1);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r02, _k2);

                int8x8x2_t _r1 = vld2_s8(r1);
                int8x8x2_t _r1n = vld2_s8(r1+16);
                int8x8_t _r10 = _r1.val[0];
                int8x8_t _r11 = _r1.val[1];
                int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1);

                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r10, _k3);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r11, _k4);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r12, _k5);

                int8x8x2_t _r2 = vld2_s8(r2);
                int8x8x2_t _r2n = vld2_s8(r2+16);
                int8x8_t _r20 = _r2.val[0];
                int8x8_t _r21 = _r2.val[1];
                int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1);

                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r20, _k6);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r21, _k7);
                convdw3x3s2_int8_mac_s32(_sum, _sumn, _r22, _k8);

                vst1q_s32(outptr, _sum);
                vst1q_s32(outptr+4, _sumn);

                r0 += 16;
                r1 += 16;
                r2 += 16;
                outptr += 8;
            }

            // scalar tail for the last (outw & 7) columns
            for (; remain>0; remain--)
            {
                int sum = 0;

                sum += (int)r0[0] * kernel[0];
                sum += (int)r0[1] * kernel[1];
                sum += (int)r0[2] * kernel[2];
                sum += (int)r1[0] * kernel[3];
                sum += (int)r1[1] * kernel[4];
                sum += (int)r1[2] * kernel[5];
                sum += (int)r2[0] * kernel[6];
                sum += (int)r2[1] * kernel[7];
                sum += (int)r2[2] * kernel[8];

                *outptr = sum;

                r0 += 2;
                r1 += 2;
                r2 += 2;
                outptr++;
            }

            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
 }
 #else // __aarch64__
 static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
@@ -824,5 +483,3 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M
        }
    }
 }

 #endif
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -13,7 +13,7 @@
 // specific language governing permissions and limitations under the License.

 #include "convolutiondepthwise_arm.h"

 #include "benchmark.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -147,6 +147,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
    Mat bottom_blob_unbordered = bottom_blob;
    if (use_int8_inference && elemsize != 1)
    {
        // start = ncnn::get_current_time();

        Mat bottom_blob_int8;
        bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
        if (bottom_blob_int8.empty())
@@ -167,8 +169,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
            quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
        }

        bottom_blob_unbordered = bottom_blob_int8;
    }
        bottom_blob_unbordered = bottom_blob_int8;    
    }    

    Mat bottom_blob_bordered = bottom_blob_unbordered;
    if (pad_w > 0 || pad_h > 0)
@@ -211,25 +213,67 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
            {
                if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
                {
                    if (stride_w == 1 && stride_h == 1)
                    if (use_int8_requantize)
                    {
                        convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
                        Mat top_blob_tm;
                        top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
                        if (top_blob_tm.empty())
                            return -100;
                        
                        top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
                        if (top_blob.empty())
                            return -100;

                        if (stride_w == 1 && stride_h == 1)
                        {
                            convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
                        }
                        else if (stride_w == 2 && stride_h == 2)
                        {
                            convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
                        }                     

                        // requantize, reverse scale inplace
                        #pragma omp parallel for num_threads(opt.num_threads)
                        for (int g=0; g<group; g++)
                        {
                            ncnn::Option opt_g = opt;
                            opt_g.num_threads = 1;
                            opt_g.blob_allocator = top_blob.allocator;

                            Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
                            Mat top_blob_g = top_blob.channel_range(g, 1);
                            requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
                        }                
                    }
                    else if (stride_w == 2 && stride_h == 2)
                    else
                    {
                        convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
                    }

                    // dequantize, reverse scale inplace
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = top_blob.allocator;

                        Mat top_blob_g = top_blob.channel_range(g, 1);
                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
                        // start = ncnn::get_current_time();

                        top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
                        if (top_blob.empty())
                            return -100;

                        if (stride_w == 1 && stride_h == 1)
                        {
                            convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
                        }
                        else if (stride_w == 2 && stride_h == 2)
                        {
                            convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
                        }                        

                        // dequantize, reverse scale inplace
                        #pragma omp parallel for num_threads(opt.num_threads)
                        for (int g=0; g<group; g++)
                        {
                            ncnn::Option opt_g = opt;
                            opt_g.num_threads = 1;
                            opt_g.blob_allocator = top_blob.allocator;

                            Mat top_blob_g = top_blob.channel_range(g, 1);
                            dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
                        }           
                    }

                    return 0;
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -31,19 +31,6 @@ static inline signed char float2int8(float v)

 int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
 #if !__aarch64__ && __ARM_NEON
    int FPSCR_value = 0;

    asm volatile(
        "vmrs   %0, FPSCR               \n"
        "bic    r10, %0, #0x00c00000    \n"
        "vmsr   FPSCR, r10              \n"
        : "=r"(FPSCR_value)
        :
        : "memory", "r10"
    );
 #endif

    int dims = bottom_blob.dims;

    if (dims == 1)
@@ -200,15 +187,6 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
        }
    }

 #if !__aarch64__ && __ARM_NEON
    asm volatile(
        "vmsr   FPSCR, %0           \n"
        :
        : "r"(FPSCR_value)
        : "memory"
    );
 #endif

    return 0;
 }

--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -22,8 +22,92 @@ namespace ncnn {

 DEFINE_LAYER_CREATOR(ReLU_arm)

 // In-place ReLU for blobs holding signed 8-bit elements.
 // With slope == 0 every negative element of every channel is replaced by 0.
 // NOTE(review): with slope != 0 the blob is left untouched and 0 (success) is
 // still returned -- the leaky variant is not implemented for int8 yet.
 // Only w, h and c of the blob are used, so each channel is treated as w*h bytes.
 // Always returns 0.
 int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (slope == 0.f)
    {
        // channels are independent, so they are distributed over OpenMP threads
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            signed char* ptr = bottom_top_blob.channel(q);

 #if __ARM_NEON
            // nn: number of full 16-byte vectors, remain: leftover bytes handled by the scalar loop
            int nn = size >> 4;
            int remain = size - (nn << 4);
 #else
            int remain = size;
 #endif // __ARM_NEON

 #if __ARM_NEON
 #if __aarch64__
            // max(x, 0) on 16 int8 lanes per iteration
            int8x16_t _zero = vdupq_n_s8(0);
            for (; nn>0; nn--)
            {
                int8x16_t _p = vld1q_s8(ptr);
                _p = vmaxq_s8(_p, _zero);
                vst1q_s8(ptr, _p);

                ptr += 16;
            }
 #else
            // armv7: same operation in assembly. q1 is zeroed once (q0 ^ q0),
            // then each iteration loads 16 bytes, takes max with zero, stores
            // them back with post-increment and decrements nn until it hits 0.
            // The guard is required because the loop body runs before nn is tested.
            // NOTE(review): the ":128" qualifier on the load/store asserts a
            // 16-byte aligned address -- assumes channel data is aligned that way, confirm.
            if (nn > 0)
            {
            asm volatile(
                "veor       q1, q0, q0          \n"
                "0:                             \n"
                "pld        [%1, #128]          \n"
                "vld1.s8    {d0-d1}, [%1 :128]  \n"
                "vmax.s8    q0, q0, q1          \n"
                "subs       %0, #1              \n"
                "vst1.s8    {d0-d1}, [%1 :128]! \n"
                "bne        0b                  \n"
                : "=r"(nn),     // %0
                  "=r"(ptr)     // %1
                : "0"(nn),
                  "1"(ptr)
                : "cc", "memory", "q0", "q1"
            );
            }
 #endif // __aarch64__
 #endif // __ARM_NEON
            // scalar tail (or the whole channel when NEON is unavailable)
            for (; remain>0; remain--)
            {
                if (*ptr < 0)
                    *ptr = 0;

                ptr++;
            }
        }
    }
    else
    {
        // TODO
        // Leaky ReLU on int8 data is not implemented; the sketch below is the
        // float version and cannot be applied to quantized values as-is.
        // #pragma omp parallel for num_threads(opt.num_threads)
        // for (int q=0; q<channels; q++)
        // {
        //     float* ptr = bottom_top_blob.channel(q);

        //     for (int i=0; i<size; i++)
        //     {
        //         if (ptr[i] < 0)
        //             ptr[i] *= slope;
        //     }
        // }
    }

    return 0;
 }

 int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    if (bottom_top_blob.elemsize == 1u)
        return ReLU_arm::forward_inplace_int8(bottom_top_blob, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -23,6 +23,7 @@ class ReLU_arm : public ReLU
 {
 public:
    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
    virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
 };

 } // namespace ncnn
--- a/src/layer/arm/requantize_arm.cpp
+++ b/src/layer/arm/requantize_arm.cpp
@@ -0,0 +1,325 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "requantize_arm.h"

 #include <math.h>

 #if __ARM_NEON
 #include <arm_neon.h>
 #endif // __ARM_NEON

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Requantize_arm)

 // Round a float to the nearest integer (halfway cases away from zero) and
 // saturate the result to the signed 8-bit range [-128, 127].
 //
 // The range check is done on the floating-point value: converting an
 // out-of-range or NaN floating-point value to int is undefined behaviour,
 // so large magnitudes must be clamped before the integer conversion.
 // NaN is mapped to 0.
 static inline signed char float2int8(float v)
 {
    const float rounded = roundf(v);
    if (rounded != rounded) return 0; // NaN never compares equal to itself
    if (rounded >= 127.f) return 127;
    if (rounded <= -128.f) return -128;
    return (signed char)(int)rounded;
 }

 int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 { 
    int dims = bottom_blob.dims;

    if (dims == 1)
    {
        int w = bottom_blob.w;

        const int* intptr = bottom_blob;
        signed char * ptr = top_blob;

        if (bias_term)
        {
            if (bias_data_size > 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
            else
            {
                float bias = bias_data[0];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<w; i++)
            {
                ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
                if (fusion_relu && ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;      

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];

 #if __ARM_NEON
                int nn = size >> 3;
                int remain = size & 7;

 #if __aarch64__
                for (; nn>0; nn--)
                {
                    ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out);
                    ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out);
                    ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out);
                    ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out);
                    ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out);
                    ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out);
                    ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out);
                    ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out);

                    ptr += 8;
                    intptr += 8;
                }
 #else
                if (nn > 0)
                {
                asm volatile(
                    "pld        [%1, #256]          \n"
                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
                    "vdup.f32   q10, %6             \n" //q10 scale_in
                    "vdup.f32   q11, %7             \n" //q11 scale_out
                    "vdup.f32   q12, %8             \n" //q12 bias
                    "0:                             \n"
                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q0, q0            \n" 
                    "vcvt.f32.s32 q1, q1            \n"
                    // top_f32 = top_f32 * scale_int
                    "vmul.f32   q0, q0, q10         \n"
                    "vmul.f32   q1, q1, q10         \n"
                    // top_f32 = top_f32 + bias
                    "vadd.f32   q0, q0, q12         \n"
                    "vadd.f32   q1, q1, q12         \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q11         \n"
                    "vmul.f32   q1, q1, q11         \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0           \n"
                    "vcvtr.s32.f32 s1, s1           \n"
                    "vcvtr.s32.f32 s2, s2           \n"
                    "vcvtr.s32.f32 s3, s3           \n"
                    "vcvtr.s32.f32 s4, s4           \n"
                    "vcvtr.s32.f32 s5, s5           \n"
                    "vcvtr.s32.f32 s6, s6           \n"
                    "vcvtr.s32.f32 s7, s7           \n" 
                    // top_s32 -> top_s16
                    "vqmovn.s32 d4, q0              \n"
                    "vqmovn.s32 d5, q1              \n"
                    "pld        [%1, #256]          \n"
                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
                    // top_s16 -> top_s8
                    "vqmovn.s16   d4, q2            \n"
                    // save top_s8
                    "vst1.8     {d4}, [%2:64]!      \n"
                    "subs       %0, #1              \n"
                    "bne        0b                  \n"
                    "sub        %1, #32             \n"
                    : "=r"(nn),         // %0
                      "=r"(intptr),     // %1
                      "=r"(ptr)         // %2
                    : "0"(nn),
                      "1"(intptr),
                      "2"(ptr),
                      "r"(scale_in),    // %6
                      "r"(scale_out),   // %7
                      "r"(bias)         // %8
                    : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12"
                );
                }
 #endif // __aarch64__           
 #else
                int remain = size;
 #endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    *ptr = float2int8(((*intptr * scale_in) + bias) * scale_out);

                    intptr++;
                    ptr ++;                     
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

 #if __ARM_NEON
                int nn = size >> 3;
                int remain = size & 7;

 #if __aarch64__
                //TODO
                for (; nn>0; nn--)
                {
                    ptr[0] = float2int8(intptr[0] * scale_in * scale_out);
                    ptr[1] = float2int8(intptr[1] * scale_in * scale_out);
                    ptr[2] = float2int8(intptr[2] * scale_in * scale_out);
                    ptr[3] = float2int8(intptr[3] * scale_in * scale_out);
                    ptr[4] = float2int8(intptr[4] * scale_in * scale_out);
                    ptr[5] = float2int8(intptr[5] * scale_in * scale_out);
                    ptr[6] = float2int8(intptr[6] * scale_in * scale_out);
                    ptr[7] = float2int8(intptr[7] * scale_in * scale_out);

                    ptr += 8;
                    intptr += 8;
                }                
 #else
                if (nn > 0)
                {
                asm volatile(
                    "pld        [%1, #256]          \n"
                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
                    "vdup.f32   q10, %6             \n" //q10 scale_in
                    "vdup.f32   q11, %7             \n" //q11 scale_out
                    "0:                             \n"
                    // top_s32 -> top_f32
                    "vcvt.f32.s32 q0, q0            \n"
                    "vcvt.f32.s32 q1, q1            \n"
                    // top_f32 = top_f32 * scale_int
                    "vmul.f32   q0, q0, q10         \n"
                    "vmul.f32   q1, q1, q10         \n"
                    // top_f32 = top_f32 * scale_out
                    "vmul.f32   q0, q0, q11         \n"
                    "vmul.f32   q1, q1, q11         \n"
                    // top_f32 -> top_s32
                    "vcvtr.s32.f32 s0, s0           \n"
                    "vcvtr.s32.f32 s1, s1           \n"
                    "vcvtr.s32.f32 s2, s2           \n"
                    "vcvtr.s32.f32 s3, s3           \n"
                    "vcvtr.s32.f32 s4, s4           \n"
                    "vcvtr.s32.f32 s5, s5           \n"
                    "vcvtr.s32.f32 s6, s6           \n"
                    "vcvtr.s32.f32 s7, s7           \n" 
                    // top_s32 -> top_s16
                    "vqmovn.s32 d4, q0              \n"
                    "vqmovn.s32 d5, q1              \n"
                    "pld        [%1, #256]          \n"
                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
                    // top_s16 -> top_s8
                    "vqmovn.s16   d4, q2            \n"
                    // save top_s8
                    "vst1.8     {d4}, [%2:64]!      \n"
                    "subs       %0, #1              \n"
                    "bne        0b                  \n"
                    "sub        %1, #32             \n"
                    : "=r"(nn),         // %0
                      "=r"(intptr),     // %1
                      "=r"(ptr)         // %2
                    : "0"(nn),
                      "1"(intptr),
                      "2"(ptr),
                      "r"(scale_in),    // %6
                      "r"(scale_out)    // %7
                    : "cc", "memory", "q0", "q1", "q2", "q10", "q11"
                );
                } 
 #endif // __aarch64__      
 #else
                int remain = size;
 #endif // __ARM_NEON

                for (; remain > 0; remain--)
                {
                    *ptr = float2int8(*intptr * scale_in * scale_out);

                    intptr++;
                    ptr ++;
                }
            }
        }    
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/arm/requantize_arm.h
+++ b/src/layer/arm/requantize_arm.h
@@ -0,0 +1,30 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_REQUANTIZE_ARM_H
 #define LAYER_REQUANTIZE_ARM_H

 #include "requantize.h"

 namespace ncnn {

 // ARM-targeted variant of the Requantize layer. It adds no data members of
 // its own and only redeclares forward(); the definition lives outside this
 // header.
 class Requantize_arm : public Requantize
 {
 public:
    // Same signature as Requantize::forward(): reads bottom_blob and writes
    // top_blob, using the settings carried in opt.
    // NOTE(review): presumably returns 0 on success like the other layer
    // entry points - confirm against the implementation file.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 };

 } // namespace ncnn

 #endif // LAYER_REQUANTIZE_ARM_H
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -25,6 +25,7 @@ Convolution::Convolution()
    one_blob_only = true;
    support_inplace = false;
    support_vulkan = true;
    use_int8_requantize = false;

 #if NCNN_VULKAN
    padding = 0;
@@ -42,7 +43,6 @@ Convolution::Convolution()
 #endif // NCNN_VULKAN

    quantize = 0;
    dequantize = 0;
 }

 Convolution::~Convolution()
@@ -52,7 +52,14 @@ Convolution::~Convolution()
 #endif // NCNN_VULKAN

    delete quantize;
    delete dequantize;

    for (int i=0; i<(int)dequantize_ops.size(); i++)
        delete dequantize_ops[i];
    dequantize_ops.clear();

    for (int i=0; i<(int)requantize_ops.size(); i++)
        delete requantize_ops[i];
    requantize_ops.clear();
 }

 int Convolution::load_param(const ParamDict& pd)
@@ -113,10 +120,18 @@ int Convolution::load_model(const ModelBin& mb)

    if (int8_scale_term)
    {
        weight_data_int8_scale = mb.load(1, 1)[0];
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scale = mb.load(1, 1)[0];
    }

    for (int i=0; i<(int)dequantize_ops.size(); i++)
        delete dequantize_ops[i];
    dequantize_ops.clear();

    for (int i=0; i<(int)requantize_ops.size(); i++)
        delete requantize_ops[i];
    requantize_ops.clear();

    bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
    bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

@@ -126,27 +141,39 @@ int Convolution::load_model(const ModelBin& mb)
        return -1;
    }

    // runtime quantize the weight data
    if (weight_data_is_float32 && use_int8_inference)
    {
        // quantize weight to int8
        Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
        Mat int8_weight_data(weight_data_size, (size_t)1u);
        if (int8_weight_data.empty())
            return -100;

        ncnn::ParamDict pd;
        pd.set(0, weight_data_int8_scale);// scale
        const int weight_data_size_output = weight_data_size / num_output;

        for (int n=0; n<num_output; n++)
        {
            Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);

        op->load_param(pd);
            ncnn::ParamDict pd;
            pd.set(0, weight_data_int8_scales[n]);// scale

        Mat int8_weight_data;
        op->forward(weight_data, int8_weight_data);
            op->load_param(pd);

        delete op;
            ncnn::Option opt = ncnn::get_default_option();
            opt.blob_allocator = int8_weight_data.allocator;

        if (int8_weight_data.empty())
            return -100;
            const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
            Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
            op->forward(weight_data_n, int8_weight_data_n, opt);

            delete op;
        }

        weight_data = int8_weight_data;
    }

    // initial the quantize,dequantize op layer
    if (use_int8_inference)
    {
        quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
@@ -157,22 +184,74 @@ int Convolution::load_model(const ModelBin& mb)
            quantize->load_param(pd);
        }

        dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
        dequantize_ops.resize(num_output);
        for (int n=0; n<num_output; n++)
        {
            float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
            dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);

            float top_rescale = 1.f;

            if (weight_data_int8_scales[n] == 0)
                top_rescale = 0;
            else
                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);

            ncnn::ParamDict pd;
            pd.set(0, top_rescale);// scale
            pd.set(1, bias_term);// bias_term
            pd.set(2, num_output);// bias_data_size
            pd.set(1, bias_term);  // bias_term
            pd.set(2, 1);          // bias_data_size

            dequantize->load_param(pd);
            dequantize_ops[n]->load_param(pd);

            ncnn::Mat weights[1];
            weights[0] = bias_data;
            weights[0] = bias_data.range(n, 1);

            dequantize->load_model(ModelBinFromMatArray(weights));
            dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
        }
    }

    return 0;
 }

 int Convolution::create_requantize_op(void)
 {
    if (!use_int8_requantize)
    {
        fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
        return -1;
    }

    requantize_ops.resize(num_output);
    for (int n=0; n<num_output; n++)
    {
        requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize);

        float scale_in = 1.f;
        float scale_out = 1.f;

        if (weight_data_int8_scales[n] == 0)
        {
            scale_in = 0;
        }
        else
        {
            scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
        }

        scale_out = top_blob_int8_scale;

        ncnn::ParamDict pd;
        pd.set(0, scale_in);   // scale in
        pd.set(1, scale_out);  // scale_out
        pd.set(2, bias_term);  // bias_term
        pd.set(3, 1);          // bias_data_size

        requantize_ops[n]->load_param(pd);

        ncnn::Mat weights[1];
        weights[0] = bias_data.range(n, 1);

        requantize_ops[n]->load_model(ModelBinFromMatArray(weights));
    }

    return 0;
@@ -210,7 +289,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op

            if (int8_scale_term)
            {
                weights[2] = Mat(1, (size_t)4u, (void*)&weight_data_int8_scale);
                weights[2] = weight_data_int8_scales;
                weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale);
            }

@@ -309,50 +388,118 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op

    if (use_int8_inference)
    {
        // num_output
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=0; p<num_output; p++)
        if (use_int8_requantize == true)
        {
            int* outptr = top_blob.channel(p);
            Mat top_blob_tm;
            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
            if (top_blob_tm.empty())
                return -100;
            
            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
            if (top_blob.empty())
                return -100; 

            for (int i = 0; i < outh; i++)
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                for (int j = 0; j < outw; j++)
                {
                    int sum = 0;
                int* outptr = top_blob_tm.channel(p);

                    const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                    // channels
                    for (int q=0; q<channels; q++)
                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        const Mat m = bottom_blob_bordered.channel(q);
                        const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
                        int sum = 0;

                        const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                        for (int k = 0; k < maxk; k++)
                        // channels
                        for (int q=0; q<channels; q++)
                        {
                            int val = sptr[ space_ofs[k] ];
                            int w = kptr[k];
                            sum += val * w;
                            const Mat m = bottom_blob_bordered.channel(q);
                            const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                int val = sptr[ space_ofs[k] ];
                                int w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        kptr += maxk;
                        outptr[j] = sum;
                    }

                    outptr[j] = sum;
                    outptr += outw;
                }

                outptr += outw;
                // requantize, reverse scale inplace
                {
                    ncnn::Option opt_g = opt;
                    opt_g.num_threads = 1;
                    opt_g.blob_allocator = top_blob.allocator;

                    Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
                    Mat top_blob_g = top_blob.channel_range(p, 1);
                    requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
                }                        
            }
        }

        // dequantize, reverse scale inplace
        else
        {
            ncnn::Option opt_g = opt;
            opt_g.blob_allocator = top_blob.allocator;
            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
            if (top_blob.empty())
                return -100;
      
            // num_output
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                int* outptr = top_blob.channel(p);

            dequantize->forward_inplace(top_blob, opt_g);
        }
                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        int sum = 0;

                        const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                        // channels
                        for (int q=0; q<channels; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(q);
                            const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                int val = sptr[ space_ofs[k] ];
                                int w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        outptr[j] = sum;
                    }

                    outptr += outw;
                }

                // dequantize, reverse scale inplace
                {
                    ncnn::Option opt_g = opt;
                    opt_g.num_threads = 1;
                    opt_g.blob_allocator = top_blob.allocator;

                    Mat top_blob_g = top_blob.channel_range(p, 1);
                    dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
                }          
            }   
        }        

        return 0;
    }
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -29,6 +29,8 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int create_requantize_op(void);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 #if NCNN_VULKAN
@@ -91,13 +93,16 @@ public:
    Pipeline* pipeline_innerproduct_pack4to1;
 #endif // NCNN_VULKAN

    float weight_data_int8_scale;
    Mat weight_data_int8_scales;
    float bottom_blob_int8_scale;
    float top_blob_int8_scale;

    bool use_int8_inference;
    bool use_int8_requantize;

    ncnn::Layer* quantize;
    ncnn::Layer* dequantize;
    std::vector<ncnn::Layer*> dequantize_ops;
    std::vector<ncnn::Layer*> requantize_ops;
 };

 } // namespace ncnn
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -25,6 +25,7 @@ ConvolutionDepthWise::ConvolutionDepthWise()
    one_blob_only = true;
    support_inplace = false;
    support_vulkan = true;
    use_int8_requantize = false;

 #if NCNN_VULKAN
    padding = 0;
@@ -58,6 +59,11 @@ ConvolutionDepthWise::~ConvolutionDepthWise()
        delete dequantize_ops[i];

    dequantize_ops.clear();

    for (int i=0; i<(int)requantize_ops.size(); i++)
        delete requantize_ops[i];

    requantize_ops.clear();    
 }

 int ConvolutionDepthWise::load_param(const ParamDict& pd)
@@ -150,7 +156,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
    if (int8_scale_term == 1)
    {
        weight_data_int8_scales = mb.load(group, 1);
        bottom_blob_int8_scales = mb.load(group, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }
    else if (int8_scale_term == 2)
    {
@@ -177,6 +187,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)

    dequantize_ops.clear();

    for (int i=0; i<(int)requantize_ops.size(); i++)
        delete requantize_ops[i];

    requantize_ops.clear();    

    bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
    bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);

@@ -236,7 +251,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
        {
            dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);

            float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
            float top_rescale = 1.f;
            if (weight_data_int8_scales[g] == 0)
                top_rescale = 0;
            else
                top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

            ncnn::ParamDict pd;
            pd.set(0, top_rescale);// scale
@@ -255,6 +274,50 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
    return 0;
 }

 int ConvolutionDepthWise::create_requantize_op(void)
 {
    if (!use_int8_requantize)
    {
        fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
        return -1;
    }

    requantize_ops.resize(group);
    for (int g=0; g<group; g++)
    {
        requantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Requantize);

        float scale_in = 1.f;
        float scale_out = 1.f;

        if (weight_data_int8_scales[g] == 0)
        {
            scale_in = 0;
        }
        else
        {
            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
        }

        scale_out = top_blob_int8_scale;

        ncnn::ParamDict pd;
        pd.set(0, scale_in);   // scale in
        pd.set(1, scale_out);  // scale_out
        pd.set(2, bias_term);  // bias_term
        pd.set(3, 1);          // bias_data_size

        requantize_ops[g]->load_param(pd);

        ncnn::Mat weights[1];
        weights[0] = bias_data.range(g, 1);

        requantize_ops[g]->load_model(ModelBinFromMatArray(weights));
    }

    return 0;
 }

 int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
    // convolv with NxN kernel
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -29,6 +29,8 @@ public:

    virtual int load_model(const ModelBin& mb);

    virtual int create_requantize_op(void);

    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 #if NCNN_VULKAN
@@ -92,11 +94,14 @@ public:

    Mat weight_data_int8_scales;
    Mat bottom_blob_int8_scales;
    float top_blob_int8_scale;

    bool use_int8_inference;
    bool use_int8_requantize;

    std::vector<ncnn::Layer*> quantize_ops;
    std::vector<ncnn::Layer*> dequantize_ops;
    std::vector<ncnn::Layer*> requantize_ops;
 };

 } // namespace ncnn
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -36,7 +36,6 @@ InnerProduct::InnerProduct()
 #endif // NCNN_VULKAN

    quantize = 0;
    dequantize = 0;
 }

 InnerProduct::~InnerProduct()
@@ -46,7 +45,11 @@ InnerProduct::~InnerProduct()
 #endif // NCNN_VULKAN

    delete quantize;
    delete dequantize;

    for (int i=0; i<(int)dequantize_ops.size(); i++)
        delete dequantize_ops[i];

    dequantize_ops.clear();
 }

 int InnerProduct::load_param(const ParamDict& pd)
@@ -92,7 +95,7 @@ int InnerProduct::load_model(const ModelBin& mb)

    if (int8_scale_term)
    {
        weight_data_int8_scale = mb.load(1, 1)[0];
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scale = mb.load(1, 1)[0];
    }

@@ -105,25 +108,71 @@ int InnerProduct::load_model(const ModelBin& mb)
        return -1;
    }

    // initial the quantize,dequantize op layer
    if (use_int8_inference)
    {
        quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
        dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
        {
            ncnn::ParamDict pd;
            pd.set(0, bottom_blob_int8_scale);// scale

            quantize->load_param(pd);
        }

        dequantize_ops.resize(num_output);
        for (int n=0; n<num_output; n++)
        {
            dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);

            float top_rescale = 1.f;

            if (weight_data_int8_scales[n] == 0)
                top_rescale = 0;
            else
                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);

            ncnn::ParamDict pd;
            pd.set(0, top_rescale);// scale
            pd.set(1, bias_term);  // bias_term
            pd.set(2, 1);          // bias_data_size

            dequantize_ops[n]->load_param(pd);

            ncnn::Mat weights[1];
            weights[0] = bias_data.range(n, 1);

            dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
        }
    }

    // runtime quantize the weight data
    if (weight_data_is_float32 && use_int8_inference)
    {
        // quantize weight to int8
        ncnn::ParamDict pd;
        pd.set(0, weight_data_int8_scale);// scale
        Mat int8_weight_data(weight_data_size, (size_t)1u);
        if (int8_weight_data.empty())
            return -100;

        quantize->load_param(pd);
        const int weight_data_size_output = weight_data_size / num_output;

        Mat int8_weight_data;
        quantize->forward(weight_data, int8_weight_data);
        for (int n=0; n<num_output; n++)
        {
            Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);

        if (int8_weight_data.empty())
            return -100;
            ncnn::ParamDict pd;
            pd.set(0, weight_data_int8_scales[n]);// scale

            op->load_param(pd);

            ncnn::Option opt = ncnn::get_default_option();
            opt.blob_allocator = int8_weight_data.allocator;

            const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
            Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
            op->forward(weight_data_n, int8_weight_data_n, opt);

            delete op;
        }

        weight_data = int8_weight_data;
    }
@@ -152,12 +201,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o

        // quantize, scale and round to nearest
        {
            ncnn::ParamDict pd;
            pd.set(0, bottom_blob_int8_scale);// scale
            ncnn::Option opt_g = opt;
            opt_g.blob_allocator = bottom_blob_int8.allocator;

            quantize->load_param(pd);

            quantize->forward(bottom_blob, bottom_blob_int8, opt);
            quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
        }

        // num_output
@@ -179,26 +226,24 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
                }
            }

            out[p] = sum;
            out[p] = sum;       
        }

        // dequantize, reverse scale inplace
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=0; p<num_output; p++)
        {
            float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);

            ncnn::ParamDict pd;
            pd.set(0, top_rescale);// scale
            pd.set(1, bias_term);// bias_term
            pd.set(2, num_output);// bias_data_size

            dequantize->load_param(pd);

            ncnn::Mat weights[1];
            weights[0] = bias_data;

            dequantize->load_model(ModelBinFromMatArray(weights));

            dequantize->forward_inplace(top_blob, opt);
            int* out_s32 = top_blob;
            float* out_f32 = top_blob;
            float top_rescale = 1.f;
            if (weight_data_int8_scales[p] == 0)
                top_rescale = 0;
            else
                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);

            if (bias_term)
                out_f32[p] = out_s32[p] * top_rescale + bias_data[p];
            else
                out_f32[p] = out_s32[p] * top_rescale;
        }

        return 0;
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -76,13 +76,13 @@ public:
    Pipeline* pipeline_innerproduct_pack4to1;
 #endif // NCNN_VULKAN

    float weight_data_int8_scale;
    Mat weight_data_int8_scales;
    float bottom_blob_int8_scale;

    bool use_int8_inference;

    ncnn::Layer* quantize;
    ncnn::Layer* dequantize;
    std::vector<ncnn::Layer*> dequantize_ops;
 };

 } // namespace ncnn
--- a/src/layer/relu.cpp
+++ b/src/layer/relu.cpp
@@ -38,8 +38,51 @@ int ReLU::load_param(const ParamDict& pd)
    return 0;
 }

 // Multiply one negative int8 activation by the leaky-ReLU slope, rounding to
 // the nearest integer (halves away from zero) and saturating to [-128, 127].
 static inline signed char relu_int8_leaky(signed char v, float slope)
 {
    float scaled = v * slope;

    // saturate before converting so an oversized slope cannot overflow the int cast
    if (scaled >= 127.f) return 127;
    if (scaled <= -128.f) return -128;

    // round half away from zero by hand, so no extra header is required
    return (signed char)(int)(scaled < 0.f ? scaled - 0.5f : scaled + 0.5f);
 }

 // In-place ReLU on a blob of signed 8-bit values.
 //
 // Every channel of bottom_top_blob is walked as w*h signed chars.
 //   slope == 0 : negative values are replaced by 0.
 //   slope != 0 : negative values are multiplied by slope (leaky ReLU).
 //                Previously this case was an unimplemented TODO, so the blob
 //                was returned untouched even though 0 (success) was reported.
 //
 // NOTE(review): scaling the raw int8 value assumes the blob is symmetrically
 // quantized (real value = int8 / scale, no zero point), so the scale is the
 // same before and after - confirm against the quantization scheme in use.
 //
 // Always returns 0.
 int ReLU::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
 {
    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
    int size = w * h;

    if (slope == 0.f)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            signed char* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }
    else
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<channels; q++)
        {
            signed char* ptr = bottom_top_blob.channel(q);

            for (int i=0; i<size; i++)
            {
                if (ptr[i] < 0)
                    ptr[i] = relu_int8_leaky(ptr[i], slope);
            }
        }
    }

    return 0;
 }

 int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
    if (bottom_top_blob.elemsize == 1u)
        return ReLU::forward_inplace_int8(bottom_top_blob, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
--- a/src/layer/relu.h
+++ b/src/layer/relu.h
@@ -27,6 +27,7 @@ public:
    virtual int load_param(const ParamDict& pd);

    virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
    virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;

 #if NCNN_VULKAN
    virtual int create_pipeline();
--- a/src/layer/requantize.cpp
+++ b/src/layer/requantize.cpp
@@ -0,0 +1,195 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #include "requantize.h"

 #include <math.h>

 namespace ncnn {

 DEFINE_LAYER_CREATOR(Requantize)

 // Configures the layer as single-input / single-output and not in-place, and
 // gives every scalar parameter a defined value so that the object never holds
 // indeterminate members before load_param() runs. The values are the same
 // literals load_param() passes as fallbacks to pd.get().
 Requantize::Requantize()
 {
    one_blob_only = true;
    support_inplace = false;

    scale_in = 1.f;
    scale_out = 1.f;
    bias_term = 0;
    bias_data_size = 0;
    fusion_relu = false;
 }

 // Rounds v to the nearest integer (halfway cases away from zero, as round()
 // does) and saturates the result to the int8 range [-128, 127].
 //
 // The range check is done on the float value *before* converting to an
 // integer: converting an out-of-range floating-point value to int is
 // undefined behaviour, so huge magnitudes must never reach the cast.
 // NaN has no meaningful quantized value and is mapped to 0.
 static inline signed char float2int8(float v)
 {
    if (v != v) return 0; // NaN
    if (v >= 127.f) return 127;
    if (v <= -128.f) return -128;

    // here -128 < v < 127, so the rounded value always fits in int8
    return (signed char)(int)round(v);
 }

 // Reads the layer parameters from pd by id:
 //   0 -> scale_in, 1 -> scale_out, 2 -> bias_term,
 //   3 -> bias_data_size, 4 -> fusion_relu
 // The second argument of each pd.get() call is the fallback literal.
 // Always returns 0.
 int Requantize::load_param(const ParamDict& pd)
 {
    // scales
    scale_in = pd.get(0, 1.f);     // bottom_blob_scale * weight_scale
    scale_out = pd.get(1, 1.f);    // top_blob_scale

    // bias description
    bias_term = pd.get(2, 0);
    bias_data_size = pd.get(3, 0);

    // any non-zero value enables the fused ReLU
    fusion_relu = pd.get(4, 0) != 0;

    return 0;
 }

 // Loads bias_data (bias_data_size values) from mb when bias_term is set.
 // Returns 0 on success, or -100 when the loaded bias blob is empty.
 int Requantize::load_model(const ModelBin& mb)
 {
    // nothing to load for a bias-free layer
    if (!bias_term)
        return 0;

    bias_data = mb.load(bias_data_size, 1);

    return bias_data.empty() ? -100 : 0;
 }

 int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 { 
    int dims = bottom_blob.dims;

    if (dims == 1)
    {
        int w = bottom_blob.w;

        const int* intptr = bottom_blob;
        signed char * ptr = top_blob;

        if (bias_term)
        {
            if (bias_data_size > 1)
            {
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
            else
            {
                float bias = bias_data[0];
                #pragma omp parallel for num_threads(opt.num_threads)
                for (int i=0; i<w; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<w; i++)
            {
                ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
                if (fusion_relu && ptr[i] < 0)
                    ptr[i] = 0;
            }
        }
    }

    if (dims == 2)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i=0; i<h; i++)
            {
                const int* intptr = bottom_blob.row<const int>(i);
                signed char* ptr = top_blob.row<signed char>(i);

                for (int j=0; j<w; j++)
                {
                    ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
                    if (fusion_relu && ptr[j] < 0)
                        ptr[j] = 0;
                }
            }
        }
    }

    if (dims == 3)
    {
        int w = bottom_blob.w;
        int h = bottom_blob.h;
        int channels = bottom_blob.c;
        int size = w * h;      

        if (bias_term)
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];

                for (int i=0; i<size; i++)
                {
                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
        }
        else
        {
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q=0; q<channels; q++)
            {
                const int* intptr = bottom_blob.channel(q);
                signed char* ptr = top_blob.channel(q);

                for (int i=0; i<size; i++)
                {
                    ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
                    if (fusion_relu && ptr[i] < 0)
                        ptr[i] = 0;
                }
            }
        }    
    }

    return 0;
 }

 } // namespace ncnn
--- a/src/layer/requantize.h
+++ b/src/layer/requantize.h
@@ -0,0 +1,46 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #ifndef LAYER_REQUANTIZE_H
 #define LAYER_REQUANTIZE_H

 #include "layer.h"

 namespace ncnn {

 // Layer that converts an int32 blob into an int8 blob. forward() computes
 // (in * scale_in + bias) * scale_out for every element, rounds and saturates
 // the result to int8, and optionally clamps negatives to 0 (fusion_relu).
 class Requantize : public Layer
 {
 public:
    Requantize();

    // Reads scale_in, scale_out, bias_term, bias_data_size and fusion_relu
    // from parameter ids 0..4.
    virtual int load_param(const ParamDict& pd);

    // Loads bias_data when bias_term is set; returns -100 if it comes back empty.
    virtual int load_model(const ModelBin& mb);

    // Writes the requantized int8 result of bottom_blob into top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;

 public:
    // Factor applied to each int32 input value before the bias is added.
    // Described by the original author as bottom_blob_scale * weight_scale.
    float scale_in;
    // Factor applied after the bias has been added.
    // NOTE(review): the comments in this layer's sources describe it
    // inconsistently (top_blob_scale vs.
    // top_blob_scale / (bottom_blob_scale * weight_scale)) - confirm which
    // convention the model converter uses.
 	float scale_out;
    // Non-zero when a bias is added between the two scaling steps.
    int bias_term;
    // Number of values in bias_data; when it is greater than 1, forward()
    // indexes bias_data per element (1-D), per row (2-D) or per channel (3-D),
    // otherwise bias_data[0] is used everywhere.
    int bias_data_size;

    // When true, negative outputs are replaced by 0 after quantization.
    bool fusion_relu;

    // Bias values loaded by load_model().
    Mat bias_data;
 };

 } // namespace ncnn

 #endif // LAYER_REQUANTIZE_H
--- a/src/layer/x86/convolution_3x3.h
+++ b/src/layer/x86/convolution_3x3.h
@@ -1,6 +1,7 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
@@ -138,3 +139,496 @@ static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
    }

 }

 static void conv3x3s1_winograd23_transform_kernel_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
 {
    kernel_tm.create(4*4, inch, outch);

    // G
    const float ktm[4][3] = {
        {   1.0f,     0.0f,     0.0f},
        { 1.0f/2,   1.0f/2,   1.0f/2},
        { 1.0f/2,  -1.0f/2,   1.0f/2},
        {   0.0f,     0.0f,     1.0f}
    };

    #pragma omp parallel for
    for (int p = 0; p<outch; p++)
    {
        for (int q = 0; q<inch; q++)
        {
            const float* kernel0 = (const float*)kernel + p*inch * 9 + q * 9;
            float* kernel_tm0 = kernel_tm.channel(p).row(q);

            // transform kernel
            const float* k0 = kernel0;
            const float* k1 = kernel0 + 3;
            const float* k2 = kernel0 + 6;

            // h
            float tmp[4][3];
            for (int i=0; i<4; i++)
            {
                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j=0; j<4; j++)
            {
                float* tmpp = &tmp[j][0];

                for (int i=0; i<4; i++)
                {
                    kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }
 }

 // 3x3 stride-1 float convolution using Winograd F(2,3), plain C++ (no intrinsics).
 //
 // bottom_blob : input blob, read as float
 // top_blob    : output blob; its w/h/c are read here and it receives the cropped result
 // kernel_tm   : transformed kernels, indexed as channel(outch).row(inch) with 16 floats
 //               per row (the layout produced by conv3x3s1_winograd23_transform_kernel_sse)
 // _bias       : per-output-channel bias, may be empty (then 0 is used)
 // opt         : provides num_threads, workspace_allocator and blob_allocator
 //
 // Steps:
 //   1. pad the input on the bottom/right so that the output size is even
 //   2. transform every 4x4 input tile (tiles advance by 2 pixels) into 16 values
 //   3. per tile, accumulate the element-wise products with the transformed
 //      kernels over all input channels
 //   4. transform every 16-value result back into a 2x2 output tile and add the bias
 //   5. crop the padded output into top_blob
 static void conv3x3s1_winograd23_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 2n+2, winograd F(2,3)
    Mat bottom_blob_bordered = bottom_blob;

    // round the output size up to a multiple of 2 (one tile yields 2x2 outputs)
    outw = (outw + 1) / 2 * 2;
    outh = (outh + 1) / 2 * 2;

    // padded input size needed for that output size
    w = outw + 2;
    h = outh + 2;
    // zero padding is added on the bottom and right edges only
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);

    // null when _bias is empty; checked before use in the output transform
    const float* bias = _bias;    

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        // number of tiles along the vertical / horizontal direction
        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        const int tiles = nColBlocks * nRowBlocks;

        // one row of 16 floats per tile, one channel per input channel
        bottom_blob_tm.create(4*4, tiles, inch, 4u, opt.workspace_allocator);

        // BT
        // const float itm[4][4] = {
        //     {1.0f,  0.0f, -1.0f,  0.0f},
        //     {0.0f,  1.0f,  1.00f, 0.0f},
        //     {0.0f, -1.0f,  1.00f, 0.0f},
        //     {0.0f, -1.0f,  0.00f, 1.0f}
        // };        

        for (int q=0; q<inch; q++)
        {
            const float* img = bottom_blob_bordered.channel(q);
            float* out_tm0 = bottom_blob_tm.channel(q);

            for (int j = 0; j < nColBlocks; j++)
            {
                // four consecutive input rows of the current tile row
                const float* r0 = img + w * j * 2;
                const float* r1 = r0 + w;
                const float* r2 = r1 + w;
                const float* r3 = r2 + w;

                for (int i = 0; i < nRowBlocks; i++)
                {
                    float d0[4],d1[4],d2[4],d3[4];
                    float w0[4],w1[4],w2[4],w3[4];
                    float t0[4],t1[4],t2[4],t3[4];
                    // load
                    for (int n = 0; n < 4; n++)
                    {
                        d0[n] = r0[n];
                        d1[n] = r1[n];
                        d2[n] = r2[n];
                        d3[n] = r3[n];
                    }                                  
                    // w = B_t * d
                    for (int n = 0; n < 4; n++)
                    {   
                        w0[n] = d0[n] - d2[n];
                        w1[n] = d1[n] + d2[n];
                        w2[n] = d2[n] - d1[n];
                        w3[n] = d3[n] - d1[n];
                    }                                
                    // transpose w into t
                    {
                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
                    }
                    // d = B_t * t (d is reused to hold the transformed tile)
                    for (int n = 0; n < 4; n++)
                    {   
                        d0[n] = t0[n] - t2[n];
                        d1[n] = t1[n] + t2[n];
                        d2[n] = t2[n] - t1[n];
                        d3[n] = t3[n] - t1[n];
                    }
                    // save to out_tm
                    for (int n = 0; n < 4; n++)
                    {
                        out_tm0[n   ] = d0[n];
                        out_tm0[n+ 4] = d1[n];
                        out_tm0[n+ 8] = d2[n];
                        out_tm0[n+12] = d3[n];
                    }                  

                    // neighbouring tiles overlap: advance by 2 input columns
                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;

                    out_tm0 += 16;
                }
            }
        }
    }
    // the padded input is no longer needed
    bottom_blob_bordered = Mat();

    // BEGIN dot
    Mat top_blob_tm;
    {
        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        // number of tiles along the vertical / horizontal direction
        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        const int tiles = nColBlocks * nRowBlocks; 

        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);

        // output channels are processed four at a time, the rest one by one
        int nn_outch = outch >> 2;
        int remain_outch_start = nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            int p = pp * 4;

            Mat out0_tm = top_blob_tm.channel(p);
            Mat out1_tm = top_blob_tm.channel(p+1);
            Mat out2_tm = top_blob_tm.channel(p+2);
            Mat out3_tm = top_blob_tm.channel(p+3);

            const Mat kernel0_tm = kernel_tm.channel(p);
            const Mat kernel1_tm = kernel_tm.channel(p+1);
            const Mat kernel2_tm = kernel_tm.channel(p+2);
            const Mat kernel3_tm = kernel_tm.channel(p+3);

            for (int i=0; i<tiles; i++)
            {
                float* output0_tm = out0_tm.row(i);
                float* output1_tm = out1_tm.row(i);
                float* output2_tm = out2_tm.row(i);
                float* output3_tm = out3_tm.row(i);

                float sum0[16] = {0.0f};
                float sum1[16] = {0.0f};
                float sum2[16] = {0.0f};
                float sum3[16] = {0.0f};

                int q = 0;
                // four input channels per iteration
                for (; q+3<inch; q+=4)
                {   
                    const float* r0 = bottom_blob_tm.channel(q).row(i);
                    const float* r1 = bottom_blob_tm.channel(q+1).row(i);
                    const float* r2 = bottom_blob_tm.channel(q+2).row(i);
                    const float* r3 = bottom_blob_tm.channel(q+3).row(i);

                    const float* k0 = kernel0_tm.row(q);
                    const float* k1 = kernel1_tm.row(q);
                    const float* k2 = kernel2_tm.row(q);
                    const float* k3 = kernel3_tm.row(q);

                    for (int n=0; n<16; n++)
                    {
                        // each "k += 16" steps to the kernel row of the next input
                        // channel (this relies on consecutive rows being 16 floats
                        // apart); "k -= 16 * 3" returns to row q
                        sum0[n] += r0[n] * k0[n];
                        k0 += 16;
                        sum0[n] += r1[n] * k0[n];
                        k0 += 16;
                        sum0[n] += r2[n] * k0[n];
                        k0 += 16;
                        sum0[n] += r3[n] * k0[n];
                        k0 -= 16 * 3;

                        sum1[n] += r0[n] * k1[n];
                        k1 += 16;
                        sum1[n] += r1[n] * k1[n];
                        k1 += 16;
                        sum1[n] += r2[n] * k1[n];
                        k1 += 16;
                        sum1[n] += r3[n] * k1[n];
                        k1 -= 16 * 3;

                        sum2[n] += r0[n] * k2[n];
                        k2 += 16;
                        sum2[n] += r1[n] * k2[n];
                        k2 += 16;
                        sum2[n] += r2[n] * k2[n];
                        k2 += 16;
                        sum2[n] += r3[n] * k2[n];
                        k2 -= 16 * 3;

                        sum3[n] += r0[n] * k3[n];
                        k3 += 16;
                        sum3[n] += r1[n] * k3[n];
                        k3 += 16;
                        sum3[n] += r2[n] * k3[n];
                        k3 += 16;
                        sum3[n] += r3[n] * k3[n];
                        k3 -= 16 * 3;
                    }
                }

                // remaining input channels, one at a time
                for (; q<inch; q++)
                {
                    const float* r0 = bottom_blob_tm.channel(q).row(i);

                    const float* k0 = kernel0_tm.row(q);
                    const float* k1 = kernel1_tm.row(q);
                    const float* k2 = kernel2_tm.row(q);
                    const float* k3 = kernel3_tm.row(q);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += r0[n] * k0[n];
                        sum1[n] += r0[n] * k1[n];
                        sum2[n] += r0[n] * k2[n];
                        sum3[n] += r0[n] * k3[n];
                    }
                }

                for (int n=0; n<16; n++)
                {
                    output0_tm[n] = sum0[n];
                    output1_tm[n] = sum1[n];
                    output2_tm[n] = sum2[n];
                    output3_tm[n] = sum3[n];
                }
            }
        }

        // output channels left over after the groups of four
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=remain_outch_start; p<outch; p++)
        {
            Mat out0_tm = top_blob_tm.channel(p);
            const Mat kernel0_tm = kernel_tm.channel(p);

            for (int i=0; i<tiles; i++)
            {
                float* output0_tm = out0_tm.row(i);

                float sum0[16] = {0.0f};

                int q = 0;
                // four input channels per iteration
                for (; q+3<inch; q+=4)
                {   
                    const float* r0 = bottom_blob_tm.channel(q).row(i);
                    const float* r1 = bottom_blob_tm.channel(q+1).row(i);
                    const float* r2 = bottom_blob_tm.channel(q+2).row(i);
                    const float* r3 = bottom_blob_tm.channel(q+3).row(i);

                    const float* k0 = kernel0_tm.row(q);
                    const float* k1 = kernel0_tm.row(q+1);
                    const float* k2 = kernel0_tm.row(q+2);
                    const float* k3 = kernel0_tm.row(q+3);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += r0[n] * k0[n];
                        sum0[n] += r1[n] * k1[n];
                        sum0[n] += r2[n] * k2[n];
                        sum0[n] += r3[n] * k3[n];
                    }
                }

                // remaining input channels, one at a time
                for (; q<inch; q++)
                {
                    const float* r0 = bottom_blob_tm.channel(q).row(i);
                    const float* k0 = kernel0_tm.row(q);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += r0[n] * k0[n];
                    }             
                }

                for (int n=0; n<16; n++)
                {
                    output0_tm[n] = sum0[n];
                }
            }
        }
    }
    // the transformed input is no longer needed
    bottom_blob_tm = Mat();
    // END dot

    // BEGIN transform output
    Mat top_blob_bordered;
    // padded output size (even in both directions); cropped at the end
    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
    {
        // AT
        // const float itm[2][4] = {
        //     {1.0f,  1.0f,  1.0f,  0.0f},
        //     {0.0f,  1.0f, -1.0f,  1.0f}
        // }; 

        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        // number of tiles along the vertical / horizontal direction
        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=0; p<outch; p++)
        {
            Mat out_tm = top_blob_tm.channel(p);
            Mat out = top_blob_bordered.channel(p);

            const float bias0 = bias ? bias[p] : 0.f;

            for (int j=0; j<nColBlocks; j++)
            {
                // each tile row produces two output rows
                float* outRow0 = out.row(j*2);
                float* outRow1 = out.row(j*2+1);

                for(int i=0; i<nRowBlocks; i++)
                {
                    float* out_tile = out_tm.row(j*nRowBlocks + i);

                    float s0[4],s1[4],s2[4],s3[4];
                    float w0[4],w1[4];
                    float d0[2],d1[2],d2[2],d3[2];
                    float o0[2],o1[2];
                    // load
                    for (int n = 0; n < 4; n++)
                    {
                        s0[n] = out_tile[n];
                        s1[n] = out_tile[n+ 4];
                        s2[n] = out_tile[n+ 8];
                        s3[n] = out_tile[n+12];
                    }
                    // w = A_T * W
                    for (int n = 0; n < 4; n++)
                    {
                        w0[n] = s0[n] + s1[n] + s2[n];
                        w1[n] = s1[n] - s2[n] + s3[n];
                    }
                    // transpose w to w_t
                    {
                        d0[0] = w0[0]; d0[1] = w1[0];
                        d1[0] = w0[1]; d1[1] = w1[1];
                        d2[0] = w0[2]; d2[1] = w1[2];
                        d3[0] = w0[3]; d3[1] = w1[3];
                    }
                    // Y = A_T * w_t, with the bias added to every output value
                    for (int n = 0; n < 2; n++)
                    {
                        o0[n] = d0[n] + d1[n] + d2[n] + bias0;
                        o1[n] = d1[n] - d2[n] + d3[n] + bias0;
                    }
                    // save the 2x2 result into the padded output blob
                    outRow0[0] = o0[0];
                    outRow0[1] = o0[1];
                    outRow1[0] = o1[0];
                    outRow1[1] = o1[1];

                    outRow0 += 2;
                    outRow1 += 2;      
                }
            }
        }        
    }
    // END transform output 

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
 }

 // Direct 3x3 stride-2 float convolution (plain C++ reference path).
 //
 // bottom_blob : input blob, read as float
 // top_blob    : output blob, must already have its final w/h/c; every channel
 //               is first filled with its bias and then accumulated into
 // _kernel     : outch * inch filters of 9 floats, filter (p, q) at offset (p*inch + q) * 9
 // _bias       : per-output-channel bias, may be empty (then 0 is used)
 // opt         : provides num_threads
 static void conv3x3s2_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Mat& _bias, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const float* kernel = _kernel;
    const float* bias = _bias;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < outch; p++)
    {
        Mat out = top_blob.channel(p);

        const float bias0 = bias ? bias[p] : 0.f;

        out.fill(bias0);

        for (int q = 0; q < inch; q++)
        {
            const float* img = bottom_blob.channel(q);
            const float* filter = kernel + (p*inch + q) * 9;

            float* dst = out;

            for (int i = 0; i < outh; i++)
            {
                // stride 2: output row i reads input rows 2*i, 2*i+1 and 2*i+2
                const float* row0 = img + w * (2 * i);
                const float* row1 = row0 + w;
                const float* row2 = row1 + w;

                for (int j = 0; j < outw; j++)
                {
                    // stride 2: output column j starts at input column 2*j
                    const float* a0 = row0 + 2 * j;
                    const float* a1 = row1 + 2 * j;
                    const float* a2 = row2 + 2 * j;

                    float sum = 0;

                    sum += a0[0] * filter[0];
                    sum += a0[1] * filter[1];
                    sum += a0[2] * filter[2];
                    sum += a1[0] * filter[3];
                    sum += a1[1] * filter[4];
                    sum += a1[2] * filter[5];
                    sum += a2[0] * filter[6];
                    sum += a2[1] * filter[7];
                    sum += a2[2] * filter[8];

                    dst[j] += sum;
                }

                dst += outw;
            }
        }
    }
 }
--- a/src/layer/x86/convolution_3x3_int8.h
+++ b/src/layer/x86/convolution_3x3_int8.h
@@ -11,12 +11,6 @@
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
 // Clamps v to the int16 range [-32768, 32767] and returns it as a short.
 static inline short saturate2int16(int v)
 {
    const int lowest = -32768;
    const int highest = 32767;

    const int clamped = v > highest ? highest : (v < lowest ? lowest : v);
    return (short)clamped;
 }

 static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
@@ -84,6 +78,424 @@ static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat
    }
 }

 static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
 {
    kernel_tm.create(4*4, inch, outch, 2ul);  

    // G
    const short ktm[4][3] = {
        {   2,     0,     0},
        {   1,     1,     1},
        {   1,    -1,     1},
        {   0,     0,     2}
    };

    #pragma omp parallel for
    for (int p = 0; p<outch; p++)
    {
        for (int q = 0; q<inch; q++)
        {
            const signed char* kernel0 = (const signed char*)kernel + p*inch * 9 + q * 9;
            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q);

            // transform kernel
            const signed char* k0 = kernel0;
            const signed char* k1 = kernel0 + 3;
            const signed char* k2 = kernel0 + 6;

            // h
            short tmp[4][3];
            for (int i=0; i<4; i++)
            {
                tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
                tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
                tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
            }

            // U
            for (int j=0; j<4; j++)
            {
                short* tmpp = &tmp[j][0];

                for (int i=0; i<4; i++)
                {
                    kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
                }
            }
        }
    }
 }

 static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
 {
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;
    int outch = top_blob.c;

    // pad to 2n+2, winograd F(2,3)
    Mat bottom_blob_bordered = bottom_blob;

    outw = (outw + 1) / 2 * 2;
    outh = (outh + 1) / 2 * 2;

    w = outw + 2;
    h = outh + 2;
    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);  

    // BEGIN transform input
    Mat bottom_blob_tm;
    {
        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        const int tiles = nColBlocks * nRowBlocks;

        bottom_blob_tm.create(4*4, tiles, inch, 2u, opt.workspace_allocator);

        // BT
        // const float itm[4][4] = {
        //     {1.0f,  0.0f, -1.0f,  0.0f},
        //     {0.0f,  1.0f,  1.00f, 0.0f},
        //     {0.0f, -1.0f,  1.00f, 0.0f},
        //     {0.0f, -1.0f,  0.00f, 1.0f}
        // };
        
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int q=0; q<inch; q++)
        {
            const signed char* img = bottom_blob_bordered.channel(q);
            short* out_tm0 = bottom_blob_tm.channel(q);

            for (int j = 0; j < nColBlocks; j++)
            {
                const signed char* r0 = img + w * j * 2;
                const signed char* r1 = r0 + w;
                const signed char* r2 = r1 + w;
                const signed char* r3 = r2 + w;

                for (int i = 0; i < nRowBlocks; i++)
                {
                    short d0[4],d1[4],d2[4],d3[4];
                    short w0[4],w1[4],w2[4],w3[4];
                    short t0[4],t1[4],t2[4],t3[4];
                    // load 
                    for (int n = 0; n < 4; n++)
                    {
                        d0[n] = r0[n];
                        d1[n] = r1[n];
                        d2[n] = r2[n];
                        d3[n] = r3[n];
                    }                                  
                    // w = B_t * d
                    for (int n = 0; n < 4; n++)
                    {   
                        w0[n] = d0[n] - d2[n];
                        w1[n] = d1[n] + d2[n];
                        w2[n] = d2[n] - d1[n];
                        w3[n] = d3[n] - d1[n];
                    }                                
                    // transpose d to d_t
                    {
                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
                    }
                    // U = B_t * d_t
                    for (int n = 0; n < 4; n++)
                    {   
                        d0[n] = t0[n] - t2[n];
                        d1[n] = t1[n] + t2[n];
                        d2[n] = t2[n] - t1[n];
                        d3[n] = t3[n] - t1[n];
                    }                
                    // save to out_tm
                    for (int n = 0; n < 4; n++)
                    {
                        out_tm0[n   ] = d0[n];
                        out_tm0[n+ 4] = d1[n];
                        out_tm0[n+ 8] = d2[n];
                        out_tm0[n+12] = d3[n];
                    }                  

                    r0 += 2;
                    r1 += 2;
                    r2 += 2;
                    r3 += 2;

                    out_tm0 += 16;
                }
            }
        }
    }
    bottom_blob_bordered = Mat();
    
    // BEGIN dot
    Mat top_blob_tm;
    {
        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        const int tiles = nColBlocks * nRowBlocks; 

        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);

        int nn_outch = outch >> 2;
        int remain_outch_start = nn_outch << 2;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            int p = pp * 4;

            Mat out0_tm = top_blob_tm.channel(p);
            Mat out1_tm = top_blob_tm.channel(p+1);
            Mat out2_tm = top_blob_tm.channel(p+2);
            Mat out3_tm = top_blob_tm.channel(p+3);

            const Mat kernel0_tm = kernel_tm.channel(p);
            const Mat kernel1_tm = kernel_tm.channel(p+1);
            const Mat kernel2_tm = kernel_tm.channel(p+2);
            const Mat kernel3_tm = kernel_tm.channel(p+3);

            for (int i=0; i<tiles; i++)
            {
                int* output0_tm = out0_tm.row<int>(i);
                int* output1_tm = out1_tm.row<int>(i);
                int* output2_tm = out2_tm.row<int>(i);
                int* output3_tm = out3_tm.row<int>(i);

                int sum0[16] = {0};
                int sum1[16] = {0};
                int sum2[16] = {0};
                int sum3[16] = {0};

                int q = 0;
                for (; q+3<inch; q+=4)
                {   
                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
                    const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
                    const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
                    const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);

                    const short* k0 = kernel0_tm.row<short>(q);
                    const short* k1 = kernel1_tm.row<short>(q);
                    const short* k2 = kernel2_tm.row<short>(q);
                    const short* k3 = kernel3_tm.row<short>(q);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += (int)r0[n] * k0[n];
                        k0 += 16;
                        sum0[n] += (int)r1[n] * k0[n];
                        k0 += 16;
                        sum0[n] += (int)r2[n] * k0[n];
                        k0 += 16;
                        sum0[n] += (int)r3[n] * k0[n];
                        k0 -= 16 * 3;

                        sum1[n] += (int)r0[n] * k1[n];
                        k1 += 16;
                        sum1[n] += (int)r1[n] * k1[n];
                        k1 += 16;
                        sum1[n] += (int)r2[n] * k1[n];
                        k1 += 16;
                        sum1[n] += (int)r3[n] * k1[n];
                        k1 -= 16 * 3;

                        sum2[n] += (int)r0[n] * k2[n];
                        k2 += 16;
                        sum2[n] += (int)r1[n] * k2[n];
                        k2 += 16;
                        sum2[n] += (int)r2[n] * k2[n];
                        k2 += 16;
                        sum2[n] += (int)r3[n] * k2[n];
                        k2 -= 16 * 3;

                        sum3[n] += (int)r0[n] * k3[n];
                        k3 += 16;
                        sum3[n] += (int)r1[n] * k3[n];
                        k3 += 16;
                        sum3[n] += (int)r2[n] * k3[n];
                        k3 += 16;
                        sum3[n] += (int)r3[n] * k3[n];
                        k3 -= 16 * 3;
                    }
                }

                for (; q<inch; q++)
                {
                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);

                    const short* k0 = kernel0_tm.row<short>(q);
                    const short* k1 = kernel1_tm.row<short>(q);
                    const short* k2 = kernel2_tm.row<short>(q);
                    const short* k3 = kernel3_tm.row<short>(q);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += (int)r0[n] * k0[n];
                        sum1[n] += (int)r0[n] * k1[n];
                        sum2[n] += (int)r0[n] * k2[n];
                        sum3[n] += (int)r0[n] * k3[n];
                    }
                }

                for (int n=0; n<16; n++)
                {
                    output0_tm[n] = sum0[n];
                    output1_tm[n] = sum1[n];
                    output2_tm[n] = sum2[n];
                    output3_tm[n] = sum3[n];
                }
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=remain_outch_start; p<outch; p++)
        {
            Mat out0_tm = top_blob_tm.channel(p);
            const Mat kernel0_tm = kernel_tm.channel(p);

            for (int i=0; i<tiles; i++)
            {
                int* output0_tm = out0_tm.row<int>(i);

                int sum0[16] = {0};

                int q = 0;
                for (; q+3<inch; q+=4)
                {   
                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
                    const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
                    const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
                    const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);

                    const short* k0 = kernel0_tm.row<short>(q);
                    const short* k1 = kernel0_tm.row<short>(q+1);
                    const short* k2 = kernel0_tm.row<short>(q+2);
                    const short* k3 = kernel0_tm.row<short>(q+3);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += (int)r0[n] * k0[n];
                        sum0[n] += (int)r1[n] * k1[n];
                        sum0[n] += (int)r2[n] * k2[n];
                        sum0[n] += (int)r3[n] * k3[n];
                    }
                }

                for (; q<inch; q++)
                {
                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
                    const short* k0 = kernel0_tm.row<short>(q);

                    for (int n=0; n<16; n++)
                    {
                        sum0[n] += (int)r0[n] * k0[n];
                    }             
                }

                for (int n=0; n<16; n++)
                {
                    output0_tm[n] = sum0[n];
                }
            }
        }
    }
    bottom_blob_tm = Mat();
    // END dot    

    // BEGIN transform output
    Mat top_blob_bordered;
    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
    {
        // AT
        // const float itm[2][4] = {
        //     {1.0f,  1.0f,  1.0f,  0.0f},
        //     {0.0f,  1.0f, -1.0f,  1.0f}
        // }; 

        int w_tm = outw / 2 * 4;
        int h_tm = outh / 2 * 4;

        int nColBlocks = h_tm/4; // may be the block num in Feathercnn
        int nRowBlocks = w_tm/4;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=0; p<outch; p++)
        {
            Mat out_tm = top_blob_tm.channel(p);
            Mat out = top_blob_bordered.channel(p);

            for (int j=0; j<nColBlocks; j++)
            {
                int* outRow0 = out.row<int>(j*2);
                int* outRow1 = out.row<int>(j*2+1);

                for(int i=0; i<nRowBlocks; i++)
                {
                    int* out_tile = out_tm.row<int>(j*nRowBlocks + i);

                    int s0[4],s1[4],s2[4],s3[4];
                    int w0[4],w1[4];
                    int d0[2],d1[2],d2[2],d3[2];
                    int o0[2],o1[2];
                    // load
                    for (int n = 0; n < 4; n++)
                    {
                        s0[n] = out_tile[n];
                        s1[n] = out_tile[n+ 4];
                        s2[n] = out_tile[n+ 8];
                        s3[n] = out_tile[n+12];
                    }
                    // w = A_T * W
                    for (int n = 0; n < 4; n++)
                    {
                        w0[n] = s0[n] + s1[n] + s2[n];
                        w1[n] = s1[n] - s2[n] + s3[n];
                    }
                    // transpose w to w_t
                    {
                        d0[0] = w0[0]; d0[1] = w1[0];
                        d1[0] = w0[1]; d1[1] = w1[1];
                        d2[0] = w0[2]; d2[1] = w1[2];
                        d3[0] = w0[3]; d3[1] = w1[3];
                    }
                    // Y = A_T * w_t
                    for (int n = 0; n < 2; n++)
                    {
                        o0[n] = d0[n] + d1[n] + d2[n];
                        o1[n] = d1[n] - d2[n] + d3[n];
                    }
                    // write the 2x2 result to top_blob_bordered; the >> 2 undoes the scaling introduced by G' = G*2
                    outRow0[0] = o0[0] >> 2;
                    outRow0[1] = o0[1] >> 2;
                    outRow1[0] = o1[0] >> 2;
                    outRow1[1] = o1[1] >> 2;

                    outRow0 += 2;
                    outRow1 += 2;           
                }
            }
        }        
    }
    // END transform output 

    // cut result pad
    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);  
 }

 static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    int w = bottom_blob.w;
@@ -122,23 +534,19 @@ static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat

                for (; remain > 0; remain--)
                {
                    short sum0 = 0;
                    short sum1 = 0;
                    short sum2 = 0;

                    sum0 += (short)r0[0] * kernel0[0];
                    sum0 += (short)r0[1] * kernel0[1];
                    sum0 += (short)r0[2] * kernel0[2];
                    sum1 += (short)r1[0] * kernel0[3];
                    sum1 += (short)r1[1] * kernel0[4];
                    sum1 += (short)r1[2] * kernel0[5];
                    sum2 += (short)r2[0] * kernel0[6];
                    sum2 += (short)r2[1] * kernel0[7];
                    sum2 += (short)r2[2] * kernel0[8];

                    *outptr0 = saturate2int16(*outptr0 + sum0);
                    *outptr0 = saturate2int16(*outptr0 + sum1);
                    *outptr0 = saturate2int16(*outptr0 + sum2);
                    int sum0 = 0;

                    sum0 += (int)r0[0] * kernel0[0];
                    sum0 += (int)r0[1] * kernel0[1];
                    sum0 += (int)r0[2] * kernel0[2];
                    sum0 += (int)r1[0] * kernel0[3];
                    sum0 += (int)r1[1] * kernel0[4];
                    sum0 += (int)r1[2] * kernel0[5];
                    sum0 += (int)r2[0] * kernel0[6];
                    sum0 += (int)r2[1] * kernel0[7];
                    sum0 += (int)r2[2] * kernel0[8];

                    *outptr0 += sum0;

                    r0 += 2;
                    r1 += 2;
--- a/src/layer/x86/convolution_5x5_int8.h
+++ b/src/layer/x86/convolution_5x5_int8.h
@@ -0,0 +1,35 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // 5x5, stride-1 int8 convolution: forwards to the shared im2col + sgemm
 // routine with the kernel size and stride fixed.
 static void conv5x5s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    const int ksize = 5;  // square kernel, used for both width and height
    const int step = 1;   // same stride on both axes

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt);
 }

 // 5x5, stride-2 int8 convolution: forwards to the shared im2col + sgemm
 // routine with the kernel size and stride fixed.
 static void conv5x5s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    const int ksize = 5;  // square kernel, used for both width and height
    const int step = 2;   // same stride on both axes

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt);
 }
--- a/src/layer/x86/convolution_7x7_int8.h
+++ b/src/layer/x86/convolution_7x7_int8.h
@@ -0,0 +1,35 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // 7x7, stride-1 int8 convolution: forwards to the shared im2col + sgemm
 // routine with the kernel size and stride fixed.
 static void conv7x7s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    const int ksize = 7;  // square kernel, used for both width and height
    const int step = 1;   // same stride on both axes

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt);
 }

 // 7x7, stride-2 int8 convolution: forwards to the shared im2col + sgemm
 // routine with the kernel size and stride fixed.
 static void conv7x7s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
    const int ksize = 7;  // square kernel, used for both width and height
    const int step = 2;   // same stride on both axes

    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel, ksize, ksize, step, step, opt);
 }
--- a/src/layer/x86/convolution_sgemm_int8.h
+++ b/src/layer/x86/convolution_sgemm_int8.h
@@ -0,0 +1,381 @@
 // SenseNets is pleased to support the open source community by supporting ncnn available.
 //
 // Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 // Int8 convolution for an arbitrary kernel size / stride, computed as im2col
 // followed by a plain-C matrix multiply with 32-bit accumulation.
 //
 //   bottom_blob  input feature map, read as signed char. No border handling is
 //                done here, so every (u + i*stride_h, v + j*stride_w) tap must
 //                already lie inside the blob.
 //   top_blob     destination, written as int. It is never created or resized
 //                here; only its w/h/c are read to size the computation.
 //   _kernel      signed char weights, one run of inch*kernel_w*kernel_h values
 //                per output channel.
 //   opt          supplies the workspace allocator for the three scratch
 //                buffers and the OpenMP thread count.
 static void conv_im2col_sgemm_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \
            const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
 {
    const int w = bottom_blob.w;
    const int inch = bottom_blob.c;

    const int outw = top_blob.w;
    const int outh = top_blob.h;
    const int outch = top_blob.c;

    const signed char* weights = _kernel;

    const int kernel_size = kernel_w * kernel_h;
    const int out_size = outw * outh;
    const int depth = kernel_size * inch; // length of one output dot product

    // output channels are processed four at a time, leftovers one by one
    const int nn_outch = outch >> 2;
    const int remain_outch_start = nn_outch << 2;

    // Step 1: im2col. Row (p*kernel_size + u*kernel_w + v) of bottom_im2col
    // holds, for every output position, the input sample seen by kernel tap
    // (u, v) of input channel p.
    Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator);
    {
        signed char* col_data = (signed char*)bottom_im2col;
        const int taps_per_channel = kernel_h*kernel_w*outw*outh;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=0; p<inch; p++)
        {
            const signed char* src = bottom_blob.channel(p);
            int dst_index = taps_per_channel * p;

            for (int u=0; u<kernel_h; u++)
            {
                for (int v=0; v<kernel_w; v++)
                {
                    for (int i=0; i<outh; i++)
                    {
                        // offset of the first tap of this output row
                        const int row_offset = (u + i * stride_h) * w + v;

                        for (int j=0; j<outw; j++)
                        {
                            col_data[dst_index++] = src[row_offset + j * stride_w];
                        }
                    }
                }
            }
        }
    }

    // Step 2: regroup the im2col matrix by output position. Every full group
    // of 8 positions becomes one channel of bottom_tm laid out as `depth` runs
    // of 8 bytes; each leftover position gets a channel of its own holding
    // `depth` single bytes.
    Mat bottom_tm(8*kernel_size, inch, out_size/8 + out_size%8, (size_t)1u, opt.workspace_allocator);
    {
        const int full_groups = out_size >> 3;
        const int tail_start = full_groups << 3;

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g=0; g<full_groups; g++)
        {
            const signed char* src = bottom_im2col.channel(0);
            src += g * 8;

            signed char* dst = bottom_tm.channel(g);

            for (int q=0; q<depth; q++)
            {
                for (int n=0; n<8; n++)
                {
                    dst[n] = src[n];
                }

                dst += 8;
                src += out_size; // next im2col row, same 8 columns
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int i=tail_start; i<out_size; i++)
        {
            const signed char* src = bottom_im2col.channel(0);
            src += i;

            // leftover positions follow the full groups, one channel each
            signed char* dst = bottom_tm.channel(i/8 + i%8);

            for (int q=0; q<depth; q++)
            {
                dst[q] = src[0];
                src += out_size;
            }
        }
    }

    // Step 3: regroup the weights the same way. Four output channels are
    // interleaved per channel of kernel_tm (4 bytes per depth step); each
    // leftover output channel is copied unchanged into its own channel.
    Mat kernel_tm(4*kernel_size, inch, outch/4 + outch%4, (size_t)1u, opt.workspace_allocator);
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            const int p = pp * 4;

            const signed char* k0 = weights + (p+0) * depth;
            const signed char* k1 = weights + (p+1) * depth;
            const signed char* k2 = weights + (p+2) * depth;
            const signed char* k3 = weights + (p+3) * depth;

            signed char* dst = kernel_tm.channel(pp);

            for (int q=0; q<depth; q++)
            {
                dst[0] = k0[q];
                dst[1] = k1[q];
                dst[2] = k2[q];
                dst[3] = k3[q];

                dst += 4;
            }
        }

        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=remain_outch_start; p<outch; p++)
        {
            const signed char* src = weights + p * depth;

            signed char* dst = kernel_tm.channel(p/4 + p%4);

            for (int q=0; q<depth; q++)
            {
                dst[q] = src[q];
            }
        }
    }

    // Step 4: matrix multiply, top_blob[outch x out_size] =
    // kernel[outch x depth] * im2col[depth x out_size], walking the two
    // regrouped buffers linearly.
    {
        // four output channels at a time
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int pp=0; pp<nn_outch; pp++)
        {
            const int p = pp * 4;

            int* output0 = top_blob.channel(p);
            int* output1 = top_blob.channel(p+1);
            int* output2 = top_blob.channel(p+2);
            int* output3 = top_blob.channel(p+3);

            int j = 0;

            // 4 channels x 8 positions
            for (; j+7<out_size; j+=8)
            {
                const signed char* vb = bottom_tm.channel(j/8);
                const signed char* va = kernel_tm.channel(pp);

                int acc[4][8] = {{0}};

                for (int k=0; k<depth; k++)
                {
                    for (int n=0; n<8; n++)
                    {
                        const int b = vb[n];

                        acc[0][n] += va[0] * b;
                        acc[1][n] += va[1] * b;
                        acc[2][n] += va[2] * b;
                        acc[3][n] += va[3] * b;
                    }

                    va += 4;
                    vb += 8;
                }

                for (int n=0; n<8; n++)
                {
                    output0[n] = acc[0][n];
                    output1[n] = acc[1][n];
                    output2[n] = acc[2][n];
                    output3[n] = acc[3][n];
                }

                output0 += 8;
                output1 += 8;
                output2 += 8;
                output3 += 8;
            }

            // 4 channels x 1 leftover position
            for (; j<out_size; j++)
            {
                const signed char* vb = bottom_tm.channel(j/8 + j%8);
                const signed char* va = kernel_tm.channel(pp);

                int s0 = 0;
                int s1 = 0;
                int s2 = 0;
                int s3 = 0;

                for (int k=0; k<depth; k++)
                {
                    const int b = vb[k];

                    s0 += va[0] * b;
                    s1 += va[1] * b;
                    s2 += va[2] * b;
                    s3 += va[3] * b;

                    va += 4;
                }

                *output0++ = s0;
                *output1++ = s1;
                *output2++ = s2;
                *output3++ = s3;
            }
        }

        // leftover output channels, one at a time
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int p=remain_outch_start; p<outch; p++)
        {
            int* output = top_blob.channel(p);

            const int kernel_channel = p/4 + p%4;

            int j = 0;

            // 1 channel x 8 positions
            for (; j+7<out_size; j+=8)
            {
                const signed char* vb = bottom_tm.channel(j/8);
                const signed char* va = kernel_tm.channel(kernel_channel);

                int acc[8] = {0};

                for (int k=0; k<depth; k++)
                {
                    const int a = va[k];

                    for (int n=0; n<8; n++)
                    {
                        acc[n] += a * vb[n];
                    }

                    vb += 8;
                }

                for (int n=0; n<8; n++)
                {
                    output[n] = acc[n];
                }

                output += 8;
            }

            // 1 channel x 1 leftover position
            for (; j<out_size; j++)
            {
                const signed char* vb = bottom_tm.channel(j/8 + j%8);
                const signed char* va = kernel_tm.channel(kernel_channel);

                int s = 0;

                for (int k=0; k<depth; k++)
                {
                    s += va[k] * vb[k];
                }

                *output++ = s;
            }
        }
    }
 }
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -14,17 +14,61 @@

 #include "convolution_x86.h"

 #include "layer_type.h"
 #include "benchmark.h"

 namespace ncnn {

 #include "convolution_1x1.h"
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"

 #include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
 #include "convolution_5x5_int8.h"
 #include "convolution_7x7_int8.h"

 DEFINE_LAYER_CREATOR(Convolution_x86)

 // Loads the base Convolution parameters, then decides whether the 3x3
 // winograd path may be used for this layer (result kept in use_winograd3x3).
 // Returns the base-class error code unchanged on failure, 0 otherwise.
 int Convolution_x86::load_param(const ParamDict& pd)
 {
    const int status = Convolution::load_param(pd);
    if (status != 0)
        return status;

    use_winograd3x3 = false;

    if (!pd.use_winograd_convolution)
        return 0;

    // winograd is only considered for an undilated 3x3 kernel with stride 1
    const bool is_3x3 = (kernel_w == 3 && kernel_h == 3);
    const bool is_undilated = (dilation_w == 1 && dilation_h == 1);
    const bool is_unit_stride = (stride_w == 1 && stride_h == 1);
    if (!(is_3x3 && is_undilated && is_unit_stride))
        return 0;

    // nine weights per (input channel, output channel) pair
    const int num_input = weight_data_size / 9 / num_output;

    // winograd is slow on small channel count
    use_winograd3x3 = (num_input >= 16 && num_output >= 16);

    return 0;
 }

 // Loads the base Convolution weights and, when the winograd path was selected
 // by load_param, pre-transforms them into weight_3x3_winograd23_data.
 // Returns the base-class error code unchanged on failure, 0 otherwise.
 int Convolution_x86::load_model(const ModelBin& mb)
 {
    const int status = Convolution::load_model(mb);
    if (status != 0)
        return status;

    if (!use_winograd3x3)
        return 0;

    // nine weights per (input channel, output channel) pair
    const int num_input = weight_data_size / 9 / num_output;

    // int8 and fp32 inference each have their own kernel transform
    if (use_int8_inference)
    {
        conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
        return 0;
    }

    conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
    return 0;
 }

 int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
 {
    int w = bottom_blob.w;
@@ -147,7 +191,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    const int kernel_size = kernel_w;
    const int stride = stride_w;

    if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h)
    if (kernel_size > 7 || stride > 7 || dilation_w != dilation_h)
    {
        return Convolution::forward(bottom_blob, top_blob, opt);
    }
@@ -155,26 +199,23 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
    typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);

    // kernel_size x stride
    conv_func conv_func_table[5][5] =
    conv_func conv_func_table[7][4] =
    {
        {
            conv1x1s1_sse,
            conv1x1s2_sse,
            0,
            0,
            0
        }, // kernel_size = 1
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 2
        {
            conv3x3s1_sse,
            0,
            0,
            conv3x3s2_sse,
            0,
            0
        }, // kernel_size = 3
@@ -182,35 +223,43 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 4
        {
            conv5x5s1_sse,
            0,
            0,
            0
        }, // kernel_size = 5
        {
            0,
            0,
            0,
            0
        }, // kernel_size = 6
        {
            0,          
            0,          
            0,
            0
        }  // kernel_size = 5
        }  // kernel_size = 7        
    };

    typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);

    // kernel_size x stride
    conv_int8_func conv_int8_func_table[5][5] =
    conv_int8_func conv_int8_func_table[7][4] =
    {
        {
            conv1x1s1_int8_sse,
            conv1x1s2_int8_sse,
            0,
            0,
            0
        }, // kernel_size = 1
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 2
        {
@@ -218,22 +267,31 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
            conv3x3s2_int8_sse,
            0,
            0,
            0
        }, // kernel_size = 3
        {
            0,
            0,
            0,
            0,
            0
        }, // kernel_size = 4
        {        
            conv5x5s1_int8_sse,
            conv5x5s2_int8_sse,    
            0,
            0
        }, // kernel_size = 5
        {
            0,
            0,
            0,
            0
        }, // kernel_size = 6
        {
            conv7x7s1_int8_sse,          
            conv7x7s2_int8_sse, 
            0,
            0
        }  // kernel_size = 5
        }  // kernel_size = 7
    };

    conv_func conv = 0;
@@ -322,21 +380,69 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option

    if (use_int8_inference)
    {
        conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);

        // dequantize, reverse scale inplace
        if (use_int8_requantize == true)
        {
            ncnn::Option opt_g = opt;
            opt_g.blob_allocator = top_blob.allocator;
            Mat top_blob_tm;
            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
            if (top_blob_tm.empty())
                return -100;
            
            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
            if (top_blob.empty())
                return -100; 

            if (use_winograd3x3)
                conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt);
            else
                conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);

            dequantize->forward_inplace(top_blob, opt_g);
            // requantize, reverse scale inplace
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = top_blob.allocator;

                Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
                Mat top_blob_g = top_blob.channel_range(p, 1);
                requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
            }                                       
        }
        else
        {
            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
            if (top_blob.empty())
                return -100; 

            if (use_winograd3x3)
                conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt);
            else
                conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);

            // dequantize, reverse scale inplace
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int p=0; p<num_output; p++)
            {
                ncnn::Option opt_g = opt;
                opt_g.num_threads = 1;
                opt_g.blob_allocator = top_blob.allocator;

                Mat top_blob_g = top_blob.channel_range(p, 1);
                dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
            }                    
        } 
      
        return 0;
    }

    conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

    if (use_winograd3x3)
    {
        conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt);
    }    
    else
        conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
        
    return 0;
 }

--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -24,8 +24,16 @@ typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option
 // x86 variant of the Convolution layer: derives from Convolution and
 // overrides its virtual load/forward entry points.
 class Convolution_x86 : public Convolution
 {
 public:
    // Load layer parameters from the parameter dictionary.
    // NOTE(review): presumably returns 0 on success like other layer hooks — confirm in the .cpp.
    virtual int load_param(const ParamDict& pd);

    // Load layer weights from the model binary.
    // NOTE(review): presumably also where weight_3x3_winograd23_data is prepared — confirm in the .cpp.
    virtual int load_model(const ModelBin& mb);

    // Run the convolution on bottom_blob and write the result to top_blob.
    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
    // Forward variant that receives the convolution kernel to use as a
    // conv_func function pointer.
    // NOTE(review): by its name this handles dilated convolution — confirm in the .cpp.
    virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const;

 public:
    // When true, the conv3x3s1_winograd23_*_sse kernels are used in place of
    // the generic conv function pointer.
    bool use_winograd3x3;
    // Weight blob handed to the conv3x3s1_winograd23_*_sse kernels.
    Mat weight_3x3_winograd23_data;
 };

 } // namespace ncnn
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -134,7 +134,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
    }

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;     

    Mat bottom_blob_unbordered = bottom_blob;
    if (use_int8_inference && elemsize != 1)
@@ -159,8 +159,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
            quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
        }

        bottom_blob_unbordered = bottom_blob_int8;
    }
        bottom_blob_unbordered = bottom_blob_int8;       
    }     

    Mat bottom_blob_bordered = bottom_blob_unbordered;
    if (pad_w > 0 || pad_h > 0)
@@ -203,25 +203,65 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
            {
                if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
                {
                    if (stride_w == 1 && stride_h == 1)
                    if (use_int8_requantize)
                    {
                        convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
                        Mat top_blob_tm;
                        top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
                        if (top_blob_tm.empty())
                            return -100;
                        
                        top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
                        if (top_blob.empty())
                            return -100;

                        if (stride_w == 1 && stride_h == 1)
                        {
                            convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
                        }
                        else if (stride_w == 2 && stride_h == 2)
                        {
                            convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
                        }

                        // requantize, reverse scale inplace
                        #pragma omp parallel for num_threads(opt.num_threads)
                        for (int g=0; g<group; g++)
                        {
                            ncnn::Option opt_g = opt;
                            opt_g.num_threads = 1;
                            opt_g.blob_allocator = top_blob.allocator;

                            Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
                            Mat top_blob_g = top_blob.channel_range(g, 1);
                            requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
                        }                      
                    }
                    else if (stride_w == 2 && stride_h == 2)
                    else
                    {
                        convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
                    }

                    // dequantize, reverse scale inplace
                    #pragma omp parallel for num_threads(opt.num_threads)
                    for (int g=0; g<group; g++)
                    {
                        ncnn::Option opt_g = opt;
                        opt_g.num_threads = 1;
                        opt_g.blob_allocator = top_blob.allocator;

                        Mat top_blob_g = top_blob.channel(g);
                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
                        top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
                        if (top_blob.empty())
                            return -100;                       

                        if (stride_w == 1 && stride_h == 1)
                        {
                            convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
                        }
                        else if (stride_w == 2 && stride_h == 2)
                        {
                            convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
                        }

                        // dequantize, reverse scale inplace
                        #pragma omp parallel for num_threads(opt.num_threads)
                        for (int g=0; g<group; g++)
                        {
                            ncnn::Option opt_g = opt;
                            opt_g.num_threads = 1;
                            opt_g.blob_allocator = top_blob.allocator;

                            Mat top_blob_g = top_blob.channel(g);
                            dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
                        }
                    }

                    return 0;
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -16,6 +16,9 @@
 #include "layer_type.h"
 #include "modelbin.h"
 #include "paramdict.h"
 #include "convolution.h"
 #include "convolutiondepthwise.h"
 #include "relu.h"

 #include <stdarg.h>
 #include <stdio.h>
@@ -679,6 +682,8 @@ int Net::load_model(FILE* fp)
    }
 #endif // NCNN_VULKAN

    fuse_network();

    return ret;
 }

@@ -898,6 +903,110 @@ int Net::load_model(const unsigned char* _mem)
    return mem - _mem;
 }

 // Scan the loaded network and enable int8 requantize fusion.
 //
 // For every int8 Convolution / ConvolutionDepthWise layer whose output goes
 // through a ReLU into
 //   - another Convolution / ConvolutionDepthWise, or
 //   - a Split with at least two outputs whose first consumers are all
 //     Convolution, ConvolutionDepthWise or PriorBox layers,
 // the producing layer gets use_int8_requantize set, its top_blob_int8_scale
 // taken from the consumer's bottom blob scale, and create_requantize_op()
 // called. Layers that do not match are left untouched.
 //
 // The whole pass is compiled out unless both NCNN_STRING and NCNN_REQUANT
 // are enabled, because it matches layers by their type string.
 void Net::fuse_network()
 {
 #if NCNN_STRING && NCNN_REQUANT
    for (size_t i=0; i<layers.size(); i++)
    {
        Layer* layer = layers[i];

        const bool is_conv = layer->type == "Convolution";
        const bool is_convdw = layer->type == "ConvolutionDepthWise";
        if (!is_conv && !is_convdw)
            continue;

        // Read the flag through the layer's own class instead of always
        // casting to Convolution.
        const bool use_int8 = is_conv
            ? static_cast<Convolution*>(layer)->use_int8_inference
            : static_cast<ConvolutionDepthWise*>(layer)->use_int8_inference;
        if (!use_int8)
            continue;

        if (layer->tops.empty())
            continue;

        for (size_t n=0; n<blobs[layer->tops[0]].consumers.size(); n++)
        {
            Layer* layer_next = layers[blobs[layer->tops[0]].consumers[n]];

            if (layer_next->type == "Pooling")
            {
                // ToDo
                continue;
            }

            if (layer_next->type != "ReLU")
                continue;

            // A ReLU whose output blob has no consumer (e.g. a network
            // output) has nothing to fuse with; indexing consumers[0] there
            // would read out of bounds.
            if (layer_next->tops.empty() || blobs[layer_next->tops[0]].consumers.empty())
                continue;

            Layer* layer_next_2 = layers[blobs[layer_next->tops[0]].consumers[0]];

            const bool next_is_conv = layer_next_2->type == "Convolution";
            const bool next_is_convdw = layer_next_2->type == "ConvolutionDepthWise";

            if (next_is_conv || next_is_convdw)
            {
                // conv -> relu -> conv: requantize straight to the scale the
                // next layer expects on its input.
                if (is_conv)
                {
                    Convolution* conv = static_cast<Convolution*>(layer);
                    conv->use_int8_requantize = true;
                    if (next_is_conv)
                        conv->top_blob_int8_scale = static_cast<Convolution*>(layer_next_2)->bottom_blob_int8_scale;
                    else
                        conv->top_blob_int8_scale = static_cast<ConvolutionDepthWise*>(layer_next_2)->bottom_blob_int8_scales[0];
                    conv->create_requantize_op();
                }
                else
                {
                    ConvolutionDepthWise* convdw = static_cast<ConvolutionDepthWise*>(layer);
                    convdw->use_int8_requantize = true;
                    if (next_is_conv)
                        convdw->top_blob_int8_scale = static_cast<Convolution*>(layer_next_2)->bottom_blob_int8_scale;
                    else
                        convdw->top_blob_int8_scale = static_cast<ConvolutionDepthWise*>(layer_next_2)->bottom_blob_int8_scales[0];
                    convdw->create_requantize_op();
                }
            }
            else if (layer_next_2->type == "Split")
            {
                const size_t branch_count = layer_next_2->tops.size();

                // Every Split output must be consumed first by a layer that
                // can take the requantized blob.
                bool all_conv = true;
                for (size_t j=0; j<branch_count; j++)
                {
                    if (blobs[layer_next_2->tops[j]].consumers.empty())
                    {
                        // dangling Split output: nothing to inspect
                        all_conv = false;
                        break;
                    }

                    Layer* layer_next_3 = layers[blobs[layer_next_2->tops[j]].consumers[0]];
                    if (layer_next_3->type != "Convolution" && layer_next_3->type != "ConvolutionDepthWise" && layer_next_3->type != "PriorBox")
                    {
                        all_conv = false;
                        break;
                    }
                }

                if (!all_conv || branch_count < size_t(2))
                    continue;

                // NOTE(review): only Convolution consumers contribute a scale
                // here and the last one wins; ConvolutionDepthWise consumers
                // are accepted above but never provide one. Confirm intended.
                for (size_t j=0; j<branch_count; j++)
                {
                    Layer* layer_next_3 = layers[blobs[layer_next_2->tops[j]].consumers[0]];
                    if (layer_next_3->type != "Convolution")
                        continue;

                    if (is_conv)
                        static_cast<Convolution*>(layer)->top_blob_int8_scale = static_cast<Convolution*>(layer_next_3)->bottom_blob_int8_scale;
                    else
                        static_cast<ConvolutionDepthWise*>(layer)->top_blob_int8_scale = static_cast<Convolution*>(layer_next_3)->bottom_blob_int8_scale;
                }

                if (is_conv)
                {
                    Convolution* conv = static_cast<Convolution*>(layer);
                    conv->use_int8_requantize = true;
                    conv->create_requantize_op();
                }
                else
                {
                    ConvolutionDepthWise* convdw = static_cast<ConvolutionDepthWise*>(layer);
                    convdw->use_int8_requantize = true;
                    convdw->create_requantize_op();
                }
            }
        }
    }
 #endif
 }

 void Net::clear()
 {
    blobs.clear();
--- a/src/net.h
+++ b/src/net.h
@@ -76,6 +76,10 @@ public:
    // return bytes consumed
    int load_model(const unsigned char* mem);

    // parse the network structure and fuse each int8
    // dequantize + quantize pair into a single requantize op
    void fuse_network();

    // unload network structure and weight data
    void clear();

--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -22,5 +22,7 @@
 #cmakedefine01 NCNN_PIXEL
 #cmakedefine01 NCNN_PIXEL_ROTATE
 #cmakedefine01 NCNN_VULKAN
 #cmakedefine01 NCNN_REQUANT
 #cmakedefine01 NCNN_IM2COL_SGEMM

 #endif // NCNN_PLATFORM_H
--- a/tools/caffe/caffe2ncnn.cpp
+++ b/tools/caffe/caffe2ncnn.cpp
@@ -685,7 +685,7 @@ int main(int argc, char** argv)

                if (int8_scale_term)
                {
                    if ((int)weight_int8scale.size() == num_group && (int)blob_int8scale.size() == num_group)
                    if ((int)weight_int8scale.size() == num_group)
                    {
                        fprintf(pp, " 8=1");
                    }