diff --git a/CMakeLists.txt b/CMakeLists.txt
index c430db591..15b8ba95a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -32,6 +32,8 @@ option(NCNN_PIXEL "convert and resize from/to image pixel" ON)
 option(NCNN_PIXEL_ROTATE "rotate image pixel orientation" OFF)
 option(NCNN_CMAKE_VERBOSE "print verbose cmake messages" OFF)
 option(NCNN_VULKAN "vulkan compute support" OFF)
+option(NCNN_REQUANT "auto merge int8 quant and dequant" OFF) # opt-in build switch, defaults to OFF
+option(NCNN_IM2COL_SGEMM "im2col sgemm support" OFF) # opt-in build switch, defaults to OFF
 
 if(NCNN_OPENMP)
     find_package(OpenMP)
diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp
index fa7978c19..454e086f9 100644
--- a/benchmark/benchncnn.cpp
+++ b/benchmark/benchncnn.cpp
@@ -202,7 +202,7 @@ void benchmark(const char* comment, void (*init)(ncnn::Net&), void (*run)(const
 
     time_avg /= g_loop_count;
 
-    fprintf(stderr, "%16s  min = %7.2f  max = %7.2f  avg = %7.2f\n", comment, time_min, time_max, time_avg);
+    fprintf(stderr, "%-20s  min = %7.2f  max = %7.2f  avg = %7.2f\n", comment, time_min, time_max, time_avg);
 }
 
 void squeezenet_init(ncnn::Net& net)
@@ -210,6 +210,11 @@ void squeezenet_init(ncnn::Net& net)
     net.load_param("squeezenet.param");
 }
 
+void squeezenet_int8_init(ncnn::Net& net) // init callback for the "squeezenet-int8" entry in main(); paired with squeezenet_run
+{
+    net.load_param("squeezenet_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void squeezenet_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -226,6 +231,11 @@ void mobilenet_init(ncnn::Net& net)
     net.load_param("mobilenet.param");
 }
 
+void mobilenet_int8_init(ncnn::Net& net) // init callback for the "mobilenet-int8" entry in main(); paired with mobilenet_run
+{
+    net.load_param("mobilenet_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void mobilenet_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -306,6 +316,11 @@ void googlenet_init(ncnn::Net& net)
     net.load_param("googlenet.param");
 }
 
+void googlenet_int8_init(ncnn::Net& net) // init callback for the "googlenet-int8" entry in main(); paired with googlenet_run
+{
+    net.load_param("googlenet_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void googlenet_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -322,6 +337,11 @@ void resnet18_init(ncnn::Net& net)
     net.load_param("resnet18.param");
 }
 
+void resnet18_int8_init(ncnn::Net& net) // init callback for the "resnet18-int8" entry in main(); paired with resnet18_run
+{
+    net.load_param("resnet18_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void resnet18_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -354,6 +374,11 @@ void vgg16_init(ncnn::Net& net)
     net.load_param("vgg16.param");
 }
 
+void vgg16_int8_init(ncnn::Net& net) // NOTE(review): no benchmark("vgg16-int8", ...) call is added in main() by this patch, so this looks unused - confirm
+{
+    net.load_param("vgg16_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void vgg16_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -365,11 +390,37 @@ void vgg16_run(const ncnn::Net& net)
     ex.extract("prob", out);
 }
 
+void resnet50_init(ncnn::Net& net) // init callback for the "resnet50" entry in main(); paired with resnet50_run
+{
+    net.load_param("resnet50.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
+void resnet50_int8_init(ncnn::Net& net) // init callback for the "resnet50-int8" entry in main(); paired with resnet50_run
+{
+    net.load_param("resnet50_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
+void resnet50_run(const ncnn::Net& net) // run callback shared by the "resnet50" and "resnet50-int8" entries in main()
+{
+    ncnn::Extractor ex = net.create_extractor(); // a new extractor is created on every call
+
+    ncnn::Mat in(224, 224, 3); // presumably width, height, channels - confirm against ncnn::Mat; no pixel data is written here
+    ex.input("data", in); // binds the input to the blob named "data"
+
+    ncnn::Mat out; // result is never read after extract(); this function returns nothing
+    ex.extract("prob", out); // requests the blob named "prob"; presumably this drives the inference being timed - confirm
+}
+
 void squeezenet_ssd_init(ncnn::Net& net)
 {
     net.load_param("squeezenet_ssd.param");
 }
 
+void squeezenet_ssd_int8_init(ncnn::Net& net) // init callback for the "squeezenet-ssd-int8" entry in main(); paired with squeezenet_ssd_run
+{
+    net.load_param("squeezenet_ssd_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void squeezenet_ssd_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -386,6 +437,11 @@ void mobilenet_ssd_init(ncnn::Net& net)
     net.load_param("mobilenet_ssd.param");
 }
 
+void mobilenet_ssd_int8_init(ncnn::Net& net) // init callback for the "mobilenet-ssd-int8" entry in main(); paired with mobilenet_ssd_run
+{
+    net.load_param("mobilenet_ssd_int8.param"); // bare file name (no directory part); any result of the call is discarded
+}
+
 void mobilenet_ssd_run(const ncnn::Net& net)
 {
     ncnn::Extractor ex = net.create_extractor();
@@ -497,8 +553,12 @@ int main(int argc, char** argv)
     // run
     benchmark("squeezenet", squeezenet_init, squeezenet_run);
 
+    benchmark("squeezenet-int8", squeezenet_int8_init, squeezenet_run);
+
     benchmark("mobilenet", mobilenet_init, mobilenet_run);
 
+    benchmark("mobilenet-int8", mobilenet_int8_init, mobilenet_run);
+
     benchmark("mobilenet_v2", mobilenet_v2_init, mobilenet_v2_run);
 
     benchmark("shufflenet", shufflenet_init, shufflenet_run);
@@ -509,16 +569,28 @@ int main(int argc, char** argv)
 
     benchmark("googlenet", googlenet_init, googlenet_run);
 
+    benchmark("googlenet-int8", googlenet_int8_init, googlenet_run);
+
     benchmark("resnet18", resnet18_init, resnet18_run);
 
+    benchmark("resnet18-int8", resnet18_int8_init, resnet18_run);
+
     benchmark("alexnet", alexnet_init, alexnet_run);
 
     benchmark("vgg16", vgg16_init, vgg16_run);
 
+    benchmark("resnet50", resnet50_init, resnet50_run);
+
+    benchmark("resnet50-int8", resnet50_int8_init, resnet50_run);
+
     benchmark("squeezenet-ssd", squeezenet_ssd_init, squeezenet_ssd_run);
 
+    benchmark("squeezenet-ssd-int8", squeezenet_ssd_int8_init, squeezenet_ssd_run);
+
     benchmark("mobilenet-ssd", mobilenet_ssd_init, mobilenet_ssd_run);
 
+    benchmark("mobilenet-ssd-int8", mobilenet_ssd_int8_init, mobilenet_ssd_run);
+
     benchmark("mobilenet-yolo", mobilenet_yolo_init, mobilenet_yolo_run);
 
     benchmark("mobilenet-yolov3", mobilenet_yolov3_init, mobilenet_yolov3_run);
diff --git a/benchmark/googlenet_int8.param b/benchmark/googlenet_int8.param
new file mode 100755
index 000000000..d2dfeadc1
--- /dev/null
+++ b/benchmark/googlenet_int8.param
@@ -0,0 +1,154 @@
+7767517
+152 179
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1/7x7_s2     1 1 data conv1/7x7_s2 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
+ReLU             conv1/relu_7x7   1 1 conv1/7x7_s2 conv1/7x7_s2_conv1/relu_7x7
+Pooling          pool1/3x3_s2     1 1 conv1/7x7_s2_conv1/relu_7x7 pool1/3x3_s2 0=0 1=3 2=2 3=0 4=0
+LRN              pool1/norm1      1 1 pool1/3x3_s2 pool1/norm1 0=0 1=5 2=0.000100 3=0.750000
+Convolution      conv2/3x3_reduce 1 1 pool1/norm1 conv2/3x3_reduce 0=64 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             conv2/relu_3x3_reduce 1 1 conv2/3x3_reduce conv2/3x3_reduce_conv2/relu_3x3_reduce
+Convolution      conv2/3x3        1 1 conv2/3x3_reduce_conv2/relu_3x3_reduce conv2/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
+ReLU             conv2/relu_3x3   1 1 conv2/3x3 conv2/3x3_conv2/relu_3x3
+LRN              conv2/norm2      1 1 conv2/3x3_conv2/relu_3x3 conv2/norm2 0=0 1=5 2=0.000100 3=0.750000
+Pooling          pool2/3x3_s2     1 1 conv2/norm2 pool2/3x3_s2 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_0      1 4 pool2/3x3_s2 pool2/3x3_s2_splitncnn_0 pool2/3x3_s2_splitncnn_1 pool2/3x3_s2_splitncnn_2 pool2/3x3_s2_splitncnn_3
+Convolution      inception_3a/1x1 1 1 pool2/3x3_s2_splitncnn_3 inception_3a/1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+ReLU             inception_3a/relu_1x1 1 1 inception_3a/1x1 inception_3a/1x1_inception_3a/relu_1x1
+Convolution      inception_3a/3x3_reduce 1 1 pool2/3x3_s2_splitncnn_2 inception_3a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
+ReLU             inception_3a/relu_3x3_reduce 1 1 inception_3a/3x3_reduce inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce
+Convolution      inception_3a/3x3 1 1 inception_3a/3x3_reduce_inception_3a/relu_3x3_reduce inception_3a/3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
+ReLU             inception_3a/relu_3x3 1 1 inception_3a/3x3 inception_3a/3x3_inception_3a/relu_3x3
+Convolution      inception_3a/5x5_reduce 1 1 pool2/3x3_s2_splitncnn_1 inception_3a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
+ReLU             inception_3a/relu_5x5_reduce 1 1 inception_3a/5x5_reduce inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce
+Convolution      inception_3a/5x5 1 1 inception_3a/5x5_reduce_inception_3a/relu_5x5_reduce inception_3a/5x5 0=32 1=5 2=1 3=1 4=2 5=1 6=12800 8=2
+ReLU             inception_3a/relu_5x5 1 1 inception_3a/5x5 inception_3a/5x5_inception_3a/relu_5x5
+Pooling          inception_3a/pool 1 1 pool2/3x3_s2_splitncnn_0 inception_3a/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_3a/pool_proj 1 1 inception_3a/pool inception_3a/pool_proj 0=32 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
+ReLU             inception_3a/relu_pool_proj 1 1 inception_3a/pool_proj inception_3a/pool_proj_inception_3a/relu_pool_proj
+Concat           inception_3a/output 4 1 inception_3a/1x1_inception_3a/relu_1x1 inception_3a/3x3_inception_3a/relu_3x3 inception_3a/5x5_inception_3a/relu_5x5 inception_3a/pool_proj_inception_3a/relu_pool_proj inception_3a/output 0=0
+Split            splitncnn_1      1 4 inception_3a/output inception_3a/output_splitncnn_0 inception_3a/output_splitncnn_1 inception_3a/output_splitncnn_2 inception_3a/output_splitncnn_3
+Convolution      inception_3b/1x1 1 1 inception_3a/output_splitncnn_3 inception_3b/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             inception_3b/relu_1x1 1 1 inception_3b/1x1 inception_3b/1x1_inception_3b/relu_1x1
+Convolution      inception_3b/3x3_reduce 1 1 inception_3a/output_splitncnn_2 inception_3b/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             inception_3b/relu_3x3_reduce 1 1 inception_3b/3x3_reduce inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce
+Convolution      inception_3b/3x3 1 1 inception_3b/3x3_reduce_inception_3b/relu_3x3_reduce inception_3b/3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=221184 8=2
+ReLU             inception_3b/relu_3x3 1 1 inception_3b/3x3 inception_3b/3x3_inception_3b/relu_3x3
+Convolution      inception_3b/5x5_reduce 1 1 inception_3a/output_splitncnn_1 inception_3b/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
+ReLU             inception_3b/relu_5x5_reduce 1 1 inception_3b/5x5_reduce inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce
+Convolution      inception_3b/5x5 1 1 inception_3b/5x5_reduce_inception_3b/relu_5x5_reduce inception_3b/5x5 0=96 1=5 2=1 3=1 4=2 5=1 6=76800 8=2
+ReLU             inception_3b/relu_5x5 1 1 inception_3b/5x5 inception_3b/5x5_inception_3b/relu_5x5
+Pooling          inception_3b/pool 1 1 inception_3a/output_splitncnn_0 inception_3b/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_3b/pool_proj 1 1 inception_3b/pool inception_3b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             inception_3b/relu_pool_proj 1 1 inception_3b/pool_proj inception_3b/pool_proj_inception_3b/relu_pool_proj
+Concat           inception_3b/output 4 1 inception_3b/1x1_inception_3b/relu_1x1 inception_3b/3x3_inception_3b/relu_3x3 inception_3b/5x5_inception_3b/relu_5x5 inception_3b/pool_proj_inception_3b/relu_pool_proj inception_3b/output 0=0
+Pooling          pool3/3x3_s2     1 1 inception_3b/output pool3/3x3_s2 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_2      1 4 pool3/3x3_s2 pool3/3x3_s2_splitncnn_0 pool3/3x3_s2_splitncnn_1 pool3/3x3_s2_splitncnn_2 pool3/3x3_s2_splitncnn_3
+Convolution      inception_4a/1x1 1 1 pool3/3x3_s2_splitncnn_3 inception_4a/1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=92160 8=2
+ReLU             inception_4a/relu_1x1 1 1 inception_4a/1x1 inception_4a/1x1_inception_4a/relu_1x1
+Convolution      inception_4a/3x3_reduce 1 1 pool3/3x3_s2_splitncnn_2 inception_4a/3x3_reduce 0=96 1=1 2=1 3=1 4=0 5=1 6=46080 8=2
+ReLU             inception_4a/relu_3x3_reduce 1 1 inception_4a/3x3_reduce inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce
+Convolution      inception_4a/3x3 1 1 inception_4a/3x3_reduce_inception_4a/relu_3x3_reduce inception_4a/3x3 0=208 1=3 2=1 3=1 4=1 5=1 6=179712 8=2
+ReLU             inception_4a/relu_3x3 1 1 inception_4a/3x3 inception_4a/3x3_inception_4a/relu_3x3
+Convolution      inception_4a/5x5_reduce 1 1 pool3/3x3_s2_splitncnn_1 inception_4a/5x5_reduce 0=16 1=1 2=1 3=1 4=0 5=1 6=7680 8=2
+ReLU             inception_4a/relu_5x5_reduce 1 1 inception_4a/5x5_reduce inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce
+Convolution      inception_4a/5x5 1 1 inception_4a/5x5_reduce_inception_4a/relu_5x5_reduce inception_4a/5x5 0=48 1=5 2=1 3=1 4=2 5=1 6=19200 8=2
+ReLU             inception_4a/relu_5x5 1 1 inception_4a/5x5 inception_4a/5x5_inception_4a/relu_5x5
+Pooling          inception_4a/pool 1 1 pool3/3x3_s2_splitncnn_0 inception_4a/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_4a/pool_proj 1 1 inception_4a/pool inception_4a/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=30720 8=2
+ReLU             inception_4a/relu_pool_proj 1 1 inception_4a/pool_proj inception_4a/pool_proj_inception_4a/relu_pool_proj
+Concat           inception_4a/output 4 1 inception_4a/1x1_inception_4a/relu_1x1 inception_4a/3x3_inception_4a/relu_3x3 inception_4a/5x5_inception_4a/relu_5x5 inception_4a/pool_proj_inception_4a/relu_pool_proj inception_4a/output 0=0
+Split            splitncnn_3      1 4 inception_4a/output inception_4a/output_splitncnn_0 inception_4a/output_splitncnn_1 inception_4a/output_splitncnn_2 inception_4a/output_splitncnn_3
+Convolution      inception_4b/1x1 1 1 inception_4a/output_splitncnn_3 inception_4b/1x1 0=160 1=1 2=1 3=1 4=0 5=1 6=81920 8=2
+ReLU             inception_4b/relu_1x1 1 1 inception_4b/1x1 inception_4b/1x1_inception_4b/relu_1x1
+Convolution      inception_4b/3x3_reduce 1 1 inception_4a/output_splitncnn_2 inception_4b/3x3_reduce 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
+ReLU             inception_4b/relu_3x3_reduce 1 1 inception_4b/3x3_reduce inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce
+Convolution      inception_4b/3x3 1 1 inception_4b/3x3_reduce_inception_4b/relu_3x3_reduce inception_4b/3x3 0=224 1=3 2=1 3=1 4=1 5=1 6=225792 8=2
+ReLU             inception_4b/relu_3x3 1 1 inception_4b/3x3 inception_4b/3x3_inception_4b/relu_3x3
+Convolution      inception_4b/5x5_reduce 1 1 inception_4a/output_splitncnn_1 inception_4b/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+ReLU             inception_4b/relu_5x5_reduce 1 1 inception_4b/5x5_reduce inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce
+Convolution      inception_4b/5x5 1 1 inception_4b/5x5_reduce_inception_4b/relu_5x5_reduce inception_4b/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
+ReLU             inception_4b/relu_5x5 1 1 inception_4b/5x5 inception_4b/5x5_inception_4b/relu_5x5
+Pooling          inception_4b/pool 1 1 inception_4a/output_splitncnn_0 inception_4b/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_4b/pool_proj 1 1 inception_4b/pool inception_4b/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             inception_4b/relu_pool_proj 1 1 inception_4b/pool_proj inception_4b/pool_proj_inception_4b/relu_pool_proj
+Concat           inception_4b/output 4 1 inception_4b/1x1_inception_4b/relu_1x1 inception_4b/3x3_inception_4b/relu_3x3 inception_4b/5x5_inception_4b/relu_5x5 inception_4b/pool_proj_inception_4b/relu_pool_proj inception_4b/output 0=0
+Split            splitncnn_4      1 4 inception_4b/output inception_4b/output_splitncnn_0 inception_4b/output_splitncnn_1 inception_4b/output_splitncnn_2 inception_4b/output_splitncnn_3
+Convolution      inception_4c/1x1 1 1 inception_4b/output_splitncnn_3 inception_4c/1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
+ReLU             inception_4c/relu_1x1 1 1 inception_4c/1x1 inception_4c/1x1_inception_4c/relu_1x1
+Convolution      inception_4c/3x3_reduce 1 1 inception_4b/output_splitncnn_2 inception_4c/3x3_reduce 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
+ReLU             inception_4c/relu_3x3_reduce 1 1 inception_4c/3x3_reduce inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce
+Convolution      inception_4c/3x3 1 1 inception_4c/3x3_reduce_inception_4c/relu_3x3_reduce inception_4c/3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
+ReLU             inception_4c/relu_3x3 1 1 inception_4c/3x3 inception_4c/3x3_inception_4c/relu_3x3
+Convolution      inception_4c/5x5_reduce 1 1 inception_4b/output_splitncnn_1 inception_4c/5x5_reduce 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+ReLU             inception_4c/relu_5x5_reduce 1 1 inception_4c/5x5_reduce inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce
+Convolution      inception_4c/5x5 1 1 inception_4c/5x5_reduce_inception_4c/relu_5x5_reduce inception_4c/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=38400 8=2
+ReLU             inception_4c/relu_5x5 1 1 inception_4c/5x5 inception_4c/5x5_inception_4c/relu_5x5
+Pooling          inception_4c/pool 1 1 inception_4b/output_splitncnn_0 inception_4c/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_4c/pool_proj 1 1 inception_4c/pool inception_4c/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             inception_4c/relu_pool_proj 1 1 inception_4c/pool_proj inception_4c/pool_proj_inception_4c/relu_pool_proj
+Concat           inception_4c/output 4 1 inception_4c/1x1_inception_4c/relu_1x1 inception_4c/3x3_inception_4c/relu_3x3 inception_4c/5x5_inception_4c/relu_5x5 inception_4c/pool_proj_inception_4c/relu_pool_proj inception_4c/output 0=0
+Split            splitncnn_5      1 4 inception_4c/output inception_4c/output_splitncnn_0 inception_4c/output_splitncnn_1 inception_4c/output_splitncnn_2 inception_4c/output_splitncnn_3
+Convolution      inception_4d/1x1 1 1 inception_4c/output_splitncnn_3 inception_4d/1x1 0=112 1=1 2=1 3=1 4=0 5=1 6=57344 8=2
+ReLU             inception_4d/relu_1x1 1 1 inception_4d/1x1 inception_4d/1x1_inception_4d/relu_1x1
+Convolution      inception_4d/3x3_reduce 1 1 inception_4c/output_splitncnn_2 inception_4d/3x3_reduce 0=144 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
+ReLU             inception_4d/relu_3x3_reduce 1 1 inception_4d/3x3_reduce inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce
+Convolution      inception_4d/3x3 1 1 inception_4d/3x3_reduce_inception_4d/relu_3x3_reduce inception_4d/3x3 0=288 1=3 2=1 3=1 4=1 5=1 6=373248 8=2
+ReLU             inception_4d/relu_3x3 1 1 inception_4d/3x3 inception_4d/3x3_inception_4d/relu_3x3
+Convolution      inception_4d/5x5_reduce 1 1 inception_4c/output_splitncnn_1 inception_4d/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             inception_4d/relu_5x5_reduce 1 1 inception_4d/5x5_reduce inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce
+Convolution      inception_4d/5x5 1 1 inception_4d/5x5_reduce_inception_4d/relu_5x5_reduce inception_4d/5x5 0=64 1=5 2=1 3=1 4=2 5=1 6=51200 8=2
+ReLU             inception_4d/relu_5x5 1 1 inception_4d/5x5 inception_4d/5x5_inception_4d/relu_5x5
+Pooling          inception_4d/pool 1 1 inception_4c/output_splitncnn_0 inception_4d/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_4d/pool_proj 1 1 inception_4d/pool inception_4d/pool_proj 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             inception_4d/relu_pool_proj 1 1 inception_4d/pool_proj inception_4d/pool_proj_inception_4d/relu_pool_proj
+Concat           inception_4d/output 4 1 inception_4d/1x1_inception_4d/relu_1x1 inception_4d/3x3_inception_4d/relu_3x3 inception_4d/5x5_inception_4d/relu_5x5 inception_4d/pool_proj_inception_4d/relu_pool_proj inception_4d/output 0=0
+Split            splitncnn_6      1 4 inception_4d/output inception_4d/output_splitncnn_0 inception_4d/output_splitncnn_1 inception_4d/output_splitncnn_2 inception_4d/output_splitncnn_3
+Convolution      inception_4e/1x1 1 1 inception_4d/output_splitncnn_3 inception_4e/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=135168 8=2
+ReLU             inception_4e/relu_1x1 1 1 inception_4e/1x1 inception_4e/1x1_inception_4e/relu_1x1
+Convolution      inception_4e/3x3_reduce 1 1 inception_4d/output_splitncnn_2 inception_4e/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=84480 8=2
+ReLU             inception_4e/relu_3x3_reduce 1 1 inception_4e/3x3_reduce inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce
+Convolution      inception_4e/3x3 1 1 inception_4e/3x3_reduce_inception_4e/relu_3x3_reduce inception_4e/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
+ReLU             inception_4e/relu_3x3 1 1 inception_4e/3x3 inception_4e/3x3_inception_4e/relu_3x3
+Convolution      inception_4e/5x5_reduce 1 1 inception_4d/output_splitncnn_1 inception_4e/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=16896 8=2
+ReLU             inception_4e/relu_5x5_reduce 1 1 inception_4e/5x5_reduce inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce
+Convolution      inception_4e/5x5 1 1 inception_4e/5x5_reduce_inception_4e/relu_5x5_reduce inception_4e/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
+ReLU             inception_4e/relu_5x5 1 1 inception_4e/5x5 inception_4e/5x5_inception_4e/relu_5x5
+Pooling          inception_4e/pool 1 1 inception_4d/output_splitncnn_0 inception_4e/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_4e/pool_proj 1 1 inception_4e/pool inception_4e/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=67584 8=2
+ReLU             inception_4e/relu_pool_proj 1 1 inception_4e/pool_proj inception_4e/pool_proj_inception_4e/relu_pool_proj
+Concat           inception_4e/output 4 1 inception_4e/1x1_inception_4e/relu_1x1 inception_4e/3x3_inception_4e/relu_3x3 inception_4e/5x5_inception_4e/relu_5x5 inception_4e/pool_proj_inception_4e/relu_pool_proj inception_4e/output 0=0
+Pooling          pool4/3x3_s2     1 1 inception_4e/output pool4/3x3_s2 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_7      1 4 pool4/3x3_s2 pool4/3x3_s2_splitncnn_0 pool4/3x3_s2_splitncnn_1 pool4/3x3_s2_splitncnn_2 pool4/3x3_s2_splitncnn_3
+Convolution      inception_5a/1x1 1 1 pool4/3x3_s2_splitncnn_3 inception_5a/1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=212992 8=2
+ReLU             inception_5a/relu_1x1 1 1 inception_5a/1x1 inception_5a/1x1_inception_5a/relu_1x1
+Convolution      inception_5a/3x3_reduce 1 1 pool4/3x3_s2_splitncnn_2 inception_5a/3x3_reduce 0=160 1=1 2=1 3=1 4=0 5=1 6=133120 8=2
+ReLU             inception_5a/relu_3x3_reduce 1 1 inception_5a/3x3_reduce inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce
+Convolution      inception_5a/3x3 1 1 inception_5a/3x3_reduce_inception_5a/relu_3x3_reduce inception_5a/3x3 0=320 1=3 2=1 3=1 4=1 5=1 6=460800 8=2
+ReLU             inception_5a/relu_3x3 1 1 inception_5a/3x3 inception_5a/3x3_inception_5a/relu_3x3
+Convolution      inception_5a/5x5_reduce 1 1 pool4/3x3_s2_splitncnn_1 inception_5a/5x5_reduce 0=32 1=1 2=1 3=1 4=0 5=1 6=26624 8=2
+ReLU             inception_5a/relu_5x5_reduce 1 1 inception_5a/5x5_reduce inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce
+Convolution      inception_5a/5x5 1 1 inception_5a/5x5_reduce_inception_5a/relu_5x5_reduce inception_5a/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=102400 8=2
+ReLU             inception_5a/relu_5x5 1 1 inception_5a/5x5 inception_5a/5x5_inception_5a/relu_5x5
+Pooling          inception_5a/pool 1 1 pool4/3x3_s2_splitncnn_0 inception_5a/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_5a/pool_proj 1 1 inception_5a/pool inception_5a/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
+ReLU             inception_5a/relu_pool_proj 1 1 inception_5a/pool_proj inception_5a/pool_proj_inception_5a/relu_pool_proj
+Concat           inception_5a/output 4 1 inception_5a/1x1_inception_5a/relu_1x1 inception_5a/3x3_inception_5a/relu_3x3 inception_5a/5x5_inception_5a/relu_5x5 inception_5a/pool_proj_inception_5a/relu_pool_proj inception_5a/output 0=0
+Split            splitncnn_8      1 4 inception_5a/output inception_5a/output_splitncnn_0 inception_5a/output_splitncnn_1 inception_5a/output_splitncnn_2 inception_5a/output_splitncnn_3
+Convolution      inception_5b/1x1 1 1 inception_5a/output_splitncnn_3 inception_5b/1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=319488 8=2
+ReLU             inception_5b/relu_1x1 1 1 inception_5b/1x1 inception_5b/1x1_inception_5b/relu_1x1
+Convolution      inception_5b/3x3_reduce 1 1 inception_5a/output_splitncnn_2 inception_5b/3x3_reduce 0=192 1=1 2=1 3=1 4=0 5=1 6=159744 8=2
+ReLU             inception_5b/relu_3x3_reduce 1 1 inception_5b/3x3_reduce inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce
+Convolution      inception_5b/3x3 1 1 inception_5b/3x3_reduce_inception_5b/relu_3x3_reduce inception_5b/3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=663552 8=2
+ReLU             inception_5b/relu_3x3 1 1 inception_5b/3x3 inception_5b/3x3_inception_5b/relu_3x3
+Convolution      inception_5b/5x5_reduce 1 1 inception_5a/output_splitncnn_1 inception_5b/5x5_reduce 0=48 1=1 2=1 3=1 4=0 5=1 6=39936 8=2
+ReLU             inception_5b/relu_5x5_reduce 1 1 inception_5b/5x5_reduce inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce
+Convolution      inception_5b/5x5 1 1 inception_5b/5x5_reduce_inception_5b/relu_5x5_reduce inception_5b/5x5 0=128 1=5 2=1 3=1 4=2 5=1 6=153600 8=2
+ReLU             inception_5b/relu_5x5 1 1 inception_5b/5x5 inception_5b/5x5_inception_5b/relu_5x5
+Pooling          inception_5b/pool 1 1 inception_5a/output_splitncnn_0 inception_5b/pool 0=0 1=3 2=1 3=1 4=0
+Convolution      inception_5b/pool_proj 1 1 inception_5b/pool inception_5b/pool_proj 0=128 1=1 2=1 3=1 4=0 5=1 6=106496 8=2
+ReLU             inception_5b/relu_pool_proj 1 1 inception_5b/pool_proj inception_5b/pool_proj_inception_5b/relu_pool_proj
+Concat           inception_5b/output 4 1 inception_5b/1x1_inception_5b/relu_1x1 inception_5b/3x3_inception_5b/relu_3x3 inception_5b/5x5_inception_5b/relu_5x5 inception_5b/pool_proj_inception_5b/relu_pool_proj inception_5b/output 0=0
+Pooling          pool5/7x7_s1     1 1 inception_5b/output pool5/7x7_s1 0=1 1=7 2=1 3=0 4=0
+Dropout          pool5/drop_7x7_s1 1 1 pool5/7x7_s1 pool5/7x7_s1_pool5/drop_7x7_s1
+InnerProduct     loss3/classifier 1 1 pool5/7x7_s1_pool5/drop_7x7_s1 loss3/classifier 0=1000 1=1 2=1024000
+Softmax          prob             1 1 loss3/classifier prob 0=0
diff --git a/benchmark/mobilenet_int8.param b/benchmark/mobilenet_int8.param
new file mode 100755
index 000000000..7994d3aeb
--- /dev/null
+++ b/benchmark/mobilenet_int8.param
@@ -0,0 +1,114 @@
+7767517
+112 112
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1            1 1 data conv1 0=32 1=3 2=1 3=2 4=1 5=0 6=864 8=2
+BatchNorm        conv1/bn         1 1 conv1 conv1_conv1/bn 0=32
+Scale            conv1/scale      1 1 conv1_conv1/bn conv1_conv1/scale 0=32 1=1
+ReLU             relu1            1 1 conv1_conv1/scale conv1_relu1
+ConvolutionDepthWise conv2_1/dw       1 1 conv1_relu1 conv2_1/dw 0=32 1=3 2=1 3=1 4=1 5=0 6=288 7=32 8=1
+BatchNorm        conv2_1/dw/bn    1 1 conv2_1/dw conv2_1/dw_conv2_1/dw/bn 0=32
+Scale            conv2_1/dw/scale 1 1 conv2_1/dw_conv2_1/dw/bn conv2_1/dw_conv2_1/dw/scale 0=32 1=1
+ReLU             relu2_1/dw       1 1 conv2_1/dw_conv2_1/dw/scale conv2_1/dw_relu2_1/dw
+Convolution      conv2_1/sep      1 1 conv2_1/dw_relu2_1/dw conv2_1/sep 0=64 1=1 2=1 3=1 4=0 5=0 6=2048 8=2
+BatchNorm        conv2_1/sep/bn   1 1 conv2_1/sep conv2_1/sep_conv2_1/sep/bn 0=64
+Scale            conv2_1/sep/scale 1 1 conv2_1/sep_conv2_1/sep/bn conv2_1/sep_conv2_1/sep/scale 0=64 1=1
+ReLU             relu2_1/sep      1 1 conv2_1/sep_conv2_1/sep/scale conv2_1/sep_relu2_1/sep
+ConvolutionDepthWise conv2_2/dw       1 1 conv2_1/sep_relu2_1/sep conv2_2/dw 0=64 1=3 2=1 3=2 4=1 5=0 6=576 7=64 8=1
+BatchNorm        conv2_2/dw/bn    1 1 conv2_2/dw conv2_2/dw_conv2_2/dw/bn 0=64
+Scale            conv2_2/dw/scale 1 1 conv2_2/dw_conv2_2/dw/bn conv2_2/dw_conv2_2/dw/scale 0=64 1=1
+ReLU             relu2_2/dw       1 1 conv2_2/dw_conv2_2/dw/scale conv2_2/dw_relu2_2/dw
+Convolution      conv2_2/sep      1 1 conv2_2/dw_relu2_2/dw conv2_2/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=8192 8=2
+BatchNorm        conv2_2/sep/bn   1 1 conv2_2/sep conv2_2/sep_conv2_2/sep/bn 0=128
+Scale            conv2_2/sep/scale 1 1 conv2_2/sep_conv2_2/sep/bn conv2_2/sep_conv2_2/sep/scale 0=128 1=1
+ReLU             relu2_2/sep      1 1 conv2_2/sep_conv2_2/sep/scale conv2_2/sep_relu2_2/sep
+ConvolutionDepthWise conv3_1/dw       1 1 conv2_2/sep_relu2_2/sep conv3_1/dw 0=128 1=3 2=1 3=1 4=1 5=0 6=1152 7=128 8=1
+BatchNorm        conv3_1/dw/bn    1 1 conv3_1/dw conv3_1/dw_conv3_1/dw/bn 0=128
+Scale            conv3_1/dw/scale 1 1 conv3_1/dw_conv3_1/dw/bn conv3_1/dw_conv3_1/dw/scale 0=128 1=1
+ReLU             relu3_1/dw       1 1 conv3_1/dw_conv3_1/dw/scale conv3_1/dw_relu3_1/dw
+Convolution      conv3_1/sep      1 1 conv3_1/dw_relu3_1/dw conv3_1/sep 0=128 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        conv3_1/sep/bn   1 1 conv3_1/sep conv3_1/sep_conv3_1/sep/bn 0=128
+Scale            conv3_1/sep/scale 1 1 conv3_1/sep_conv3_1/sep/bn conv3_1/sep_conv3_1/sep/scale 0=128 1=1
+ReLU             relu3_1/sep      1 1 conv3_1/sep_conv3_1/sep/scale conv3_1/sep_relu3_1/sep
+ConvolutionDepthWise conv3_2/dw       1 1 conv3_1/sep_relu3_1/sep conv3_2/dw 0=128 1=3 2=1 3=2 4=1 5=0 6=1152 7=128 8=1
+BatchNorm        conv3_2/dw/bn    1 1 conv3_2/dw conv3_2/dw_conv3_2/dw/bn 0=128
+Scale            conv3_2/dw/scale 1 1 conv3_2/dw_conv3_2/dw/bn conv3_2/dw_conv3_2/dw/scale 0=128 1=1
+ReLU             relu3_2/dw       1 1 conv3_2/dw_conv3_2/dw/scale conv3_2/dw_relu3_2/dw
+Convolution      conv3_2/sep      1 1 conv3_2/dw_relu3_2/dw conv3_2/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=32768 8=2
+BatchNorm        conv3_2/sep/bn   1 1 conv3_2/sep conv3_2/sep_conv3_2/sep/bn 0=256
+Scale            conv3_2/sep/scale 1 1 conv3_2/sep_conv3_2/sep/bn conv3_2/sep_conv3_2/sep/scale 0=256 1=1
+ReLU             relu3_2/sep      1 1 conv3_2/sep_conv3_2/sep/scale conv3_2/sep_relu3_2/sep
+ConvolutionDepthWise conv4_1/dw       1 1 conv3_2/sep_relu3_2/sep conv4_1/dw 0=256 1=3 2=1 3=1 4=1 5=0 6=2304 7=256 8=1
+BatchNorm        conv4_1/dw/bn    1 1 conv4_1/dw conv4_1/dw_conv4_1/dw/bn 0=256
+Scale            conv4_1/dw/scale 1 1 conv4_1/dw_conv4_1/dw/bn conv4_1/dw_conv4_1/dw/scale 0=256 1=1
+ReLU             relu4_1/dw       1 1 conv4_1/dw_conv4_1/dw/scale conv4_1/dw_relu4_1/dw
+Convolution      conv4_1/sep      1 1 conv4_1/dw_relu4_1/dw conv4_1/sep 0=256 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        conv4_1/sep/bn   1 1 conv4_1/sep conv4_1/sep_conv4_1/sep/bn 0=256
+Scale            conv4_1/sep/scale 1 1 conv4_1/sep_conv4_1/sep/bn conv4_1/sep_conv4_1/sep/scale 0=256 1=1
+ReLU             relu4_1/sep      1 1 conv4_1/sep_conv4_1/sep/scale conv4_1/sep_relu4_1/sep
+ConvolutionDepthWise conv4_2/dw       1 1 conv4_1/sep_relu4_1/sep conv4_2/dw 0=256 1=3 2=1 3=2 4=1 5=0 6=2304 7=256 8=1
+BatchNorm        conv4_2/dw/bn    1 1 conv4_2/dw conv4_2/dw_conv4_2/dw/bn 0=256
+Scale            conv4_2/dw/scale 1 1 conv4_2/dw_conv4_2/dw/bn conv4_2/dw_conv4_2/dw/scale 0=256 1=1
+ReLU             relu4_2/dw       1 1 conv4_2/dw_conv4_2/dw/scale conv4_2/dw_relu4_2/dw
+Convolution      conv4_2/sep      1 1 conv4_2/dw_relu4_2/dw conv4_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=131072 8=2
+BatchNorm        conv4_2/sep/bn   1 1 conv4_2/sep conv4_2/sep_conv4_2/sep/bn 0=512
+Scale            conv4_2/sep/scale 1 1 conv4_2/sep_conv4_2/sep/bn conv4_2/sep_conv4_2/sep/scale 0=512 1=1
+ReLU             relu4_2/sep      1 1 conv4_2/sep_conv4_2/sep/scale conv4_2/sep_relu4_2/sep
+ConvolutionDepthWise conv5_1/dw       1 1 conv4_2/sep_relu4_2/sep conv5_1/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_1/dw/bn    1 1 conv5_1/dw conv5_1/dw_conv5_1/dw/bn 0=512
+Scale            conv5_1/dw/scale 1 1 conv5_1/dw_conv5_1/dw/bn conv5_1/dw_conv5_1/dw/scale 0=512 1=1
+ReLU             relu5_1/dw       1 1 conv5_1/dw_conv5_1/dw/scale conv5_1/dw_relu5_1/dw
+Convolution      conv5_1/sep      1 1 conv5_1/dw_relu5_1/dw conv5_1/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        conv5_1/sep/bn   1 1 conv5_1/sep conv5_1/sep_conv5_1/sep/bn 0=512
+Scale            conv5_1/sep/scale 1 1 conv5_1/sep_conv5_1/sep/bn conv5_1/sep_conv5_1/sep/scale 0=512 1=1
+ReLU             relu5_1/sep      1 1 conv5_1/sep_conv5_1/sep/scale conv5_1/sep_relu5_1/sep
+ConvolutionDepthWise conv5_2/dw       1 1 conv5_1/sep_relu5_1/sep conv5_2/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_2/dw/bn    1 1 conv5_2/dw conv5_2/dw_conv5_2/dw/bn 0=512
+Scale            conv5_2/dw/scale 1 1 conv5_2/dw_conv5_2/dw/bn conv5_2/dw_conv5_2/dw/scale 0=512 1=1
+ReLU             relu5_2/dw       1 1 conv5_2/dw_conv5_2/dw/scale conv5_2/dw_relu5_2/dw
+Convolution      conv5_2/sep      1 1 conv5_2/dw_relu5_2/dw conv5_2/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        conv5_2/sep/bn   1 1 conv5_2/sep conv5_2/sep_conv5_2/sep/bn 0=512
+Scale            conv5_2/sep/scale 1 1 conv5_2/sep_conv5_2/sep/bn conv5_2/sep_conv5_2/sep/scale 0=512 1=1
+ReLU             relu5_2/sep      1 1 conv5_2/sep_conv5_2/sep/scale conv5_2/sep_relu5_2/sep
+ConvolutionDepthWise conv5_3/dw       1 1 conv5_2/sep_relu5_2/sep conv5_3/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_3/dw/bn    1 1 conv5_3/dw conv5_3/dw_conv5_3/dw/bn 0=512
+Scale            conv5_3/dw/scale 1 1 conv5_3/dw_conv5_3/dw/bn conv5_3/dw_conv5_3/dw/scale 0=512 1=1
+ReLU             relu5_3/dw       1 1 conv5_3/dw_conv5_3/dw/scale conv5_3/dw_relu5_3/dw
+Convolution      conv5_3/sep      1 1 conv5_3/dw_relu5_3/dw conv5_3/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        conv5_3/sep/bn   1 1 conv5_3/sep conv5_3/sep_conv5_3/sep/bn 0=512
+Scale            conv5_3/sep/scale 1 1 conv5_3/sep_conv5_3/sep/bn conv5_3/sep_conv5_3/sep/scale 0=512 1=1
+ReLU             relu5_3/sep      1 1 conv5_3/sep_conv5_3/sep/scale conv5_3/sep_relu5_3/sep
+ConvolutionDepthWise conv5_4/dw       1 1 conv5_3/sep_relu5_3/sep conv5_4/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_4/dw/bn    1 1 conv5_4/dw conv5_4/dw_conv5_4/dw/bn 0=512
+Scale            conv5_4/dw/scale 1 1 conv5_4/dw_conv5_4/dw/bn conv5_4/dw_conv5_4/dw/scale 0=512 1=1
+ReLU             relu5_4/dw       1 1 conv5_4/dw_conv5_4/dw/scale conv5_4/dw_relu5_4/dw
+Convolution      conv5_4/sep      1 1 conv5_4/dw_relu5_4/dw conv5_4/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        conv5_4/sep/bn   1 1 conv5_4/sep conv5_4/sep_conv5_4/sep/bn 0=512
+Scale            conv5_4/sep/scale 1 1 conv5_4/sep_conv5_4/sep/bn conv5_4/sep_conv5_4/sep/scale 0=512 1=1
+ReLU             relu5_4/sep      1 1 conv5_4/sep_conv5_4/sep/scale conv5_4/sep_relu5_4/sep
+ConvolutionDepthWise conv5_5/dw       1 1 conv5_4/sep_relu5_4/sep conv5_5/dw 0=512 1=3 2=1 3=1 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_5/dw/bn    1 1 conv5_5/dw conv5_5/dw_conv5_5/dw/bn 0=512
+Scale            conv5_5/dw/scale 1 1 conv5_5/dw_conv5_5/dw/bn conv5_5/dw_conv5_5/dw/scale 0=512 1=1
+ReLU             relu5_5/dw       1 1 conv5_5/dw_conv5_5/dw/scale conv5_5/dw_relu5_5/dw
+Convolution      conv5_5/sep      1 1 conv5_5/dw_relu5_5/dw conv5_5/sep 0=512 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        conv5_5/sep/bn   1 1 conv5_5/sep conv5_5/sep_conv5_5/sep/bn 0=512
+Scale            conv5_5/sep/scale 1 1 conv5_5/sep_conv5_5/sep/bn conv5_5/sep_conv5_5/sep/scale 0=512 1=1
+ReLU             relu5_5/sep      1 1 conv5_5/sep_conv5_5/sep/scale conv5_5/sep_relu5_5/sep
+ConvolutionDepthWise conv5_6/dw       1 1 conv5_5/sep_relu5_5/sep conv5_6/dw 0=512 1=3 2=1 3=2 4=1 5=0 6=4608 7=512 8=1
+BatchNorm        conv5_6/dw/bn    1 1 conv5_6/dw conv5_6/dw_conv5_6/dw/bn 0=512
+Scale            conv5_6/dw/scale 1 1 conv5_6/dw_conv5_6/dw/bn conv5_6/dw_conv5_6/dw/scale 0=512 1=1
+ReLU             relu5_6/dw       1 1 conv5_6/dw_conv5_6/dw/scale conv5_6/dw_relu5_6/dw
+Convolution      conv5_6/sep      1 1 conv5_6/dw_relu5_6/dw conv5_6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=524288 8=2
+BatchNorm        conv5_6/sep/bn   1 1 conv5_6/sep conv5_6/sep_conv5_6/sep/bn 0=1024
+Scale            conv5_6/sep/scale 1 1 conv5_6/sep_conv5_6/sep/bn conv5_6/sep_conv5_6/sep/scale 0=1024 1=1
+ReLU             relu5_6/sep      1 1 conv5_6/sep_conv5_6/sep/scale conv5_6/sep_relu5_6/sep
+ConvolutionDepthWise conv6/dw         1 1 conv5_6/sep_relu5_6/sep conv6/dw 0=1024 1=3 2=1 3=1 4=1 5=0 6=9216 7=1024 8=1
+BatchNorm        conv6/dw/bn      1 1 conv6/dw conv6/dw_conv6/dw/bn 0=1024
+Scale            conv6/dw/scale   1 1 conv6/dw_conv6/dw/bn conv6/dw_conv6/dw/scale 0=1024 1=1
+ReLU             relu6/dw         1 1 conv6/dw_conv6/dw/scale conv6/dw_relu6/dw
+Convolution      conv6/sep        1 1 conv6/dw_relu6/dw conv6/sep 0=1024 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        conv6/sep/bn     1 1 conv6/sep conv6/sep_conv6/sep/bn 0=1024
+Scale            conv6/sep/scale  1 1 conv6/sep_conv6/sep/bn conv6/sep_conv6/sep/scale 0=1024 1=1
+ReLU             relu6/sep        1 1 conv6/sep_conv6/sep/scale conv6/sep_relu6/sep
+Pooling          pool6            1 1 conv6/sep_relu6/sep pool6 0=1 1=0 2=1 3=0 4=1
+Convolution      fc7              1 1 pool6 fc7 0=1000 1=1 2=1 3=1 4=0 5=1 6=1024000 8=2
+Softmax          prob             1 1 fc7 prob 0=0
diff --git a/benchmark/mobilenet_ssd_int8.param b/benchmark/mobilenet_ssd_int8.param
new file mode 100755
index 000000000..287c49e26
--- /dev/null
+++ b/benchmark/mobilenet_ssd_int8.param
@@ -0,0 +1,129 @@
+7767517
+127 150
+Input            data             0 1 data 0=300 1=300 2=3
+Split            splitncnn_0      1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
+Convolution      conv0            1 1 data_splitncnn_6 conv0 0=32 1=3 2=1 3=2 4=1 5=1 6=864 8=2
+ReLU             conv0/relu       1 1 conv0 conv0_conv0/relu
+ConvolutionDepthWise conv1/dw         1 1 conv0_conv0/relu conv1/dw 0=32 1=3 2=1 3=1 4=1 5=1 6=288 7=32 8=1
+ReLU             conv1/dw/relu    1 1 conv1/dw conv1/dw_conv1/dw/relu
+Convolution      conv1            1 1 conv1/dw_conv1/dw/relu conv1 0=64 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
+ReLU             conv1/relu       1 1 conv1 conv1_conv1/relu
+ConvolutionDepthWise conv2/dw         1 1 conv1_conv1/relu conv2/dw 0=64 1=3 2=1 3=2 4=1 5=1 6=576 7=64 8=1
+ReLU             conv2/dw/relu    1 1 conv2/dw conv2/dw_conv2/dw/relu
+Convolution      conv2            1 1 conv2/dw_conv2/dw/relu conv2 0=128 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
+ReLU             conv2/relu       1 1 conv2 conv2_conv2/relu
+ConvolutionDepthWise conv3/dw         1 1 conv2_conv2/relu conv3/dw 0=128 1=3 2=1 3=1 4=1 5=1 6=1152 7=128 8=1
+ReLU             conv3/dw/relu    1 1 conv3/dw conv3/dw_conv3/dw/relu
+Convolution      conv3            1 1 conv3/dw_conv3/dw/relu conv3 0=128 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             conv3/relu       1 1 conv3 conv3_conv3/relu
+ConvolutionDepthWise conv4/dw         1 1 conv3_conv3/relu conv4/dw 0=128 1=3 2=1 3=2 4=1 5=1 6=1152 7=128 8=1
+ReLU             conv4/dw/relu    1 1 conv4/dw conv4/dw_conv4/dw/relu
+Convolution      conv4            1 1 conv4/dw_conv4/dw/relu conv4 0=256 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             conv4/relu       1 1 conv4 conv4_conv4/relu
+ConvolutionDepthWise conv5/dw         1 1 conv4_conv4/relu conv5/dw 0=256 1=3 2=1 3=1 4=1 5=1 6=2304 7=256 8=1
+ReLU             conv5/dw/relu    1 1 conv5/dw conv5/dw_conv5/dw/relu
+Convolution      conv5            1 1 conv5/dw_conv5/dw/relu conv5 0=256 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
+ReLU             conv5/relu       1 1 conv5 conv5_conv5/relu
+ConvolutionDepthWise conv6/dw         1 1 conv5_conv5/relu conv6/dw 0=256 1=3 2=1 3=2 4=1 5=1 6=2304 7=256 8=1
+ReLU             conv6/dw/relu    1 1 conv6/dw conv6/dw_conv6/dw/relu
+Convolution      conv6            1 1 conv6/dw_conv6/dw/relu conv6 0=512 1=1 2=1 3=1 4=0 5=1 6=131072 8=2
+ReLU             conv6/relu       1 1 conv6 conv6_conv6/relu
+ConvolutionDepthWise conv7/dw         1 1 conv6_conv6/relu conv7/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv7/dw/relu    1 1 conv7/dw conv7/dw_conv7/dw/relu
+Convolution      conv7            1 1 conv7/dw_conv7/dw/relu conv7 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv7/relu       1 1 conv7 conv7_conv7/relu
+ConvolutionDepthWise conv8/dw         1 1 conv7_conv7/relu conv8/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv8/dw/relu    1 1 conv8/dw conv8/dw_conv8/dw/relu
+Convolution      conv8            1 1 conv8/dw_conv8/dw/relu conv8 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv8/relu       1 1 conv8 conv8_conv8/relu
+ConvolutionDepthWise conv9/dw         1 1 conv8_conv8/relu conv9/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv9/dw/relu    1 1 conv9/dw conv9/dw_conv9/dw/relu
+Convolution      conv9            1 1 conv9/dw_conv9/dw/relu conv9 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv9/relu       1 1 conv9 conv9_conv9/relu
+ConvolutionDepthWise conv10/dw        1 1 conv9_conv9/relu conv10/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv10/dw/relu   1 1 conv10/dw conv10/dw_conv10/dw/relu
+Convolution      conv10           1 1 conv10/dw_conv10/dw/relu conv10 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv10/relu      1 1 conv10 conv10_conv10/relu
+ConvolutionDepthWise conv11/dw        1 1 conv10_conv10/relu conv11/dw 0=512 1=3 2=1 3=1 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv11/dw/relu   1 1 conv11/dw conv11/dw_conv11/dw/relu
+Convolution      conv11           1 1 conv11/dw_conv11/dw/relu conv11 0=512 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv11/relu      1 1 conv11 conv11_conv11/relu
+Split            splitncnn_1      1 4 conv11_conv11/relu conv11_conv11/relu_splitncnn_0 conv11_conv11/relu_splitncnn_1 conv11_conv11/relu_splitncnn_2 conv11_conv11/relu_splitncnn_3
+ConvolutionDepthWise conv12/dw        1 1 conv11_conv11/relu_splitncnn_3 conv12/dw 0=512 1=3 2=1 3=2 4=1 5=1 6=4608 7=512 8=1
+ReLU             conv12/dw/relu   1 1 conv12/dw conv12/dw_conv12/dw/relu
+Convolution      conv12           1 1 conv12/dw_conv12/dw/relu conv12 0=1024 1=1 2=1 3=1 4=0 5=1 6=524288 8=2
+ReLU             conv12/relu      1 1 conv12 conv12_conv12/relu
+ConvolutionDepthWise conv13/dw        1 1 conv12_conv12/relu conv13/dw 0=1024 1=3 2=1 3=1 4=1 5=1 6=9216 7=1024 8=1
+ReLU             conv13/dw/relu   1 1 conv13/dw conv13/dw_conv13/dw/relu
+Convolution      conv13           1 1 conv13/dw_conv13/dw/relu conv13 0=1024 1=1 2=1 3=1 4=0 5=1 6=1048576 8=2
+ReLU             conv13/relu      1 1 conv13 conv13_conv13/relu
+Split            splitncnn_2      1 4 conv13_conv13/relu conv13_conv13/relu_splitncnn_0 conv13_conv13/relu_splitncnn_1 conv13_conv13/relu_splitncnn_2 conv13_conv13/relu_splitncnn_3
+Convolution      conv14_1         1 1 conv13_conv13/relu_splitncnn_3 conv14_1 0=256 1=1 2=1 3=1 4=0 5=1 6=262144 8=2
+ReLU             conv14_1/relu    1 1 conv14_1 conv14_1_conv14_1/relu
+Convolution      conv14_2         1 1 conv14_1_conv14_1/relu conv14_2 0=512 1=3 2=1 3=2 4=1 5=1 6=1179648 8=2
+ReLU             conv14_2/relu    1 1 conv14_2 conv14_2_conv14_2/relu
+Split            splitncnn_3      1 4 conv14_2_conv14_2/relu conv14_2_conv14_2/relu_splitncnn_0 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_conv14_2/relu_splitncnn_3
+Convolution      conv15_1         1 1 conv14_2_conv14_2/relu_splitncnn_3 conv15_1 0=128 1=1 2=1 3=1 4=0 5=1 6=65536 8=2
+ReLU             conv15_1/relu    1 1 conv15_1 conv15_1_conv15_1/relu
+Convolution      conv15_2         1 1 conv15_1_conv15_1/relu conv15_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
+ReLU             conv15_2/relu    1 1 conv15_2 conv15_2_conv15_2/relu
+Split            splitncnn_4      1 4 conv15_2_conv15_2/relu conv15_2_conv15_2/relu_splitncnn_0 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_conv15_2/relu_splitncnn_3
+Convolution      conv16_1         1 1 conv15_2_conv15_2/relu_splitncnn_3 conv16_1 0=128 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             conv16_1/relu    1 1 conv16_1 conv16_1_conv16_1/relu
+Convolution      conv16_2         1 1 conv16_1_conv16_1/relu conv16_2 0=256 1=3 2=1 3=2 4=1 5=1 6=294912 8=2
+ReLU             conv16_2/relu    1 1 conv16_2 conv16_2_conv16_2/relu
+Split            splitncnn_5      1 4 conv16_2_conv16_2/relu conv16_2_conv16_2/relu_splitncnn_0 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_conv16_2/relu_splitncnn_3
+Convolution      conv17_1         1 1 conv16_2_conv16_2/relu_splitncnn_3 conv17_1 0=64 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             conv17_1/relu    1 1 conv17_1 conv17_1_conv17_1/relu
+Convolution      conv17_2         1 1 conv17_1_conv17_1/relu conv17_2 0=128 1=3 2=1 3=2 4=1 5=1 6=73728 8=2
+ReLU             conv17_2/relu    1 1 conv17_2 conv17_2_conv17_2/relu
+Split            splitncnn_6      1 3 conv17_2_conv17_2/relu conv17_2_conv17_2/relu_splitncnn_0 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_conv17_2/relu_splitncnn_2
+Convolution      conv11_mbox_loc  1 1 conv11_conv11/relu_splitncnn_2 conv11_mbox_loc 0=12 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
+Permute          conv11_mbox_loc_perm 1 1 conv11_mbox_loc conv11_mbox_loc_perm 0=3
+Flatten          conv11_mbox_loc_flat 1 1 conv11_mbox_loc_perm conv11_mbox_loc_flat
+Convolution      conv11_mbox_conf 1 1 conv11_conv11/relu_splitncnn_1 conv11_mbox_conf 0=63 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
+Permute          conv11_mbox_conf_perm 1 1 conv11_mbox_conf conv11_mbox_conf_perm 0=3
+Flatten          conv11_mbox_conf_flat 1 1 conv11_mbox_conf_perm conv11_mbox_conf_flat
+PriorBox         conv11_mbox_priorbox 2 1 conv11_conv11/relu_splitncnn_0 data_splitncnn_5 conv11_mbox_priorbox -23300=1,60.000000 -23301=0 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Convolution      conv13_mbox_loc  1 1 conv13_conv13/relu_splitncnn_2 conv13_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
+Permute          conv13_mbox_loc_perm 1 1 conv13_mbox_loc conv13_mbox_loc_perm 0=3
+Flatten          conv13_mbox_loc_flat 1 1 conv13_mbox_loc_perm conv13_mbox_loc_flat
+Convolution      conv13_mbox_conf 1 1 conv13_conv13/relu_splitncnn_1 conv13_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=129024 8=2
+Permute          conv13_mbox_conf_perm 1 1 conv13_mbox_conf conv13_mbox_conf_perm 0=3
+Flatten          conv13_mbox_conf_flat 1 1 conv13_mbox_conf_perm conv13_mbox_conf_flat
+PriorBox         conv13_mbox_priorbox 2 1 conv13_conv13/relu_splitncnn_0 data_splitncnn_4 conv13_mbox_priorbox -23300=1,105.000000 -23301=1,150.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Convolution      conv14_2_mbox_loc 1 1 conv14_2_conv14_2/relu_splitncnn_2 conv14_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+Permute          conv14_2_mbox_loc_perm 1 1 conv14_2_mbox_loc conv14_2_mbox_loc_perm 0=3
+Flatten          conv14_2_mbox_loc_flat 1 1 conv14_2_mbox_loc_perm conv14_2_mbox_loc_flat
+Convolution      conv14_2_mbox_conf 1 1 conv14_2_conv14_2/relu_splitncnn_1 conv14_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=64512 8=2
+Permute          conv14_2_mbox_conf_perm 1 1 conv14_2_mbox_conf conv14_2_mbox_conf_perm 0=3
+Flatten          conv14_2_mbox_conf_flat 1 1 conv14_2_mbox_conf_perm conv14_2_mbox_conf_flat
+PriorBox         conv14_2_mbox_priorbox 2 1 conv14_2_conv14_2/relu_splitncnn_0 data_splitncnn_3 conv14_2_mbox_priorbox -23300=1,150.000000 -23301=1,195.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Convolution      conv15_2_mbox_loc 1 1 conv15_2_conv15_2/relu_splitncnn_2 conv15_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
+Permute          conv15_2_mbox_loc_perm 1 1 conv15_2_mbox_loc conv15_2_mbox_loc_perm 0=3
+Flatten          conv15_2_mbox_loc_flat 1 1 conv15_2_mbox_loc_perm conv15_2_mbox_loc_flat
+Convolution      conv15_2_mbox_conf 1 1 conv15_2_conv15_2/relu_splitncnn_1 conv15_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
+Permute          conv15_2_mbox_conf_perm 1 1 conv15_2_mbox_conf conv15_2_mbox_conf_perm 0=3
+Flatten          conv15_2_mbox_conf_flat 1 1 conv15_2_mbox_conf_perm conv15_2_mbox_conf_flat
+PriorBox         conv15_2_mbox_priorbox 2 1 conv15_2_conv15_2/relu_splitncnn_0 data_splitncnn_2 conv15_2_mbox_priorbox -23300=1,195.000000 -23301=1,240.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Convolution      conv16_2_mbox_loc 1 1 conv16_2_conv16_2/relu_splitncnn_2 conv16_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=6144 8=2
+Permute          conv16_2_mbox_loc_perm 1 1 conv16_2_mbox_loc conv16_2_mbox_loc_perm 0=3
+Flatten          conv16_2_mbox_loc_flat 1 1 conv16_2_mbox_loc_perm conv16_2_mbox_loc_flat
+Convolution      conv16_2_mbox_conf 1 1 conv16_2_conv16_2/relu_splitncnn_1 conv16_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=32256 8=2
+Permute          conv16_2_mbox_conf_perm 1 1 conv16_2_mbox_conf conv16_2_mbox_conf_perm 0=3
+Flatten          conv16_2_mbox_conf_flat 1 1 conv16_2_mbox_conf_perm conv16_2_mbox_conf_flat
+PriorBox         conv16_2_mbox_priorbox 2 1 conv16_2_conv16_2/relu_splitncnn_0 data_splitncnn_1 conv16_2_mbox_priorbox -23300=1,240.000000 -23301=1,285.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Convolution      conv17_2_mbox_loc 1 1 conv17_2_conv17_2/relu_splitncnn_2 conv17_2_mbox_loc 0=24 1=1 2=1 3=1 4=0 5=1 6=3072 8=2
+Permute          conv17_2_mbox_loc_perm 1 1 conv17_2_mbox_loc conv17_2_mbox_loc_perm 0=3
+Flatten          conv17_2_mbox_loc_flat 1 1 conv17_2_mbox_loc_perm conv17_2_mbox_loc_flat
+Convolution      conv17_2_mbox_conf 1 1 conv17_2_conv17_2/relu_splitncnn_1 conv17_2_mbox_conf 0=126 1=1 2=1 3=1 4=0 5=1 6=16128 8=2
+Permute          conv17_2_mbox_conf_perm 1 1 conv17_2_mbox_conf conv17_2_mbox_conf_perm 0=3
+Flatten          conv17_2_mbox_conf_flat 1 1 conv17_2_mbox_conf_perm conv17_2_mbox_conf_flat
+PriorBox         conv17_2_mbox_priorbox 2 1 conv17_2_conv17_2/relu_splitncnn_0 data_splitncnn_0 conv17_2_mbox_priorbox -23300=1,285.000000 -23301=1,300.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=-233.000000 12=-233.000000 13=0.500000
+Concat           mbox_loc         6 1 conv11_mbox_loc_flat conv13_mbox_loc_flat conv14_2_mbox_loc_flat conv15_2_mbox_loc_flat conv16_2_mbox_loc_flat conv17_2_mbox_loc_flat mbox_loc 0=0
+Concat           mbox_conf        6 1 conv11_mbox_conf_flat conv13_mbox_conf_flat conv14_2_mbox_conf_flat conv15_2_mbox_conf_flat conv16_2_mbox_conf_flat conv17_2_mbox_conf_flat mbox_conf 0=0
+Concat           mbox_priorbox    6 1 conv11_mbox_priorbox conv13_mbox_priorbox conv14_2_mbox_priorbox conv15_2_mbox_priorbox conv16_2_mbox_priorbox conv17_2_mbox_priorbox mbox_priorbox 0=1
+Reshape          mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
+Softmax          mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
+Flatten          mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
+DetectionOutput  detection_out    3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.250000
diff --git a/benchmark/resnet18_int8.param b/benchmark/resnet18_int8.param
new file mode 100755
index 000000000..cd2be6233
--- /dev/null
+++ b/benchmark/resnet18_int8.param
@@ -0,0 +1,103 @@
+7767517
+101 109
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=0 6=9408 8=2
+BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
+Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
+ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
+Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
+Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
+BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=64
+Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=64 1=1
+Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
+Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
+ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
+Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
+Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
+Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2b_scale2a_branch2b res2a 0=1 -23301=0
+ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
+Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
+Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
+Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
+ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
+Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
+Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
+Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2b_scale2b_branch2b res2b 0=1 -23301=0
+ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
+Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
+Convolution      res3a_branch1    1 1 res2b_res2b_relu_splitncnn_1 res3a_branch1 0=128 1=1 2=1 3=2 4=0 5=0 6=8192 8=2
+BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=128
+Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=128 1=1
+Convolution      res3a_branch2a   1 1 res2b_res2b_relu_splitncnn_0 res3a_branch2a 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
+BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
+Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
+ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
+Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
+Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
+Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2b_scale3a_branch2b res3a 0=1 -23301=0
+ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
+Split            splitncnn_3      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
+Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
+Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
+ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
+Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
+Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
+Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2b_scale3b_branch2b res3b 0=1 -23301=0
+ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
+Split            splitncnn_4      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
+Convolution      res4a_branch1    1 1 res3b_res3b_relu_splitncnn_1 res4a_branch1 0=256 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
+BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=256
+Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=256 1=1
+Convolution      res4a_branch2a   1 1 res3b_res3b_relu_splitncnn_0 res4a_branch2a 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
+BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
+Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
+ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
+Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
+Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
+Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2b_scale4a_branch2b res4a 0=1 -23301=0
+ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
+Split            splitncnn_5      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
+Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
+Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
+ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
+Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
+Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
+Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2b_scale4b_branch2b res4b 0=1 -23301=0
+ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
+Split            splitncnn_6      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
+Convolution      res5a_branch1    1 1 res4b_res4b_relu_splitncnn_1 res5a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
+BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=512
+Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=512 1=1
+Convolution      res5a_branch2a   1 1 res4b_res4b_relu_splitncnn_0 res5a_branch2a 0=512 1=3 2=1 3=2 4=1 5=0 6=1179648 8=2
+BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
+Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
+ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
+Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
+Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
+Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2b_scale5a_branch2b res5a 0=1 -23301=0
+ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
+Split            splitncnn_7      1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
+Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
+Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
+ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
+Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
+Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
+Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2b_scale5b_branch2b res5b 0=1 -23301=0
+ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
+Pooling          pool5            1 1 res5b_res5b_relu pool5 0=1 1=7 2=1 3=0 4=0
+InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=512000
+Softmax          prob             1 1 fc1000 prob 0=0
diff --git a/benchmark/resnet50.param b/benchmark/resnet50.param
new file mode 100755
index 000000000..f9df9a81c
--- /dev/null
+++ b/benchmark/resnet50.param
@@ -0,0 +1,247 @@
+7767517
+245 261
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408
+BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
+Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
+ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
+Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
+Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
+Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
+Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096
+BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
+Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
+ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
+Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
+BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
+Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
+ReLU             res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
+Convolution      res2a_branch2c   1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2a_branch2c    1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
+Scale            scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
+Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
+ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
+Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
+Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
+Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
+ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
+Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
+BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
+Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
+ReLU             res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
+Convolution      res2b_branch2c   1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2b_branch2c    1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
+Scale            scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
+Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
+ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
+Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
+Convolution      res2c_branch2a   1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2c_branch2a    1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
+Scale            scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
+ReLU             res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
+Convolution      res2c_branch2b   1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864
+BatchNorm        bn2c_branch2b    1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
+Scale            scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
+ReLU             res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
+Convolution      res2c_branch2c   1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384
+BatchNorm        bn2c_branch2c    1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
+Scale            scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
+Eltwise          res2c            2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
+ReLU             res2c_relu       1 1 res2c res2c_res2c_relu
+Split            splitncnn_3      1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
+Convolution      res3a_branch1    1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072
+BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
+Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
+Convolution      res3a_branch2a   1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768
+BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
+Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
+ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
+Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
+BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
+Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
+ReLU             res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
+Convolution      res3a_branch2c   1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3a_branch2c    1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
+Scale            scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
+Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
+ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
+Split            splitncnn_4      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
+Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
+Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
+ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
+Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
+BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
+Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
+ReLU             res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
+Convolution      res3b_branch2c   1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3b_branch2c    1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
+Scale            scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
+Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
+ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
+Split            splitncnn_5      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
+Convolution      res3c_branch2a   1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3c_branch2a    1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
+Scale            scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
+ReLU             res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
+Convolution      res3c_branch2b   1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
+BatchNorm        bn3c_branch2b    1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
+Scale            scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
+ReLU             res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
+Convolution      res3c_branch2c   1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3c_branch2c    1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
+Scale            scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
+Eltwise          res3c            2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
+ReLU             res3c_relu       1 1 res3c res3c_res3c_relu
+Split            splitncnn_6      1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
+Convolution      res3d_branch2a   1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3d_branch2a    1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
+Scale            scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
+ReLU             res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
+Convolution      res3d_branch2b   1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456
+BatchNorm        bn3d_branch2b    1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
+Scale            scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
+ReLU             res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
+Convolution      res3d_branch2c   1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536
+BatchNorm        bn3d_branch2c    1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
+Scale            scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
+Eltwise          res3d            2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
+ReLU             res3d_relu       1 1 res3d res3d_res3d_relu
+Split            splitncnn_7      1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
+Convolution      res4a_branch1    1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288
+BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
+Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
+Convolution      res4a_branch2a   1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072
+BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
+Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
+ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
+Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
+Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
+ReLU             res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
+Convolution      res4a_branch2c   1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4a_branch2c    1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
+Scale            scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
+Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
+ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
+Split            splitncnn_8      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
+Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
+Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
+ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
+Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
+Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
+ReLU             res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
+Convolution      res4b_branch2c   1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4b_branch2c    1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
+Scale            scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
+Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
+ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
+Split            splitncnn_9      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
+Convolution      res4c_branch2a   1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4c_branch2a    1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
+Scale            scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
+ReLU             res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
+Convolution      res4c_branch2b   1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4c_branch2b    1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
+Scale            scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
+ReLU             res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
+Convolution      res4c_branch2c   1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4c_branch2c    1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
+Scale            scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
+Eltwise          res4c            2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
+ReLU             res4c_relu       1 1 res4c res4c_res4c_relu
+Split            splitncnn_10     1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
+Convolution      res4d_branch2a   1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4d_branch2a    1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
+Scale            scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
+ReLU             res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
+Convolution      res4d_branch2b   1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4d_branch2b    1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
+Scale            scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
+ReLU             res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
+Convolution      res4d_branch2c   1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4d_branch2c    1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
+Scale            scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
+Eltwise          res4d            2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
+ReLU             res4d_relu       1 1 res4d res4d_res4d_relu
+Split            splitncnn_11     1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
+Convolution      res4e_branch2a   1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4e_branch2a    1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
+Scale            scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
+ReLU             res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
+Convolution      res4e_branch2b   1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4e_branch2b    1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
+Scale            scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
+ReLU             res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
+Convolution      res4e_branch2c   1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4e_branch2c    1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
+Scale            scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
+Eltwise          res4e            2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
+ReLU             res4e_relu       1 1 res4e res4e_res4e_relu
+Split            splitncnn_12     1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
+Convolution      res4f_branch2a   1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4f_branch2a    1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
+Scale            scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
+ReLU             res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
+Convolution      res4f_branch2b   1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824
+BatchNorm        bn4f_branch2b    1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
+Scale            scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
+ReLU             res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
+Convolution      res4f_branch2c   1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144
+BatchNorm        bn4f_branch2c    1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
+Scale            scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
+Eltwise          res4f            2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
+ReLU             res4f_relu       1 1 res4f res4f_res4f_relu
+Split            splitncnn_13     1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
+Convolution      res5a_branch1    1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152
+BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
+Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
+Convolution      res5a_branch2a   1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288
+BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
+Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
+ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
+Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
+BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
+Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
+ReLU             res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
+Convolution      res5a_branch2c   1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
+BatchNorm        bn5a_branch2c    1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
+Scale            scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
+Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
+ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
+Split            splitncnn_14     1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
+Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
+BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
+Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
+ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
+Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
+BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
+Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
+ReLU             res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
+Convolution      res5b_branch2c   1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
+BatchNorm        bn5b_branch2c    1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
+Scale            scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
+Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
+ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
+Split            splitncnn_15     1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
+Convolution      res5c_branch2a   1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576
+BatchNorm        bn5c_branch2a    1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
+Scale            scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
+ReLU             res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
+Convolution      res5c_branch2b   1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296
+BatchNorm        bn5c_branch2b    1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
+Scale            scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
+ReLU             res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
+Convolution      res5c_branch2c   1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576
+BatchNorm        bn5c_branch2c    1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
+Scale            scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
+Eltwise          res5c            2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
+ReLU             res5c_relu       1 1 res5c res5c_res5c_relu
+Pooling          pool5            1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
+InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=2048000
+Softmax          prob             1 1 fc1000 prob 0=0
diff --git a/benchmark/resnet50_int8.param b/benchmark/resnet50_int8.param
new file mode 100755
index 000000000..c8e6c00e5
--- /dev/null
+++ b/benchmark/resnet50_int8.param
@@ -0,0 +1,247 @@
+7767517
+245 261
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=7 2=1 3=2 4=3 5=1 6=9408 8=2
+BatchNorm        bn_conv1         1 1 conv1 conv1_bn_conv1 0=64
+Scale            scale_conv1      1 1 conv1_bn_conv1 conv1_scale_conv1 0=64 1=1
+ReLU             conv1_relu       1 1 conv1_scale_conv1 conv1_conv1_relu
+Pooling          pool1            1 1 conv1_conv1_relu pool1 0=0 1=3 2=2 3=0 4=0
+Split            splitncnn_0      1 2 pool1 pool1_splitncnn_0 pool1_splitncnn_1
+Convolution      res2a_branch1    1 1 pool1_splitncnn_1 res2a_branch1 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2a_branch1     1 1 res2a_branch1 res2a_branch1_bn2a_branch1 0=256
+Scale            scale2a_branch1  1 1 res2a_branch1_bn2a_branch1 res2a_branch1_scale2a_branch1 0=256 1=1
+Convolution      res2a_branch2a   1 1 pool1_splitncnn_0 res2a_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=4096 8=2
+BatchNorm        bn2a_branch2a    1 1 res2a_branch2a res2a_branch2a_bn2a_branch2a 0=64
+Scale            scale2a_branch2a 1 1 res2a_branch2a_bn2a_branch2a res2a_branch2a_scale2a_branch2a 0=64 1=1
+ReLU             res2a_branch2a_relu 1 1 res2a_branch2a_scale2a_branch2a res2a_branch2a_res2a_branch2a_relu
+Convolution      res2a_branch2b   1 1 res2a_branch2a_res2a_branch2a_relu res2a_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2a_branch2b    1 1 res2a_branch2b res2a_branch2b_bn2a_branch2b 0=64
+Scale            scale2a_branch2b 1 1 res2a_branch2b_bn2a_branch2b res2a_branch2b_scale2a_branch2b 0=64 1=1
+ReLU             res2a_branch2b_relu 1 1 res2a_branch2b_scale2a_branch2b res2a_branch2b_res2a_branch2b_relu
+Convolution      res2a_branch2c   1 1 res2a_branch2b_res2a_branch2b_relu res2a_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2a_branch2c    1 1 res2a_branch2c res2a_branch2c_bn2a_branch2c 0=256
+Scale            scale2a_branch2c 1 1 res2a_branch2c_bn2a_branch2c res2a_branch2c_scale2a_branch2c 0=256 1=1
+Eltwise          res2a            2 1 res2a_branch1_scale2a_branch1 res2a_branch2c_scale2a_branch2c res2a 0=1 -23301=0
+ReLU             res2a_relu       1 1 res2a res2a_res2a_relu
+Split            splitncnn_1      1 2 res2a_res2a_relu res2a_res2a_relu_splitncnn_0 res2a_res2a_relu_splitncnn_1
+Convolution      res2b_branch2a   1 1 res2a_res2a_relu_splitncnn_1 res2b_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2b_branch2a    1 1 res2b_branch2a res2b_branch2a_bn2b_branch2a 0=64
+Scale            scale2b_branch2a 1 1 res2b_branch2a_bn2b_branch2a res2b_branch2a_scale2b_branch2a 0=64 1=1
+ReLU             res2b_branch2a_relu 1 1 res2b_branch2a_scale2b_branch2a res2b_branch2a_res2b_branch2a_relu
+Convolution      res2b_branch2b   1 1 res2b_branch2a_res2b_branch2a_relu res2b_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2b_branch2b    1 1 res2b_branch2b res2b_branch2b_bn2b_branch2b 0=64
+Scale            scale2b_branch2b 1 1 res2b_branch2b_bn2b_branch2b res2b_branch2b_scale2b_branch2b 0=64 1=1
+ReLU             res2b_branch2b_relu 1 1 res2b_branch2b_scale2b_branch2b res2b_branch2b_res2b_branch2b_relu
+Convolution      res2b_branch2c   1 1 res2b_branch2b_res2b_branch2b_relu res2b_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2b_branch2c    1 1 res2b_branch2c res2b_branch2c_bn2b_branch2c 0=256
+Scale            scale2b_branch2c 1 1 res2b_branch2c_bn2b_branch2c res2b_branch2c_scale2b_branch2c 0=256 1=1
+Eltwise          res2b            2 1 res2a_res2a_relu_splitncnn_0 res2b_branch2c_scale2b_branch2c res2b 0=1 -23301=0
+ReLU             res2b_relu       1 1 res2b res2b_res2b_relu
+Split            splitncnn_2      1 2 res2b_res2b_relu res2b_res2b_relu_splitncnn_0 res2b_res2b_relu_splitncnn_1
+Convolution      res2c_branch2a   1 1 res2b_res2b_relu_splitncnn_1 res2c_branch2a 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2c_branch2a    1 1 res2c_branch2a res2c_branch2a_bn2c_branch2a 0=64
+Scale            scale2c_branch2a 1 1 res2c_branch2a_bn2c_branch2a res2c_branch2a_scale2c_branch2a 0=64 1=1
+ReLU             res2c_branch2a_relu 1 1 res2c_branch2a_scale2c_branch2a res2c_branch2a_res2c_branch2a_relu
+Convolution      res2c_branch2b   1 1 res2c_branch2a_res2c_branch2a_relu res2c_branch2b 0=64 1=3 2=1 3=1 4=1 5=0 6=36864 8=2
+BatchNorm        bn2c_branch2b    1 1 res2c_branch2b res2c_branch2b_bn2c_branch2b 0=64
+Scale            scale2c_branch2b 1 1 res2c_branch2b_bn2c_branch2b res2c_branch2b_scale2c_branch2b 0=64 1=1
+ReLU             res2c_branch2b_relu 1 1 res2c_branch2b_scale2c_branch2b res2c_branch2b_res2c_branch2b_relu
+Convolution      res2c_branch2c   1 1 res2c_branch2b_res2c_branch2b_relu res2c_branch2c 0=256 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        bn2c_branch2c    1 1 res2c_branch2c res2c_branch2c_bn2c_branch2c 0=256
+Scale            scale2c_branch2c 1 1 res2c_branch2c_bn2c_branch2c res2c_branch2c_scale2c_branch2c 0=256 1=1
+Eltwise          res2c            2 1 res2b_res2b_relu_splitncnn_0 res2c_branch2c_scale2c_branch2c res2c 0=1 -23301=0
+ReLU             res2c_relu       1 1 res2c res2c_res2c_relu
+Split            splitncnn_3      1 2 res2c_res2c_relu res2c_res2c_relu_splitncnn_0 res2c_res2c_relu_splitncnn_1
+Convolution      res3a_branch1    1 1 res2c_res2c_relu_splitncnn_1 res3a_branch1 0=512 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
+BatchNorm        bn3a_branch1     1 1 res3a_branch1 res3a_branch1_bn3a_branch1 0=512
+Scale            scale3a_branch1  1 1 res3a_branch1_bn3a_branch1 res3a_branch1_scale3a_branch1 0=512 1=1
+Convolution      res3a_branch2a   1 1 res2c_res2c_relu_splitncnn_0 res3a_branch2a 0=128 1=1 2=1 3=2 4=0 5=0 6=32768 8=2
+BatchNorm        bn3a_branch2a    1 1 res3a_branch2a res3a_branch2a_bn3a_branch2a 0=128
+Scale            scale3a_branch2a 1 1 res3a_branch2a_bn3a_branch2a res3a_branch2a_scale3a_branch2a 0=128 1=1
+ReLU             res3a_branch2a_relu 1 1 res3a_branch2a_scale3a_branch2a res3a_branch2a_res3a_branch2a_relu
+Convolution      res3a_branch2b   1 1 res3a_branch2a_res3a_branch2a_relu res3a_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3a_branch2b    1 1 res3a_branch2b res3a_branch2b_bn3a_branch2b 0=128
+Scale            scale3a_branch2b 1 1 res3a_branch2b_bn3a_branch2b res3a_branch2b_scale3a_branch2b 0=128 1=1
+ReLU             res3a_branch2b_relu 1 1 res3a_branch2b_scale3a_branch2b res3a_branch2b_res3a_branch2b_relu
+Convolution      res3a_branch2c   1 1 res3a_branch2b_res3a_branch2b_relu res3a_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3a_branch2c    1 1 res3a_branch2c res3a_branch2c_bn3a_branch2c 0=512
+Scale            scale3a_branch2c 1 1 res3a_branch2c_bn3a_branch2c res3a_branch2c_scale3a_branch2c 0=512 1=1
+Eltwise          res3a            2 1 res3a_branch1_scale3a_branch1 res3a_branch2c_scale3a_branch2c res3a 0=1 -23301=0
+ReLU             res3a_relu       1 1 res3a res3a_res3a_relu
+Split            splitncnn_4      1 2 res3a_res3a_relu res3a_res3a_relu_splitncnn_0 res3a_res3a_relu_splitncnn_1
+Convolution      res3b_branch2a   1 1 res3a_res3a_relu_splitncnn_1 res3b_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3b_branch2a    1 1 res3b_branch2a res3b_branch2a_bn3b_branch2a 0=128
+Scale            scale3b_branch2a 1 1 res3b_branch2a_bn3b_branch2a res3b_branch2a_scale3b_branch2a 0=128 1=1
+ReLU             res3b_branch2a_relu 1 1 res3b_branch2a_scale3b_branch2a res3b_branch2a_res3b_branch2a_relu
+Convolution      res3b_branch2b   1 1 res3b_branch2a_res3b_branch2a_relu res3b_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3b_branch2b    1 1 res3b_branch2b res3b_branch2b_bn3b_branch2b 0=128
+Scale            scale3b_branch2b 1 1 res3b_branch2b_bn3b_branch2b res3b_branch2b_scale3b_branch2b 0=128 1=1
+ReLU             res3b_branch2b_relu 1 1 res3b_branch2b_scale3b_branch2b res3b_branch2b_res3b_branch2b_relu
+Convolution      res3b_branch2c   1 1 res3b_branch2b_res3b_branch2b_relu res3b_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3b_branch2c    1 1 res3b_branch2c res3b_branch2c_bn3b_branch2c 0=512
+Scale            scale3b_branch2c 1 1 res3b_branch2c_bn3b_branch2c res3b_branch2c_scale3b_branch2c 0=512 1=1
+Eltwise          res3b            2 1 res3a_res3a_relu_splitncnn_0 res3b_branch2c_scale3b_branch2c res3b 0=1 -23301=0
+ReLU             res3b_relu       1 1 res3b res3b_res3b_relu
+Split            splitncnn_5      1 2 res3b_res3b_relu res3b_res3b_relu_splitncnn_0 res3b_res3b_relu_splitncnn_1
+Convolution      res3c_branch2a   1 1 res3b_res3b_relu_splitncnn_1 res3c_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3c_branch2a    1 1 res3c_branch2a res3c_branch2a_bn3c_branch2a 0=128
+Scale            scale3c_branch2a 1 1 res3c_branch2a_bn3c_branch2a res3c_branch2a_scale3c_branch2a 0=128 1=1
+ReLU             res3c_branch2a_relu 1 1 res3c_branch2a_scale3c_branch2a res3c_branch2a_res3c_branch2a_relu
+Convolution      res3c_branch2b   1 1 res3c_branch2a_res3c_branch2a_relu res3c_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3c_branch2b    1 1 res3c_branch2b res3c_branch2b_bn3c_branch2b 0=128
+Scale            scale3c_branch2b 1 1 res3c_branch2b_bn3c_branch2b res3c_branch2b_scale3c_branch2b 0=128 1=1
+ReLU             res3c_branch2b_relu 1 1 res3c_branch2b_scale3c_branch2b res3c_branch2b_res3c_branch2b_relu
+Convolution      res3c_branch2c   1 1 res3c_branch2b_res3c_branch2b_relu res3c_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3c_branch2c    1 1 res3c_branch2c res3c_branch2c_bn3c_branch2c 0=512
+Scale            scale3c_branch2c 1 1 res3c_branch2c_bn3c_branch2c res3c_branch2c_scale3c_branch2c 0=512 1=1
+Eltwise          res3c            2 1 res3b_res3b_relu_splitncnn_0 res3c_branch2c_scale3c_branch2c res3c 0=1 -23301=0
+ReLU             res3c_relu       1 1 res3c res3c_res3c_relu
+Split            splitncnn_6      1 2 res3c_res3c_relu res3c_res3c_relu_splitncnn_0 res3c_res3c_relu_splitncnn_1
+Convolution      res3d_branch2a   1 1 res3c_res3c_relu_splitncnn_1 res3d_branch2a 0=128 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3d_branch2a    1 1 res3d_branch2a res3d_branch2a_bn3d_branch2a 0=128
+Scale            scale3d_branch2a 1 1 res3d_branch2a_bn3d_branch2a res3d_branch2a_scale3d_branch2a 0=128 1=1
+ReLU             res3d_branch2a_relu 1 1 res3d_branch2a_scale3d_branch2a res3d_branch2a_res3d_branch2a_relu
+Convolution      res3d_branch2b   1 1 res3d_branch2a_res3d_branch2a_relu res3d_branch2b 0=128 1=3 2=1 3=1 4=1 5=0 6=147456 8=2
+BatchNorm        bn3d_branch2b    1 1 res3d_branch2b res3d_branch2b_bn3d_branch2b 0=128
+Scale            scale3d_branch2b 1 1 res3d_branch2b_bn3d_branch2b res3d_branch2b_scale3d_branch2b 0=128 1=1
+ReLU             res3d_branch2b_relu 1 1 res3d_branch2b_scale3d_branch2b res3d_branch2b_res3d_branch2b_relu
+Convolution      res3d_branch2c   1 1 res3d_branch2b_res3d_branch2b_relu res3d_branch2c 0=512 1=1 2=1 3=1 4=0 5=0 6=65536 8=2
+BatchNorm        bn3d_branch2c    1 1 res3d_branch2c res3d_branch2c_bn3d_branch2c 0=512
+Scale            scale3d_branch2c 1 1 res3d_branch2c_bn3d_branch2c res3d_branch2c_scale3d_branch2c 0=512 1=1
+Eltwise          res3d            2 1 res3c_res3c_relu_splitncnn_0 res3d_branch2c_scale3d_branch2c res3d 0=1 -23301=0
+ReLU             res3d_relu       1 1 res3d res3d_res3d_relu
+Split            splitncnn_7      1 2 res3d_res3d_relu res3d_res3d_relu_splitncnn_0 res3d_res3d_relu_splitncnn_1
+Convolution      res4a_branch1    1 1 res3d_res3d_relu_splitncnn_1 res4a_branch1 0=1024 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
+BatchNorm        bn4a_branch1     1 1 res4a_branch1 res4a_branch1_bn4a_branch1 0=1024
+Scale            scale4a_branch1  1 1 res4a_branch1_bn4a_branch1 res4a_branch1_scale4a_branch1 0=1024 1=1
+Convolution      res4a_branch2a   1 1 res3d_res3d_relu_splitncnn_0 res4a_branch2a 0=256 1=1 2=1 3=2 4=0 5=0 6=131072 8=2
+BatchNorm        bn4a_branch2a    1 1 res4a_branch2a res4a_branch2a_bn4a_branch2a 0=256
+Scale            scale4a_branch2a 1 1 res4a_branch2a_bn4a_branch2a res4a_branch2a_scale4a_branch2a 0=256 1=1
+ReLU             res4a_branch2a_relu 1 1 res4a_branch2a_scale4a_branch2a res4a_branch2a_res4a_branch2a_relu
+Convolution      res4a_branch2b   1 1 res4a_branch2a_res4a_branch2a_relu res4a_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4a_branch2b    1 1 res4a_branch2b res4a_branch2b_bn4a_branch2b 0=256
+Scale            scale4a_branch2b 1 1 res4a_branch2b_bn4a_branch2b res4a_branch2b_scale4a_branch2b 0=256 1=1
+ReLU             res4a_branch2b_relu 1 1 res4a_branch2b_scale4a_branch2b res4a_branch2b_res4a_branch2b_relu
+Convolution      res4a_branch2c   1 1 res4a_branch2b_res4a_branch2b_relu res4a_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4a_branch2c    1 1 res4a_branch2c res4a_branch2c_bn4a_branch2c 0=1024
+Scale            scale4a_branch2c 1 1 res4a_branch2c_bn4a_branch2c res4a_branch2c_scale4a_branch2c 0=1024 1=1
+Eltwise          res4a            2 1 res4a_branch1_scale4a_branch1 res4a_branch2c_scale4a_branch2c res4a 0=1 -23301=0
+ReLU             res4a_relu       1 1 res4a res4a_res4a_relu
+Split            splitncnn_8      1 2 res4a_res4a_relu res4a_res4a_relu_splitncnn_0 res4a_res4a_relu_splitncnn_1
+Convolution      res4b_branch2a   1 1 res4a_res4a_relu_splitncnn_1 res4b_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4b_branch2a    1 1 res4b_branch2a res4b_branch2a_bn4b_branch2a 0=256
+Scale            scale4b_branch2a 1 1 res4b_branch2a_bn4b_branch2a res4b_branch2a_scale4b_branch2a 0=256 1=1
+ReLU             res4b_branch2a_relu 1 1 res4b_branch2a_scale4b_branch2a res4b_branch2a_res4b_branch2a_relu
+Convolution      res4b_branch2b   1 1 res4b_branch2a_res4b_branch2a_relu res4b_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4b_branch2b    1 1 res4b_branch2b res4b_branch2b_bn4b_branch2b 0=256
+Scale            scale4b_branch2b 1 1 res4b_branch2b_bn4b_branch2b res4b_branch2b_scale4b_branch2b 0=256 1=1
+ReLU             res4b_branch2b_relu 1 1 res4b_branch2b_scale4b_branch2b res4b_branch2b_res4b_branch2b_relu
+Convolution      res4b_branch2c   1 1 res4b_branch2b_res4b_branch2b_relu res4b_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4b_branch2c    1 1 res4b_branch2c res4b_branch2c_bn4b_branch2c 0=1024
+Scale            scale4b_branch2c 1 1 res4b_branch2c_bn4b_branch2c res4b_branch2c_scale4b_branch2c 0=1024 1=1
+Eltwise          res4b            2 1 res4a_res4a_relu_splitncnn_0 res4b_branch2c_scale4b_branch2c res4b 0=1 -23301=0
+ReLU             res4b_relu       1 1 res4b res4b_res4b_relu
+Split            splitncnn_9      1 2 res4b_res4b_relu res4b_res4b_relu_splitncnn_0 res4b_res4b_relu_splitncnn_1
+Convolution      res4c_branch2a   1 1 res4b_res4b_relu_splitncnn_1 res4c_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4c_branch2a    1 1 res4c_branch2a res4c_branch2a_bn4c_branch2a 0=256
+Scale            scale4c_branch2a 1 1 res4c_branch2a_bn4c_branch2a res4c_branch2a_scale4c_branch2a 0=256 1=1
+ReLU             res4c_branch2a_relu 1 1 res4c_branch2a_scale4c_branch2a res4c_branch2a_res4c_branch2a_relu
+Convolution      res4c_branch2b   1 1 res4c_branch2a_res4c_branch2a_relu res4c_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4c_branch2b    1 1 res4c_branch2b res4c_branch2b_bn4c_branch2b 0=256
+Scale            scale4c_branch2b 1 1 res4c_branch2b_bn4c_branch2b res4c_branch2b_scale4c_branch2b 0=256 1=1
+ReLU             res4c_branch2b_relu 1 1 res4c_branch2b_scale4c_branch2b res4c_branch2b_res4c_branch2b_relu
+Convolution      res4c_branch2c   1 1 res4c_branch2b_res4c_branch2b_relu res4c_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4c_branch2c    1 1 res4c_branch2c res4c_branch2c_bn4c_branch2c 0=1024
+Scale            scale4c_branch2c 1 1 res4c_branch2c_bn4c_branch2c res4c_branch2c_scale4c_branch2c 0=1024 1=1
+Eltwise          res4c            2 1 res4b_res4b_relu_splitncnn_0 res4c_branch2c_scale4c_branch2c res4c 0=1 -23301=0
+ReLU             res4c_relu       1 1 res4c res4c_res4c_relu
+Split            splitncnn_10     1 2 res4c_res4c_relu res4c_res4c_relu_splitncnn_0 res4c_res4c_relu_splitncnn_1
+Convolution      res4d_branch2a   1 1 res4c_res4c_relu_splitncnn_1 res4d_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4d_branch2a    1 1 res4d_branch2a res4d_branch2a_bn4d_branch2a 0=256
+Scale            scale4d_branch2a 1 1 res4d_branch2a_bn4d_branch2a res4d_branch2a_scale4d_branch2a 0=256 1=1
+ReLU             res4d_branch2a_relu 1 1 res4d_branch2a_scale4d_branch2a res4d_branch2a_res4d_branch2a_relu
+Convolution      res4d_branch2b   1 1 res4d_branch2a_res4d_branch2a_relu res4d_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4d_branch2b    1 1 res4d_branch2b res4d_branch2b_bn4d_branch2b 0=256
+Scale            scale4d_branch2b 1 1 res4d_branch2b_bn4d_branch2b res4d_branch2b_scale4d_branch2b 0=256 1=1
+ReLU             res4d_branch2b_relu 1 1 res4d_branch2b_scale4d_branch2b res4d_branch2b_res4d_branch2b_relu
+Convolution      res4d_branch2c   1 1 res4d_branch2b_res4d_branch2b_relu res4d_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4d_branch2c    1 1 res4d_branch2c res4d_branch2c_bn4d_branch2c 0=1024
+Scale            scale4d_branch2c 1 1 res4d_branch2c_bn4d_branch2c res4d_branch2c_scale4d_branch2c 0=1024 1=1
+Eltwise          res4d            2 1 res4c_res4c_relu_splitncnn_0 res4d_branch2c_scale4d_branch2c res4d 0=1 -23301=0
+ReLU             res4d_relu       1 1 res4d res4d_res4d_relu
+Split            splitncnn_11     1 2 res4d_res4d_relu res4d_res4d_relu_splitncnn_0 res4d_res4d_relu_splitncnn_1
+Convolution      res4e_branch2a   1 1 res4d_res4d_relu_splitncnn_1 res4e_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4e_branch2a    1 1 res4e_branch2a res4e_branch2a_bn4e_branch2a 0=256
+Scale            scale4e_branch2a 1 1 res4e_branch2a_bn4e_branch2a res4e_branch2a_scale4e_branch2a 0=256 1=1
+ReLU             res4e_branch2a_relu 1 1 res4e_branch2a_scale4e_branch2a res4e_branch2a_res4e_branch2a_relu
+Convolution      res4e_branch2b   1 1 res4e_branch2a_res4e_branch2a_relu res4e_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4e_branch2b    1 1 res4e_branch2b res4e_branch2b_bn4e_branch2b 0=256
+Scale            scale4e_branch2b 1 1 res4e_branch2b_bn4e_branch2b res4e_branch2b_scale4e_branch2b 0=256 1=1
+ReLU             res4e_branch2b_relu 1 1 res4e_branch2b_scale4e_branch2b res4e_branch2b_res4e_branch2b_relu
+Convolution      res4e_branch2c   1 1 res4e_branch2b_res4e_branch2b_relu res4e_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4e_branch2c    1 1 res4e_branch2c res4e_branch2c_bn4e_branch2c 0=1024
+Scale            scale4e_branch2c 1 1 res4e_branch2c_bn4e_branch2c res4e_branch2c_scale4e_branch2c 0=1024 1=1
+Eltwise          res4e            2 1 res4d_res4d_relu_splitncnn_0 res4e_branch2c_scale4e_branch2c res4e 0=1 -23301=0
+ReLU             res4e_relu       1 1 res4e res4e_res4e_relu
+Split            splitncnn_12     1 2 res4e_res4e_relu res4e_res4e_relu_splitncnn_0 res4e_res4e_relu_splitncnn_1
+Convolution      res4f_branch2a   1 1 res4e_res4e_relu_splitncnn_1 res4f_branch2a 0=256 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4f_branch2a    1 1 res4f_branch2a res4f_branch2a_bn4f_branch2a 0=256
+Scale            scale4f_branch2a 1 1 res4f_branch2a_bn4f_branch2a res4f_branch2a_scale4f_branch2a 0=256 1=1
+ReLU             res4f_branch2a_relu 1 1 res4f_branch2a_scale4f_branch2a res4f_branch2a_res4f_branch2a_relu
+Convolution      res4f_branch2b   1 1 res4f_branch2a_res4f_branch2a_relu res4f_branch2b 0=256 1=3 2=1 3=1 4=1 5=0 6=589824 8=2
+BatchNorm        bn4f_branch2b    1 1 res4f_branch2b res4f_branch2b_bn4f_branch2b 0=256
+Scale            scale4f_branch2b 1 1 res4f_branch2b_bn4f_branch2b res4f_branch2b_scale4f_branch2b 0=256 1=1
+ReLU             res4f_branch2b_relu 1 1 res4f_branch2b_scale4f_branch2b res4f_branch2b_res4f_branch2b_relu
+Convolution      res4f_branch2c   1 1 res4f_branch2b_res4f_branch2b_relu res4f_branch2c 0=1024 1=1 2=1 3=1 4=0 5=0 6=262144 8=2
+BatchNorm        bn4f_branch2c    1 1 res4f_branch2c res4f_branch2c_bn4f_branch2c 0=1024
+Scale            scale4f_branch2c 1 1 res4f_branch2c_bn4f_branch2c res4f_branch2c_scale4f_branch2c 0=1024 1=1
+Eltwise          res4f            2 1 res4e_res4e_relu_splitncnn_0 res4f_branch2c_scale4f_branch2c res4f 0=1 -23301=0
+ReLU             res4f_relu       1 1 res4f res4f_res4f_relu
+Split            splitncnn_13     1 2 res4f_res4f_relu res4f_res4f_relu_splitncnn_0 res4f_res4f_relu_splitncnn_1
+Convolution      res5a_branch1    1 1 res4f_res4f_relu_splitncnn_1 res5a_branch1 0=2048 1=1 2=1 3=2 4=0 5=0 6=2097152 8=2
+BatchNorm        bn5a_branch1     1 1 res5a_branch1 res5a_branch1_bn5a_branch1 0=2048
+Scale            scale5a_branch1  1 1 res5a_branch1_bn5a_branch1 res5a_branch1_scale5a_branch1 0=2048 1=1
+Convolution      res5a_branch2a   1 1 res4f_res4f_relu_splitncnn_0 res5a_branch2a 0=512 1=1 2=1 3=2 4=0 5=0 6=524288 8=2
+BatchNorm        bn5a_branch2a    1 1 res5a_branch2a res5a_branch2a_bn5a_branch2a 0=512
+Scale            scale5a_branch2a 1 1 res5a_branch2a_bn5a_branch2a res5a_branch2a_scale5a_branch2a 0=512 1=1
+ReLU             res5a_branch2a_relu 1 1 res5a_branch2a_scale5a_branch2a res5a_branch2a_res5a_branch2a_relu
+Convolution      res5a_branch2b   1 1 res5a_branch2a_res5a_branch2a_relu res5a_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5a_branch2b    1 1 res5a_branch2b res5a_branch2b_bn5a_branch2b 0=512
+Scale            scale5a_branch2b 1 1 res5a_branch2b_bn5a_branch2b res5a_branch2b_scale5a_branch2b 0=512 1=1
+ReLU             res5a_branch2b_relu 1 1 res5a_branch2b_scale5a_branch2b res5a_branch2b_res5a_branch2b_relu
+Convolution      res5a_branch2c   1 1 res5a_branch2b_res5a_branch2b_relu res5a_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        bn5a_branch2c    1 1 res5a_branch2c res5a_branch2c_bn5a_branch2c 0=2048
+Scale            scale5a_branch2c 1 1 res5a_branch2c_bn5a_branch2c res5a_branch2c_scale5a_branch2c 0=2048 1=1
+Eltwise          res5a            2 1 res5a_branch1_scale5a_branch1 res5a_branch2c_scale5a_branch2c res5a 0=1 -23301=0
+ReLU             res5a_relu       1 1 res5a res5a_res5a_relu
+Split            splitncnn_14     1 2 res5a_res5a_relu res5a_res5a_relu_splitncnn_0 res5a_res5a_relu_splitncnn_1
+Convolution      res5b_branch2a   1 1 res5a_res5a_relu_splitncnn_1 res5b_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        bn5b_branch2a    1 1 res5b_branch2a res5b_branch2a_bn5b_branch2a 0=512
+Scale            scale5b_branch2a 1 1 res5b_branch2a_bn5b_branch2a res5b_branch2a_scale5b_branch2a 0=512 1=1
+ReLU             res5b_branch2a_relu 1 1 res5b_branch2a_scale5b_branch2a res5b_branch2a_res5b_branch2a_relu
+Convolution      res5b_branch2b   1 1 res5b_branch2a_res5b_branch2a_relu res5b_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5b_branch2b    1 1 res5b_branch2b res5b_branch2b_bn5b_branch2b 0=512
+Scale            scale5b_branch2b 1 1 res5b_branch2b_bn5b_branch2b res5b_branch2b_scale5b_branch2b 0=512 1=1
+ReLU             res5b_branch2b_relu 1 1 res5b_branch2b_scale5b_branch2b res5b_branch2b_res5b_branch2b_relu
+Convolution      res5b_branch2c   1 1 res5b_branch2b_res5b_branch2b_relu res5b_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        bn5b_branch2c    1 1 res5b_branch2c res5b_branch2c_bn5b_branch2c 0=2048
+Scale            scale5b_branch2c 1 1 res5b_branch2c_bn5b_branch2c res5b_branch2c_scale5b_branch2c 0=2048 1=1
+Eltwise          res5b            2 1 res5a_res5a_relu_splitncnn_0 res5b_branch2c_scale5b_branch2c res5b 0=1 -23301=0
+ReLU             res5b_relu       1 1 res5b res5b_res5b_relu
+Split            splitncnn_15     1 2 res5b_res5b_relu res5b_res5b_relu_splitncnn_0 res5b_res5b_relu_splitncnn_1
+Convolution      res5c_branch2a   1 1 res5b_res5b_relu_splitncnn_1 res5c_branch2a 0=512 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        bn5c_branch2a    1 1 res5c_branch2a res5c_branch2a_bn5c_branch2a 0=512
+Scale            scale5c_branch2a 1 1 res5c_branch2a_bn5c_branch2a res5c_branch2a_scale5c_branch2a 0=512 1=1
+ReLU             res5c_branch2a_relu 1 1 res5c_branch2a_scale5c_branch2a res5c_branch2a_res5c_branch2a_relu
+Convolution      res5c_branch2b   1 1 res5c_branch2a_res5c_branch2a_relu res5c_branch2b 0=512 1=3 2=1 3=1 4=1 5=0 6=2359296 8=2
+BatchNorm        bn5c_branch2b    1 1 res5c_branch2b res5c_branch2b_bn5c_branch2b 0=512
+Scale            scale5c_branch2b 1 1 res5c_branch2b_bn5c_branch2b res5c_branch2b_scale5c_branch2b 0=512 1=1
+ReLU             res5c_branch2b_relu 1 1 res5c_branch2b_scale5c_branch2b res5c_branch2b_res5c_branch2b_relu
+Convolution      res5c_branch2c   1 1 res5c_branch2b_res5c_branch2b_relu res5c_branch2c 0=2048 1=1 2=1 3=1 4=0 5=0 6=1048576 8=2
+BatchNorm        bn5c_branch2c    1 1 res5c_branch2c res5c_branch2c_bn5c_branch2c 0=2048
+Scale            scale5c_branch2c 1 1 res5c_branch2c_bn5c_branch2c res5c_branch2c_scale5c_branch2c 0=2048 1=1
+Eltwise          res5c            2 1 res5b_res5b_relu_splitncnn_0 res5c_branch2c_scale5c_branch2c res5c 0=1 -23301=0
+ReLU             res5c_relu       1 1 res5c res5c_res5c_relu
+Pooling          pool5            1 1 res5c_res5c_relu pool5 0=1 1=7 2=1 3=0 4=0
+InnerProduct     fc1000           1 1 pool5 fc1000 0=1000 1=1 2=2048000
+Softmax          prob             1 1 fc1000 prob 0=0
diff --git a/benchmark/squeezenet_int8.param b/benchmark/squeezenet_int8.param
new file mode 100755
index 000000000..5ce2ae78a
--- /dev/null
+++ b/benchmark/squeezenet_int8.param
@@ -0,0 +1,77 @@
+7767517
+75 83
+Input            data             0 1 data 0=227 1=227 2=3
+Convolution      conv1            1 1 data conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
+ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1
+Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
+Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
+Split            splitncnn_0      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
+Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
+ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
+Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
+Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
+ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
+Split            splitncnn_1      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
+Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
+ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
+Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
+Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
+Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
+Split            splitncnn_2      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
+Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
+Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
+Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
+ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
+Split            splitncnn_3      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
+Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
+Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
+Pooling          pool5            1 1 fire5/concat pool5 0=0 1=3 2=2 3=0 4=0
+Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
+Split            splitncnn_4      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
+ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
+Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
+ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
+Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
+Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
+ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
+Split            splitncnn_5      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
+ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
+Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
+ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
+Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
+Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
+ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
+Split            splitncnn_6      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
+Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
+ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
+Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
+Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
+Split            splitncnn_7      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1 fire9/expand1x1_fire9/relu_expand1x1
+Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
+ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3 fire9/expand3x3_fire9/relu_expand3x3
+Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
+Dropout          drop9            1 1 fire9/concat fire9/concat_drop9
+Convolution      conv10           1 1 fire9/concat_drop9 conv10 0=1000 1=1 2=1 3=1 4=1 5=1 6=512000 8=2
+ReLU             relu_conv10      1 1 conv10 conv10_relu_conv10
+Pooling          pool10           1 1 conv10_relu_conv10 pool10 0=1 1=0 2=1 3=0 4=1
+Softmax          prob             1 1 pool10 prob 0=0
diff --git a/benchmark/squeezenet_ssd_int8.param b/benchmark/squeezenet_ssd_int8.param
new file mode 100755
index 000000000..19fb43c9e
--- /dev/null
+++ b/benchmark/squeezenet_ssd_int8.param
@@ -0,0 +1,181 @@
+7767517
+179 212
+Input            data             0 1 data 0=300 1=300 2=3
+Split            splitncnn_0      1 7 data data_splitncnn_0 data_splitncnn_1 data_splitncnn_2 data_splitncnn_3 data_splitncnn_4 data_splitncnn_5 data_splitncnn_6
+Convolution      conv1            1 1 data_splitncnn_6 conv1 0=64 1=3 2=1 3=2 4=0 5=1 6=1728 8=2
+ReLU             relu_conv1       1 1 conv1 conv1_relu_conv1
+Pooling          pool1            1 1 conv1_relu_conv1 pool1 0=0 1=3 2=2 3=0 4=0
+Convolution      fire2/squeeze1x1 1 1 pool1 fire2/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire2/relu_squeeze1x1 1 1 fire2/squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1
+Split            splitncnn_1      1 2 fire2/squeeze1x1_fire2/relu_squeeze1x1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1
+Convolution      fire2/expand1x1  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_1 fire2/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire2/relu_expand1x1 1 1 fire2/expand1x1 fire2/expand1x1_fire2/relu_expand1x1
+Convolution      fire2/expand3x3  1 1 fire2/squeeze1x1_fire2/relu_squeeze1x1_splitncnn_0 fire2/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
+ReLU             fire2/relu_expand3x3 1 1 fire2/expand3x3 fire2/expand3x3_fire2/relu_expand3x3
+Concat           fire2/concat     2 1 fire2/expand1x1_fire2/relu_expand1x1 fire2/expand3x3_fire2/relu_expand3x3 fire2/concat 0=0
+Convolution      fire3/squeeze1x1 1 1 fire2/concat fire3/squeeze1x1 0=16 1=1 2=1 3=1 4=0 5=1 6=2048 8=2
+ReLU             fire3/relu_squeeze1x1 1 1 fire3/squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1
+Split            splitncnn_2      1 2 fire3/squeeze1x1_fire3/relu_squeeze1x1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1
+Convolution      fire3/expand1x1  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_1 fire3/expand1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=1024 8=2
+ReLU             fire3/relu_expand1x1 1 1 fire3/expand1x1 fire3/expand1x1_fire3/relu_expand1x1
+Convolution      fire3/expand3x3  1 1 fire3/squeeze1x1_fire3/relu_squeeze1x1_splitncnn_0 fire3/expand3x3 0=64 1=3 2=1 3=1 4=1 5=1 6=9216 8=2
+ReLU             fire3/relu_expand3x3 1 1 fire3/expand3x3 fire3/expand3x3_fire3/relu_expand3x3
+Concat           fire3/concat     2 1 fire3/expand1x1_fire3/relu_expand1x1 fire3/expand3x3_fire3/relu_expand3x3 fire3/concat 0=0
+Pooling          pool3            1 1 fire3/concat pool3 0=0 1=3 2=2 3=0 4=0
+Convolution      fire4/squeeze1x1 1 1 pool3 fire4/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire4/relu_squeeze1x1 1 1 fire4/squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1
+Split            splitncnn_3      1 2 fire4/squeeze1x1_fire4/relu_squeeze1x1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1
+Convolution      fire4/expand1x1  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_1 fire4/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire4/relu_expand1x1 1 1 fire4/expand1x1 fire4/expand1x1_fire4/relu_expand1x1
+Convolution      fire4/expand3x3  1 1 fire4/squeeze1x1_fire4/relu_squeeze1x1_splitncnn_0 fire4/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+ReLU             fire4/relu_expand3x3 1 1 fire4/expand3x3 fire4/expand3x3_fire4/relu_expand3x3
+Concat           fire4/concat     2 1 fire4/expand1x1_fire4/relu_expand1x1 fire4/expand3x3_fire4/relu_expand3x3 fire4/concat 0=0
+Convolution      fire5/squeeze1x1 1 1 fire4/concat fire5/squeeze1x1 0=32 1=1 2=1 3=1 4=0 5=1 6=8192 8=2
+ReLU             fire5/relu_squeeze1x1 1 1 fire5/squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1
+Split            splitncnn_4      1 2 fire5/squeeze1x1_fire5/relu_squeeze1x1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1
+Convolution      fire5/expand1x1  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_1 fire5/expand1x1 0=128 1=1 2=1 3=1 4=0 5=1 6=4096 8=2
+ReLU             fire5/relu_expand1x1 1 1 fire5/expand1x1 fire5/expand1x1_fire5/relu_expand1x1
+Convolution      fire5/expand3x3  1 1 fire5/squeeze1x1_fire5/relu_squeeze1x1_splitncnn_0 fire5/expand3x3 0=128 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+ReLU             fire5/relu_expand3x3 1 1 fire5/expand3x3 fire5/expand3x3_fire5/relu_expand3x3
+Concat           fire5/concat     2 1 fire5/expand1x1_fire5/relu_expand1x1 fire5/expand3x3_fire5/relu_expand3x3 fire5/concat 0=0
+Split            splitncnn_5      1 2 fire5/concat fire5/concat_splitncnn_0 fire5/concat_splitncnn_1
+Pooling          pool5            1 1 fire5/concat_splitncnn_1 pool5 0=0 1=3 2=2 3=0 4=0
+Convolution      fire6/squeeze1x1 1 1 pool5 fire6/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=12288 8=2
+ReLU             fire6/relu_squeeze1x1 1 1 fire6/squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1
+Split            splitncnn_6      1 2 fire6/squeeze1x1_fire6/relu_squeeze1x1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1
+Convolution      fire6/expand1x1  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_1 fire6/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
+ReLU             fire6/relu_expand1x1 1 1 fire6/expand1x1 fire6/expand1x1_fire6/relu_expand1x1
+Convolution      fire6/expand3x3  1 1 fire6/squeeze1x1_fire6/relu_squeeze1x1_splitncnn_0 fire6/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
+ReLU             fire6/relu_expand3x3 1 1 fire6/expand3x3 fire6/expand3x3_fire6/relu_expand3x3
+Concat           fire6/concat     2 1 fire6/expand1x1_fire6/relu_expand1x1 fire6/expand3x3_fire6/relu_expand3x3 fire6/concat 0=0
+Convolution      fire7/squeeze1x1 1 1 fire6/concat fire7/squeeze1x1 0=48 1=1 2=1 3=1 4=0 5=1 6=18432 8=2
+ReLU             fire7/relu_squeeze1x1 1 1 fire7/squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1
+Split            splitncnn_7      1 2 fire7/squeeze1x1_fire7/relu_squeeze1x1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1
+Convolution      fire7/expand1x1  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_1 fire7/expand1x1 0=192 1=1 2=1 3=1 4=0 5=1 6=9216 8=2
+ReLU             fire7/relu_expand1x1 1 1 fire7/expand1x1 fire7/expand1x1_fire7/relu_expand1x1
+Convolution      fire7/expand3x3  1 1 fire7/squeeze1x1_fire7/relu_squeeze1x1_splitncnn_0 fire7/expand3x3 0=192 1=3 2=1 3=1 4=1 5=1 6=82944 8=2
+ReLU             fire7/relu_expand3x3 1 1 fire7/expand3x3 fire7/expand3x3_fire7/relu_expand3x3
+Concat           fire7/concat     2 1 fire7/expand1x1_fire7/relu_expand1x1 fire7/expand3x3_fire7/relu_expand3x3 fire7/concat 0=0
+Convolution      fire8/squeeze1x1 1 1 fire7/concat fire8/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=24576 8=2
+ReLU             fire8/relu_squeeze1x1 1 1 fire8/squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1
+Split            splitncnn_8      1 2 fire8/squeeze1x1_fire8/relu_squeeze1x1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1
+Convolution      fire8/expand1x1  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_1 fire8/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+ReLU             fire8/relu_expand1x1 1 1 fire8/expand1x1 fire8/expand1x1_fire8/relu_expand1x1
+Convolution      fire8/expand3x3  1 1 fire8/squeeze1x1_fire8/relu_squeeze1x1_splitncnn_0 fire8/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
+ReLU             fire8/relu_expand3x3 1 1 fire8/expand3x3 fire8/expand3x3_fire8/relu_expand3x3
+Concat           fire8/concat     2 1 fire8/expand1x1_fire8/relu_expand1x1 fire8/expand3x3_fire8/relu_expand3x3 fire8/concat 0=0
+Convolution      fire9/squeeze1x1 1 1 fire8/concat fire9/squeeze1x1 0=64 1=1 2=1 3=1 4=0 5=1 6=32768 8=2
+ReLU             fire9/relu_squeeze1x1 1 1 fire9/squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1
+Split            splitncnn_9      1 2 fire9/squeeze1x1_fire9/relu_squeeze1x1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1
+Convolution      fire9/expand1x1  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_1 fire9/expand1x1 0=256 1=1 2=1 3=1 4=0 5=1 6=16384 8=2
+BatchNorm        fire9/expand1x1/bn 1 1 fire9/expand1x1 fire9/expand1x1_fire9/expand1x1/bn 0=256
+Scale            fire9/expand1x1/scale 1 1 fire9/expand1x1_fire9/expand1x1/bn fire9/expand1x1_fire9/expand1x1/scale 0=256 1=1
+ReLU             fire9/relu_expand1x1 1 1 fire9/expand1x1_fire9/expand1x1/scale fire9/expand1x1_fire9/relu_expand1x1
+Convolution      fire9/expand3x3  1 1 fire9/squeeze1x1_fire9/relu_squeeze1x1_splitncnn_0 fire9/expand3x3 0=256 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
+BatchNorm        fire9/expand3x3/bn 1 1 fire9/expand3x3 fire9/expand3x3_fire9/expand3x3/bn 0=256
+Scale            fire9/expand3x3/scale 1 1 fire9/expand3x3_fire9/expand3x3/bn fire9/expand3x3_fire9/expand3x3/scale 0=256 1=1
+ReLU             fire9/relu_expand3x3 1 1 fire9/expand3x3_fire9/expand3x3/scale fire9/expand3x3_fire9/relu_expand3x3
+Concat           fire9/concat     2 1 fire9/expand1x1_fire9/relu_expand1x1 fire9/expand3x3_fire9/relu_expand3x3 fire9/concat 0=0
+Split            splitncnn_10     1 4 fire9/concat fire9/concat_splitncnn_0 fire9/concat_splitncnn_1 fire9/concat_splitncnn_2 fire9/concat_splitncnn_3
+Pooling          pool9            1 1 fire9/concat_splitncnn_3 pool9 0=0 1=3 2=2 3=0 4=0
+Convolution      fire10/squeeze1x1 1 1 pool9 fire10/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=49152 8=2
+BatchNorm        fire10/squeeze1x1/bn 1 1 fire10/squeeze1x1 fire10/squeeze1x1_fire10/squeeze1x1/bn 0=96
+Scale            fire10/squeeze1x1/scale 1 1 fire10/squeeze1x1_fire10/squeeze1x1/bn fire10/squeeze1x1_fire10/squeeze1x1/scale 0=96 1=1
+ReLU             fire10/relu_squeeze1x1 1 1 fire10/squeeze1x1_fire10/squeeze1x1/scale fire10/squeeze1x1_fire10/relu_squeeze1x1
+Split            splitncnn_11     1 2 fire10/squeeze1x1_fire10/relu_squeeze1x1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1
+Convolution      fire10/expand1x1 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_1 fire10/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
+BatchNorm        fire10/expand1x1/bn 1 1 fire10/expand1x1 fire10/expand1x1_fire10/expand1x1/bn 0=384
+Scale            fire10/expand1x1/scale 1 1 fire10/expand1x1_fire10/expand1x1/bn fire10/expand1x1_fire10/expand1x1/scale 0=384 1=1
+ReLU             fire10/relu_expand1x1 1 1 fire10/expand1x1_fire10/expand1x1/scale fire10/expand1x1_fire10/relu_expand1x1
+Convolution      fire10/expand3x3 1 1 fire10/squeeze1x1_fire10/relu_squeeze1x1_splitncnn_0 fire10/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
+BatchNorm        fire10/expand3x3/bn 1 1 fire10/expand3x3 fire10/expand3x3_fire10/expand3x3/bn 0=384
+Scale            fire10/expand3x3/scale 1 1 fire10/expand3x3_fire10/expand3x3/bn fire10/expand3x3_fire10/expand3x3/scale 0=384 1=1
+ReLU             fire10/relu_expand3x3 1 1 fire10/expand3x3_fire10/expand3x3/scale fire10/expand3x3_fire10/relu_expand3x3
+Concat           fire10/concat    2 1 fire10/expand1x1_fire10/relu_expand1x1 fire10/expand3x3_fire10/relu_expand3x3 fire10/concat 0=0
+Split            splitncnn_12     1 4 fire10/concat fire10/concat_splitncnn_0 fire10/concat_splitncnn_1 fire10/concat_splitncnn_2 fire10/concat_splitncnn_3
+Pooling          pool10           1 1 fire10/concat_splitncnn_3 pool10 0=0 1=3 2=2 3=0 4=0
+Convolution      fire11/squeeze1x1 1 1 pool10 fire11/squeeze1x1 0=96 1=1 2=1 3=1 4=0 5=1 6=73728 8=2
+BatchNorm        fire11/squeeze1x1/bn 1 1 fire11/squeeze1x1 fire11/squeeze1x1_fire11/squeeze1x1/bn 0=96
+Scale            fire11/squeeze1x1/scale 1 1 fire11/squeeze1x1_fire11/squeeze1x1/bn fire11/squeeze1x1_fire11/squeeze1x1/scale 0=96 1=1
+ReLU             fire11/relu_squeeze1x1 1 1 fire11/squeeze1x1_fire11/squeeze1x1/scale fire11/squeeze1x1_fire11/relu_squeeze1x1
+Split            splitncnn_13     1 2 fire11/squeeze1x1_fire11/relu_squeeze1x1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1
+Convolution      fire11/expand1x1 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_1 fire11/expand1x1 0=384 1=1 2=1 3=1 4=0 5=1 6=36864 8=2
+BatchNorm        fire11/expand1x1/bn 1 1 fire11/expand1x1 fire11/expand1x1_fire11/expand1x1/bn 0=384
+Scale            fire11/expand1x1/scale 1 1 fire11/expand1x1_fire11/expand1x1/bn fire11/expand1x1_fire11/expand1x1/scale 0=384 1=1
+ReLU             fire11/relu_expand1x1 1 1 fire11/expand1x1_fire11/expand1x1/scale fire11/expand1x1_fire11/relu_expand1x1
+Convolution      fire11/expand3x3 1 1 fire11/squeeze1x1_fire11/relu_squeeze1x1_splitncnn_0 fire11/expand3x3 0=384 1=3 2=1 3=1 4=1 5=1 6=331776 8=2
+BatchNorm        fire11/expand3x3/bn 1 1 fire11/expand3x3 fire11/expand3x3_fire11/expand3x3/bn 0=384
+Scale            fire11/expand3x3/scale 1 1 fire11/expand3x3_fire11/expand3x3/bn fire11/expand3x3_fire11/expand3x3/scale 0=384 1=1
+ReLU             fire11/relu_expand3x3 1 1 fire11/expand3x3_fire11/expand3x3/scale fire11/expand3x3_fire11/relu_expand3x3
+Concat           fire11/concat    2 1 fire11/expand1x1_fire11/relu_expand1x1 fire11/expand3x3_fire11/relu_expand3x3 fire11/concat 0=0
+Split            splitncnn_14     1 4 fire11/concat fire11/concat_splitncnn_0 fire11/concat_splitncnn_1 fire11/concat_splitncnn_2 fire11/concat_splitncnn_3
+Convolution      conv12_1         1 1 fire11/concat_splitncnn_3 conv12_1 0=128 1=1 2=1 3=1 4=0 5=0 6=98304 8=2
+BatchNorm        conv12_1/bn      1 1 conv12_1 conv12_1_conv12_1/bn 0=128
+Scale            conv12_1/scale   1 1 conv12_1_conv12_1/bn conv12_1_conv12_1/scale 0=128 1=1
+ReLU             conv12_1/relu    1 1 conv12_1_conv12_1/scale conv12_1_conv12_1/relu
+Convolution      conv12_2         1 1 conv12_1_conv12_1/relu conv12_2 0=256 1=3 2=1 3=2 4=1 5=0 6=294912 8=2
+BatchNorm        conv12_2/bn      1 1 conv12_2 conv12_2_conv12_2/bn 0=256
+Scale            conv12_2/scale   1 1 conv12_2_conv12_2/bn conv12_2_conv12_2/scale 0=256 1=1
+ReLU             conv12_2/relu    1 1 conv12_2_conv12_2/scale conv12_2_conv12_2/relu
+Split            splitncnn_15     1 4 conv12_2_conv12_2/relu conv12_2_conv12_2/relu_splitncnn_0 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_conv12_2/relu_splitncnn_3
+Convolution      conv13_1         1 1 conv12_2_conv12_2/relu_splitncnn_3 conv13_1 0=64 1=1 2=1 3=1 4=0 5=0 6=16384 8=2
+BatchNorm        conv13_1/bn      1 1 conv13_1 conv13_1_conv13_1/bn 0=64
+Scale            conv13_1/scale   1 1 conv13_1_conv13_1/bn conv13_1_conv13_1/scale 0=64 1=1
+ReLU             conv13_1/relu    1 1 conv13_1_conv13_1/scale conv13_1_conv13_1/relu
+Convolution      conv13_2         1 1 conv13_1_conv13_1/relu conv13_2 0=128 1=3 2=1 3=2 4=1 5=0 6=73728 8=2
+BatchNorm        conv13_2/bn      1 1 conv13_2 conv13_2_conv13_2/bn 0=128
+Scale            conv13_2/scale   1 1 conv13_2_conv13_2/bn conv13_2_conv13_2/scale 0=128 1=1
+ReLU             conv13_2/relu    1 1 conv13_2_conv13_2/scale conv13_2_conv13_2/relu
+Split            splitncnn_16     1 3 conv13_2_conv13_2/relu conv13_2_conv13_2/relu_splitncnn_0 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_conv13_2/relu_splitncnn_2
+BatchNorm        fire5/bn         1 1 fire5/concat_splitncnn_0 fire5/normal 0=256
+Scale            fire5/scale      1 1 fire5/normal fire5/normal_fire5/scale 0=256 1=1
+Split            splitncnn_17     1 3 fire5/normal_fire5/scale fire5/normal_fire5/scale_splitncnn_0 fire5/normal_fire5/scale_splitncnn_1 fire5/normal_fire5/scale_splitncnn_2
+Convolution      fire5_mbox_loc   1 1 fire5/normal_fire5/scale_splitncnn_2 fire5_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+Permute          fire5_mbox_loc_perm 1 1 fire5_mbox_loc fire5_mbox_loc_perm 0=3
+Flatten          fire5_mbox_loc_flat 1 1 fire5_mbox_loc_perm fire5_mbox_loc_flat
+Convolution      fire5_mbox_conf  1 1 fire5/normal_fire5/scale_splitncnn_1 fire5_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=193536 8=2
+Permute          fire5_mbox_conf_perm 1 1 fire5_mbox_conf fire5_mbox_conf_perm 0=3
+Flatten          fire5_mbox_conf_flat 1 1 fire5_mbox_conf_perm fire5_mbox_conf_flat
+PriorBox         fire5_mbox_priorbox 2 1 fire5/normal_fire5/scale_splitncnn_0 data_splitncnn_5 fire5_mbox_priorbox -23300=1,21.000000 -23301=1,45.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=8.000000 12=8.000000 13=0.500000
+Convolution      fire9_mbox_loc   1 1 fire9/concat_splitncnn_2 fire9_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=110592 8=2
+Permute          fire9_mbox_loc_perm 1 1 fire9_mbox_loc fire9_mbox_loc_perm 0=3
+Flatten          fire9_mbox_loc_flat 1 1 fire9_mbox_loc_perm fire9_mbox_loc_flat
+Convolution      fire9_mbox_conf  1 1 fire9/concat_splitncnn_1 fire9_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=580608 8=2
+Permute          fire9_mbox_conf_perm 1 1 fire9_mbox_conf fire9_mbox_conf_perm 0=3
+Flatten          fire9_mbox_conf_flat 1 1 fire9_mbox_conf_perm fire9_mbox_conf_flat
+PriorBox         fire9_mbox_priorbox 2 1 fire9/concat_splitncnn_0 data_splitncnn_4 fire9_mbox_priorbox -23300=1,45.000000 -23301=1,99.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=16.000000 12=16.000000 13=0.500000
+Convolution      fire10_mbox_loc  1 1 fire10/concat_splitncnn_2 fire10_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
+Permute          fire10_mbox_loc_perm 1 1 fire10_mbox_loc fire10_mbox_loc_perm 0=3
+Flatten          fire10_mbox_loc_flat 1 1 fire10_mbox_loc_perm fire10_mbox_loc_flat
+Convolution      fire10_mbox_conf 1 1 fire10/concat_splitncnn_1 fire10_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
+Permute          fire10_mbox_conf_perm 1 1 fire10_mbox_conf fire10_mbox_conf_perm 0=3
+Flatten          fire10_mbox_conf_flat 1 1 fire10_mbox_conf_perm fire10_mbox_conf_flat
+PriorBox         fire10_mbox_priorbox 2 1 fire10/concat_splitncnn_0 data_splitncnn_3 fire10_mbox_priorbox -23300=1,99.000000 -23301=1,153.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=32.000000 12=32.000000 13=0.500000
+Convolution      fire11_mbox_loc  1 1 fire11/concat_splitncnn_2 fire11_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=165888 8=2
+Permute          fire11_mbox_loc_perm 1 1 fire11_mbox_loc fire11_mbox_loc_perm 0=3
+Flatten          fire11_mbox_loc_flat 1 1 fire11_mbox_loc_perm fire11_mbox_loc_flat
+Convolution      fire11_mbox_conf 1 1 fire11/concat_splitncnn_1 fire11_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=870912 8=2
+Permute          fire11_mbox_conf_perm 1 1 fire11_mbox_conf fire11_mbox_conf_perm 0=3
+Flatten          fire11_mbox_conf_flat 1 1 fire11_mbox_conf_perm fire11_mbox_conf_flat
+PriorBox         fire11_mbox_priorbox 2 1 fire11/concat_splitncnn_0 data_splitncnn_2 fire11_mbox_priorbox -23300=1,153.000000 -23301=1,207.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=64.000000 12=64.000000 13=0.500000
+Convolution      conv12_2_mbox_loc 1 1 conv12_2_conv12_2/relu_splitncnn_2 conv12_2_mbox_loc 0=24 1=3 2=1 3=1 4=1 5=1 6=55296 8=2
+Permute          conv12_2_mbox_loc_perm 1 1 conv12_2_mbox_loc conv12_2_mbox_loc_perm 0=3
+Flatten          conv12_2_mbox_loc_flat 1 1 conv12_2_mbox_loc_perm conv12_2_mbox_loc_flat
+Convolution      conv12_2_mbox_conf 1 1 conv12_2_conv12_2/relu_splitncnn_1 conv12_2_mbox_conf 0=126 1=3 2=1 3=1 4=1 5=1 6=290304 8=2
+Permute          conv12_2_mbox_conf_perm 1 1 conv12_2_mbox_conf conv12_2_mbox_conf_perm 0=3
+Flatten          conv12_2_mbox_conf_flat 1 1 conv12_2_mbox_conf_perm conv12_2_mbox_conf_flat
+PriorBox         conv12_2_mbox_priorbox 2 1 conv12_2_conv12_2/relu_splitncnn_0 data_splitncnn_1 conv12_2_mbox_priorbox -23300=1,207.000000 -23301=1,261.000000 -23302=2,2.000000,3.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=100.000000 12=100.000000 13=0.500000
+Convolution      conv13_2_mbox_loc 1 1 conv13_2_conv13_2/relu_splitncnn_2 conv13_2_mbox_loc 0=16 1=3 2=1 3=1 4=1 5=1 6=18432 8=2
+Permute          conv13_2_mbox_loc_perm 1 1 conv13_2_mbox_loc conv13_2_mbox_loc_perm 0=3
+Flatten          conv13_2_mbox_loc_flat 1 1 conv13_2_mbox_loc_perm conv13_2_mbox_loc_flat
+Convolution      conv13_2_mbox_conf 1 1 conv13_2_conv13_2/relu_splitncnn_1 conv13_2_mbox_conf 0=84 1=3 2=1 3=1 4=1 5=1 6=96768 8=2
+Permute          conv13_2_mbox_conf_perm 1 1 conv13_2_mbox_conf conv13_2_mbox_conf_perm 0=3
+Flatten          conv13_2_mbox_conf_flat 1 1 conv13_2_mbox_conf_perm conv13_2_mbox_conf_flat
+PriorBox         conv13_2_mbox_priorbox 2 1 conv13_2_conv13_2/relu_splitncnn_0 data_splitncnn_0 conv13_2_mbox_priorbox -23300=1,261.000000 -23301=1,315.000000 -23302=1,2.000000 3=0.100000 4=0.100000 5=0.200000 6=0.200000 7=1 8=0 9=-233 10=-233 11=300.000000 12=300.000000 13=0.500000
+Concat           mbox_loc         6 1 fire5_mbox_loc_flat fire9_mbox_loc_flat fire10_mbox_loc_flat fire11_mbox_loc_flat conv12_2_mbox_loc_flat conv13_2_mbox_loc_flat mbox_loc 0=0
+Concat           mbox_conf        6 1 fire5_mbox_conf_flat fire9_mbox_conf_flat fire10_mbox_conf_flat fire11_mbox_conf_flat conv12_2_mbox_conf_flat conv13_2_mbox_conf_flat mbox_conf 0=0
+Concat           mbox_priorbox    6 1 fire5_mbox_priorbox fire9_mbox_priorbox fire10_mbox_priorbox fire11_mbox_priorbox conv12_2_mbox_priorbox conv13_2_mbox_priorbox mbox_priorbox 0=1
+Reshape          mbox_conf_reshape 1 1 mbox_conf mbox_conf_reshape 0=21 1=-1 2=0 3=0
+Softmax          mbox_conf_softmax 1 1 mbox_conf_reshape mbox_conf_softmax 0=1
+Flatten          mbox_conf_flatten 1 1 mbox_conf_softmax mbox_conf_flatten
+DetectionOutput  detection_out    3 1 mbox_loc mbox_conf_flatten mbox_priorbox detection_out 0=21 1=0.450000 2=100 3=100 4=0.050000
diff --git a/benchmark/vgg16_int8.param b/benchmark/vgg16_int8.param
new file mode 100755
index 000000000..110818999
--- /dev/null
+++ b/benchmark/vgg16_int8.param
@@ -0,0 +1,42 @@
+7767517
+40 40
+Input            data             0 1 data 0=224 1=224 2=3
+Convolution      conv1_1          1 1 data conv1_1 0=64 1=3 2=1 3=1 4=1 5=1 6=1728 8=2
+ReLU             relu1_1          1 1 conv1_1 conv1_1_relu1_1
+Convolution      conv1_2          1 1 conv1_1_relu1_1 conv1_2 0=64 1=3 2=1 3=1 4=1 5=1 6=36864 8=2
+ReLU             relu1_2          1 1 conv1_2 conv1_2_relu1_2
+Pooling          pool1            1 1 conv1_2_relu1_2 pool1 0=0 1=2 2=2 3=0 4=0
+Convolution      conv2_1          1 1 pool1 conv2_1 0=128 1=3 2=1 3=1 4=1 5=1 6=73728 8=2
+ReLU             relu2_1          1 1 conv2_1 conv2_1_relu2_1
+Convolution      conv2_2          1 1 conv2_1_relu2_1 conv2_2 0=128 1=3 2=1 3=1 4=1 5=1 6=147456 8=2
+ReLU             relu2_2          1 1 conv2_2 conv2_2_relu2_2
+Pooling          pool2            1 1 conv2_2_relu2_2 pool2 0=0 1=2 2=2 3=0 4=0
+Convolution      conv3_1          1 1 pool2 conv3_1 0=256 1=3 2=1 3=1 4=1 5=1 6=294912 8=2
+ReLU             relu3_1          1 1 conv3_1 conv3_1_relu3_1
+Convolution      conv3_2          1 1 conv3_1_relu3_1 conv3_2 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
+ReLU             relu3_2          1 1 conv3_2 conv3_2_relu3_2
+Convolution      conv3_3          1 1 conv3_2_relu3_2 conv3_3 0=256 1=3 2=1 3=1 4=1 5=1 6=589824 8=2
+ReLU             relu3_3          1 1 conv3_3 conv3_3_relu3_3
+Pooling          pool3            1 1 conv3_3_relu3_3 pool3 0=0 1=2 2=2 3=0 4=0
+Convolution      conv4_1          1 1 pool3 conv4_1 0=512 1=3 2=1 3=1 4=1 5=1 6=1179648 8=2
+ReLU             relu4_1          1 1 conv4_1 conv4_1_relu4_1
+Convolution      conv4_2          1 1 conv4_1_relu4_1 conv4_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
+ReLU             relu4_2          1 1 conv4_2 conv4_2_relu4_2
+Convolution      conv4_3          1 1 conv4_2_relu4_2 conv4_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
+ReLU             relu4_3          1 1 conv4_3 conv4_3_relu4_3
+Pooling          pool4            1 1 conv4_3_relu4_3 pool4 0=0 1=2 2=2 3=0 4=0
+Convolution      conv5_1          1 1 pool4 conv5_1 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
+ReLU             relu5_1          1 1 conv5_1 conv5_1_relu5_1
+Convolution      conv5_2          1 1 conv5_1_relu5_1 conv5_2 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
+ReLU             relu5_2          1 1 conv5_2 conv5_2_relu5_2
+Convolution      conv5_3          1 1 conv5_2_relu5_2 conv5_3 0=512 1=3 2=1 3=1 4=1 5=1 6=2359296 8=2
+ReLU             relu5_3          1 1 conv5_3 conv5_3_relu5_3
+Pooling          pool5            1 1 conv5_3_relu5_3 pool5 0=0 1=2 2=2 3=0 4=0
+InnerProduct     fc6              1 1 pool5 fc6 0=4096 1=1 2=102760448
+ReLU             relu6            1 1 fc6 fc6_relu6
+Dropout          drop6            1 1 fc6_relu6 fc6_drop6
+InnerProduct     fc7              1 1 fc6_drop6 fc7 0=4096 1=1 2=16777216
+ReLU             relu7            1 1 fc7 fc7_relu7
+Dropout          drop7            1 1 fc7_relu7 fc7_drop7
+InnerProduct     fc8              1 1 fc7_drop7 fc8 0=1000 1=1 2=4096000
+Softmax          prob             1 1 fc8 prob 0=0
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 5c2d395b0..092ef292d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -183,6 +183,7 @@ ncnn_add_layer(Yolov3DetectionOutput)
 ncnn_add_layer(PSROIPooling)
 ncnn_add_layer(ROIAlign OFF)
 ncnn_add_layer(Packing)
+ncnn_add_layer(Requantize)
 
 # message("SHADER_SPV_HEX_FILES = ${SHADER_SPV_HEX_FILES}")
 add_custom_target(generate-spirv DEPENDS ${SHADER_SPV_HEX_FILES})
diff --git a/src/benchmark.cpp b/src/benchmark.cpp
index 2898228ad..2e0bb0afa 100644
--- a/src/benchmark.cpp
+++ b/src/benchmark.cpp
@@ -55,14 +55,14 @@ double get_current_time()
 
 void benchmark(const Layer* layer, double start, double end)
 {
-    fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
+    fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
     fprintf(stderr, "    |");
     fprintf(stderr, "\n");
 }
 
 void benchmark(const Layer* layer, const Mat& bottom_blob, Mat& top_blob, double start, double end)
 {
-    fprintf(stderr, "%-24s %-24s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
+    fprintf(stderr, "%-24s %-30s %8.2lfms", layer->type.c_str(), layer->name.c_str(), end - start);
     fprintf(stderr, "    |    feature_map: %4d x %-4d    inch: %4d    outch: %4d", bottom_blob.w, bottom_blob.h, bottom_blob.c, top_blob.c);
     if (layer->type == "Convolution")
     {
diff --git a/src/layer/arm/convolution_1x1_int8.h b/src/layer/arm/convolution_1x1_int8.h
index 70d8f6b25..5af98e42c 100644
--- a/src/layer/arm/convolution_1x1_int8.h
+++ b/src/layer/arm/convolution_1x1_int8.h
@@ -65,4097 +65,896 @@ static void conv1x1s1_sgemm_transform_kernel_int8_neon(const Mat& _kernel, Mat&
 
 #if __aarch64__
 /*
- * Convolution 1x1 quantized with int8,unroll 16 x 8
+ * 1x1 convolution on int8-quantized data, computed via the interleaved "sgemm" (GEMM) path
  */
-static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
     int inch = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
     int outch = top_blob.c;
 
-    const signed char* kernel = _kernel;
-
-    int nn_outch = 0;
-    int remain_outch_start = 0;
-
-    nn_outch = outch >> 3;
-    remain_outch_start = nn_outch << 3;
+    const int size = w * h;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp<nn_outch; pp++)
+    // interleave
+    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 1u);
     {
-        int p = pp * 8;
+        int nn_size = size >> 3;
+        int remain_size_start = nn_size << 3;
 
-        Mat out0 = top_blob.channel(p);
-        Mat out1 = top_blob.channel(p+1);
-        Mat out2 = top_blob.channel(p+2);
-        Mat out3 = top_blob.channel(p+3);
-        Mat out4 = top_blob.channel(p+4);
-        Mat out5 = top_blob.channel(p+5);
-        Mat out6 = top_blob.channel(p+6);
-        Mat out7 = top_blob.channel(p+7);
-
-        out0.fill(0);
-        out1.fill(0);
-        out2.fill(0);
-        out3.fill(0);
-        out4.fill(0);
-        out5.fill(0);
-        out6.fill(0);
-        out7.fill(0);
-
-        int q = 0;
-
-#ifdef __clang__
-        for (; q+15<inch; q+=16)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii=0; ii<nn_size; ii++)
         {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3;
-            int* outptr4 = out4;
-            int* outptr5 = out5;
-            int* outptr6 = out6;
-            int* outptr7 = out7;
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char* kernel1 = (const signed char*)kernel + (p+1)*inch + q;
-            const signed char* kernel2 = (const signed char*)kernel + (p+2)*inch + q;
-            const signed char* kernel3 = (const signed char*)kernel + (p+3)*inch + q;
-            const signed char* kernel4 = (const signed char*)kernel + (p+4)*inch + q;
-            const signed char* kernel5 = (const signed char*)kernel + (p+5)*inch + q;
-            const signed char* kernel6 = (const signed char*)kernel + (p+6)*inch + q;
-            const signed char* kernel7 = (const signed char*)kernel + (p+7)*inch + q;
-
-            const signed char* r0 = bottom_blob.channel(q);
-            const signed char* r1 = bottom_blob.channel(q+1);
-            const signed char* r2 = bottom_blob.channel(q+2);
-            const signed char* r3 = bottom_blob.channel(q+3);
-            const signed char* r4 = bottom_blob.channel(q+4);
-            const signed char* r5 = bottom_blob.channel(q+5);
-            const signed char* r6 = bottom_blob.channel(q+6);
-            const signed char* r7 = bottom_blob.channel(q+7);
-            const signed char* r8 = bottom_blob.channel(q+8);
-            const signed char* r9 = bottom_blob.channel(q+9);
-            const signed char* r10 = bottom_blob.channel(q+10);
-            const signed char* r11 = bottom_blob.channel(q+11);
-            const signed char* r12 = bottom_blob.channel(q+12);
-            const signed char* r13 = bottom_blob.channel(q+13);
-            const signed char* r14 = bottom_blob.channel(q+14);
-            const signed char* r15 = bottom_blob.channel(q+15);
-
-            int size = outw * outh;
-
-            int nn = size >> 4;
-            int remain = size & 15;
-
-            int8x16_t _k0 = vld1q_s8(kernel0);
-            int8x16_t _k1 = vld1q_s8(kernel1);
-            int8x16_t _k2 = vld1q_s8(kernel2);
-            int8x16_t _k3 = vld1q_s8(kernel3);
-            int8x16_t _k4 = vld1q_s8(kernel4);
-            int8x16_t _k5 = vld1q_s8(kernel5);
-            int8x16_t _k6 = vld1q_s8(kernel6);
-            int8x16_t _k7 = vld1q_s8(kernel7);
-
-            if (nn > 0)
-            {
-            asm volatile(
-                "prfm   pldl1keep, [%9, #128]        \n"
-                "prfm   pldl1keep, [%10, #128]       \n"
-                "prfm   pldl1keep, [%11, #128]       \n"
-                "prfm   pldl1keep, [%12, #128]       \n"
-                "ld1    {v8.16b}, [%9], #16          \n" // r0"
-                "ld1    {v9.16b}, [%10], #16         \n" // r1"
-                "ld1    {v10.16b}, [%11], #16        \n" // r2"
-                "ld1    {v11.16b}, [%12], #16        \n" // r3"
-
-                "dup    v24.16b, %50.b[0]            \n" // k00
-                "dup    v25.16b, %50.b[1]            \n" // k01
-                "dup    v26.16b, %50.b[2]            \n" // k02
-                "dup    v27.16b, %50.b[3]            \n" // k03
-
-                "0:                                  \n"
-                "smull  v28.8h, v8.8b, v24.8b        \n" // r0 * k0
-                "smull2  v31.8h, v8.16b, v24.16b     \n" // r0n * k0
-                "prfm   pldl1keep, [%13, #128]       \n"
-                "prfm   pldl1keep, [%14, #128]       \n"
-                "prfm   pldl1keep, [%15, #128]       \n"
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n" // r0 * k1
-                "smlal2  v31.8h, v9.16b, v25.16b     \n" // r0n * k1
-                "prfm   pldl1keep, [%16, #128]       \n"
-                "ld1    {v12.16b}, [%13], #16        \n" // r4"   
-                "ld1    {v13.16b}, [%14], #16        \n" // r5"
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "ld1    {v14.16b}, [%15], #16        \n" // r6"
-                "ld1    {v15.16b}, [%16], #16        \n" // r7"                               
-                "dup    v24.16b, %50.b[4]            \n" // k04
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"
-                "dup    v25.16b, %50.b[5]            \n" // k05
-                "dup    v26.16b, %50.b[6]            \n" // k06
-                "dup    v27.16b, %50.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n" // r4
-                "smlal2  v31.8h, v12.16b, v24.16b    \n" // r4
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%1]       \n" // sum0  
-                "prfm   pldl1keep, [%17, #128]       \n"
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "ld1    {v16.16b}, [%17], #16        \n" // r8"   
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "ld1    {v17.16b}, [%18], #16        \n" // r9"
-                "ld1    {v18.16b}, [%19], #16        \n" // r10"
-                "ld1    {v19.16b}, [%20], #16        \n" // r11"             
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v24.16b, %50.b[8]            \n" // k08
-                "dup    v25.16b, %50.b[9]            \n" // k09
-                "dup    v26.16b, %50.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n" // r8
-                "smlal2  v31.8h, v16.16b, v24.16b    \n" // r8
-                "dup    v27.16b, %50.b[11]           \n" // k11
-                "prfm   pldl1keep, [%21, #128]       \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "ld1    {v20.16b}, [%21], #16        \n" // r12"
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "ld1    {v21.16b}, [%22], #16        \n" // r13"
-                "ld1    {v22.16b}, [%23], #16        \n" // r14"
-                "ld1    {v23.16b}, [%24], #16        \n" // r15"              
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v24.16b, %50.b[12]           \n" // k12
-                "dup    v25.16b, %50.b[13]           \n" // k13
-                "dup    v26.16b, %50.b[14]           \n" // k14
-
-                "smlal  v28.8h, v20.8b, v24.8b       \n" // r12
-                "smlal2  v31.8h, v20.16b, v24.16b    \n" // r12
-                "dup    v27.16b, %50.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %51.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %51.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"             
-                "dup    v26.16b, %51.b[2]            \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v27.16b, %51.b[3]            \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%1], #32  \n" // sum0
-
-                "ld1    {v29.4s, v30.4s}, [%1]       \n" // sum0
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n" 
-                "st1    {v29.4s, v30.4s}, [%1], #32  \n" // sum0             
-                //########################################### 
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %51.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %51.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %51.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"         
-                "dup    v27.16b, %51.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%2]       \n" // sum1
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %51.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %51.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %51.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %51.b[11]           \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %51.b[12]           \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %51.b[13]           \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %51.b[14]           \n" // k14
-
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %51.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %52.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %52.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"        
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"     
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v26.16b, %52.b[2]            \n" // k02
-                "dup    v27.16b, %52.b[3]            \n" // k03  
-
-                "st1    {v29.4s, v30.4s}, [%2], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%2]       \n" // sum1
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"   
-                "st1    {v29.4s, v30.4s}, [%2], #32  \n"             
-                //########################################### // sum1
-
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %52.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %52.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %52.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"             
-                "dup    v27.16b, %52.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%3]       \n" // sum2 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %52.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %52.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %52.b[10]           \n" // k10
-
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %52.b[11]           \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %52.b[12]           \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %52.b[13]           \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %52.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %52.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %53.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %53.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"             
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.16b, %53.b[2]            \n" // k02
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v27.16b, %53.b[3]            \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%3], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%3]       \n" // sum2 
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"
-                "st1    {v29.4s, v30.4s}, [%3], #32  \n"
-                //########################################### //sum 2
-
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %53.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %53.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %53.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"             
-                "dup    v27.16b, %53.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%4]       \n" // sum3 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %53.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %53.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %53.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %53.b[11]           \n" // k11
+            int i = ii * 8;
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %53.b[12]           \n" // k12
+            const signed char* img0 = bottom_blob.channel(0);
+            img0 += i;
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %53.b[13]           \n" // k13
+            signed char* tmpptr = tmp.channel(i/8);
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %53.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %53.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %54.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %54.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"         
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.16b, %54.b[2]            \n" // k02
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v27.16b, %54.b[3]            \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%4], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%4]       \n" // sum3 
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"
-                "st1    {v29.4s, v30.4s}, [%4], #32  \n"
-                //########################################### // sum3
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %54.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %54.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %54.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"         
-                "dup    v27.16b, %54.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%5]       \n" // sum4
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %54.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %54.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %54.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %54.b[11]           \n" // k11
+            for (int q=0; q<inch; q++)
+            {
+#if 0 //__ARM_NEON
+                asm volatile(
+                    "pld        [%0, #64]     \n"
+                    "vld1.s8   {d0}, [%0]     \n"
+                    "vst1.s8   {d0}, [%1]!    \n"
+                    : "=r"(img0),   // %0
+                      "=r"(tmpptr)  // %1
+                    : "0"(img0),
+                      "1"(tmpptr)
+                    : "memory", "d0"
+                );
+                img0 += bottom_blob.cstep;
+#else                
+                tmpptr[0] = img0[0];
+                tmpptr[1] = img0[1];
+                tmpptr[2] = img0[2];
+                tmpptr[3] = img0[3];
+                tmpptr[4] = img0[4];
+                tmpptr[5] = img0[5];
+                tmpptr[6] = img0[6];
+                tmpptr[7] = img0[7];
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %54.b[12]           \n" // k12
+                tmpptr += 8;
+                img0 += bottom_blob.cstep;
+#endif // __ARM_NEON__                
+            }
+        }
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %54.b[13]           \n" // k13
+        nn_size = (size - remain_size_start) >> 2;
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %54.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %54.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %55.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %55.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"
-                "dup    v26.16b, %55.b[2]            \n" // k02    
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.16b, %55.b[3]            \n" // k03
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                
-                "st1    {v29.4s, v30.4s}, [%5], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%5]       \n" // sum4
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"
-                "st1    {v29.4s, v30.4s}, [%5], #32  \n"
-                //########################################### // sum4
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %55.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %55.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %55.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"             
-                "dup    v27.16b, %55.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%6]       \n" // sum5  
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %55.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %55.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %55.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %55.b[11]           \n" // k11
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii=0; ii<nn_size; ii++)
+        {
+            int i = remain_size_start + ii * 4;
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %55.b[12]           \n" // k12
+            const signed char* img0 = bottom_blob.channel(0);
+            img0 += i;
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %55.b[13]           \n" // k13
+            signed char* tmpptr = tmp.channel(i/8 + (i%8)/4);
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %55.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %55.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %56.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %56.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"             
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.16b, %56.b[2]            \n" // k02
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v27.16b, %56.b[3]            \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%6], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%6]       \n" // sum5 
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"
-                "st1    {v29.4s, v30.4s}, [%6], #32  \n"
-                //########################################### // sum5
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %56.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %56.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %56.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"         
-                "dup    v27.16b, %56.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%7]       \n" // sum6 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %56.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %56.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"     
-                "dup    v26.16b, %56.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %56.b[11]           \n" // k11
+            for (int q=0; q<inch; q++)
+            {
+                tmpptr[0] = img0[0];
+                tmpptr[1] = img0[1];
+                tmpptr[2] = img0[2];
+                tmpptr[3] = img0[3];
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %56.b[12]           \n" // k12
+                tmpptr += 4;
+                img0 += bottom_blob.cstep;
+            }            
+        }        
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %56.b[13]           \n" // k13
+        remain_size_start += nn_size << 2;
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %56.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %56.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "dup    v24.16b, %57.b[0]            \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "dup    v25.16b, %57.b[1]            \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"             
-                "dup    v26.16b, %57.b[2]            \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v27.16b, %57.b[3]            \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%7], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%7]       \n" // sum6 
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"     
-                "st1    {v29.4s, v30.4s}, [%7], #32  \n"                           
-                //########################################### // sum6
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "smull2  v31.8h, v8.16b, v24.16b     \n"
-                "dup    v24.16b, %57.b[4]            \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "smlal2  v31.8h, v9.16b, v25.16b     \n"
-                "dup    v25.16b, %57.b[5]            \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "smlal2  v31.8h, v10.16b, v26.16b    \n"
-                "dup    v26.16b, %57.b[6]            \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "smlal2  v31.8h, v11.16b, v27.16b    \n"             
-                "dup    v27.16b, %57.b[7]            \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "smlal2  v31.8h, v12.16b, v24.16b    \n"
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%8]       \n" // sum7 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "smlal2  v31.8h, v13.16b, v25.16b    \n"
-                "dup    v24.16b, %57.b[8]            \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "smlal2  v31.8h, v14.16b, v26.16b    \n"
-                "dup    v25.16b, %57.b[9]            \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "smlal2  v31.8h, v15.16b, v27.16b    \n"
-                "dup    v26.16b, %57.b[10]           \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "smlal2  v31.8h, v16.16b, v24.16b    \n"
-                "dup    v27.16b, %57.b[11]           \n" // k11
-                
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "smlal2  v31.8h, v17.16b, v25.16b    \n"
-                "dup    v24.16b, %57.b[12]           \n" // k12
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i=remain_size_start; i<size; i++)
+        {
+            const signed char* img0 = bottom_blob.channel(0);
+            img0 += i;
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "smlal2  v31.8h, v18.16b, v26.16b    \n"
-                "dup    v25.16b, %57.b[13]           \n" // k13
+            signed char* tmpptr = tmp.channel(i/8 + (i%8)/4 + i%4);
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "smlal2  v31.8h, v19.16b, v27.16b    \n"
-                "dup    v26.16b, %57.b[14]           \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "smlal2  v31.8h, v20.16b, v24.16b    \n"
-                "dup    v27.16b, %57.b[15]           \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "smlal2  v31.8h, v21.16b, v25.16b    \n"
-                "prfm   pldl1keep, [%9, #128]        \n"
-                "prfm   pldl1keep, [%10, #128]       \n"
-                "ld1    {v8.16b}, [%9], #16          \n" // r0"
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "smlal2  v31.8h, v22.16b, v26.16b    \n"
-                "ld1    {v9.16b}, [%10], #16         \n" // r1"
-                "prfm   pldl1keep, [%11, #128]       \n"
-                "prfm   pldl1keep, [%12, #128]       \n"
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"    
-                "smlal2  v31.8h, v23.16b, v27.16b    \n"         
-                "ld1    {v10.16b}, [%11], #16        \n" // r2"
-                "ld1    {v11.16b}, [%12], #16        \n" // r3"
-                "dup    v24.16b, %50.b[0]            \n" // k00                    
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v25.16b, %50.b[1]            \n" // k01
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v26.16b, %50.b[2]            \n" // k02
-                "dup    v27.16b, %50.b[3]            \n" // k03                
-
-                "st1    {v29.4s, v30.4s}, [%8], #32  \n"
-
-                "ld1    {v29.4s, v30.4s}, [%8]       \n" // sum7 
-                "saddw  v29.4s, v29.4s, v31.4h       \n"
-                "saddw2 v30.4s, v30.4s, v31.8h       \n"
-                "st1    {v29.4s, v30.4s}, [%8], #32  \n"
-                //########################################### // sum7
-                "subs   %w0, %w0, #1                 \n"
-                "bne    0b                           \n"
-                "sub    %9, %9, #16                  \n"
-                "sub    %10, %10, #16                \n"
-                "sub    %11, %11, #16                \n"
-                "sub    %12, %12, #16                \n"
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7),// %8
-                  "=r"(r0),     // %9
-                  "=r"(r1),     // %10
-                  "=r"(r2),     // %11
-                  "=r"(r3),     // %12
-                  "=r"(r4),     // %13
-                  "=r"(r5),     // %14
-                  "=r"(r6),     // %15
-                  "=r"(r7),     // %16
-                  "=r"(r8),     // %17
-                  "=r"(r9),     // %18
-                  "=r"(r10),    // %19
-                  "=r"(r11),    // %20
-                  "=r"(r12),    // %21
-                  "=r"(r13),    // %22
-                  "=r"(r14),    // %23
-                  "=r"(r15)     // %24
-                : "0"(nn),
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "9"(r0),
-                  "10"(r1),
-                  "11"(r2),
-                  "12"(r3),
-                  "13"(r4),
-                  "14"(r5),
-                  "15"(r6),
-                  "16"(r7),
-                  "17"(r8),
-                  "18"(r9),
-                  "19"(r10),
-                  "20"(r11),
-                  "21"(r12),
-                  "22"(r13),
-                  "23"(r14),
-                  "24"(r15),
-                  "w"(_k0),     // %50
-                  "w"(_k1),     // %51
-                  "w"(_k2),     // %52
-                  "w"(_k3),     // %53
-                  "w"(_k4),     // %54
-                  "w"(_k5),     // %55
-                  "w"(_k6),     // %56
-                  "w"(_k7)      // %57
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
-            );                     
+            for (int q=0; q<inch; q++)
+            {
+                tmpptr[0] = img0[0];
+                tmpptr++;
+                img0 += bottom_blob.cstep;
             }
+        }
+    }
 
-            if (remain >= 8)
-            {
-                remain -= 8;
+    // sgemm process
+    int nn_outch = 0;
+    int remain_outch_start = 0;
 
-            asm volatile(
-                "prfm   pldl1keep, [%9, #128]        \n"
-                "prfm   pldl1keep, [%10, #128]       \n"
-                "prfm   pldl1keep, [%11, #128]       \n"
-                "prfm   pldl1keep, [%12, #128]       \n"
-                "ld1    {v8.8b}, [%9], #8            \n" // r0"
-                "ld1    {v9.8b}, [%10], #8           \n" // r1"
-                "ld1    {v10.8b}, [%11], #8          \n" // r2"
-                "ld1    {v11.8b}, [%12], #8          \n" // r3"
-
-                "dup    v24.8b, %50.b[0]             \n" // k00
-                "dup    v25.8b, %50.b[1]             \n" // k01
-                "dup    v26.8b, %50.b[2]             \n" // k02
-                "dup    v27.8b, %50.b[3]             \n" // k03
-
-                "smull  v28.8h, v8.8b, v24.8b        \n" // r0
-                "prfm   pldl1keep, [%13, #128]       \n"
-                "prfm   pldl1keep, [%14, #128]       \n"
-                "prfm   pldl1keep, [%15, #128]       \n"
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "prfm   pldl1keep, [%16, #128]       \n"
-                "ld1    {v12.8b}, [%13], #8          \n" // r4" 
-                "ld1    {v13.8b}, [%14], #8          \n" // r5"
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "ld1    {v14.8b}, [%15], #8          \n" // r6"
-                "ld1    {v15.8b}, [%16], #8          \n" // r7"                         
-                "dup    v24.8b, %50.b[4]             \n" // k04
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "dup    v25.8b, %50.b[5]             \n" // k05
-                "dup    v26.8b, %50.b[6]             \n" // k06
-                "dup    v27.8b, %50.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n" // r4
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%1]       \n" // sum0  
-                "prfm   pldl1keep, [%17, #128]       \n"
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "ld1    {v16.8b}, [%17], #8          \n" // r8" 
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "ld1    {v17.8b}, [%18], #8          \n" // r9"
-                "ld1    {v18.8b}, [%19], #8          \n" // r10"
-                "ld1    {v19.8b}, [%20], #8          \n" // r11"
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v24.8b, %50.b[8]             \n" // k08
-                "dup    v25.8b, %50.b[9]             \n" // k09
-                "dup    v26.8b, %50.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n" // r8
-                "dup    v27.8b, %50.b[11]            \n" // k11
-                "prfm   pldl1keep, [%21, #128]       \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
+    nn_outch = (outch - remain_outch_start) >> 2;    
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "ld1    {v20.8b}, [%21], #8          \n" // r12"
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp=0; pp<nn_outch; pp++)
+    {
+        int p = remain_outch_start + pp * 4;
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "ld1    {v21.8b}, [%22], #8          \n" // r13"
-                "ld1    {v22.8b}, [%23], #8          \n" // r14"
-                "ld1    {v23.8b}, [%24], #8          \n" // r15" 
+        int* outptr0 = top_blob.channel(p);
+        int* outptr1 = top_blob.channel(p+1);
+        int* outptr2 = top_blob.channel(p+2);
+        int* outptr3 = top_blob.channel(p+3);
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v24.8b, %50.b[12]            \n" // k12
-                "dup    v25.8b, %50.b[13]            \n" // k13
-                "dup    v26.8b, %50.b[14]            \n" // k14
+        int i = 0;
 
-                "smlal  v28.8h, v20.8b, v24.8b       \n" // r12
-                "dup    v27.8b, %50.b[15]            \n" // k15
+        for (; i+7<size; i+=8)
+        {
+            const signed char* tmpptr = tmp.channel(i/8);
+            const signed char* kptr = kernel.channel(p/4);
+#if 0 //__ARM_NEON
+            asm volatile(
+                // inch loop
+                "vmov.s32    q6, #0            \n"
+                "vmov.s32    q7, #0            \n"
+                "vmov.s32    q8, #0            \n"
+                "vmov.s32    q9, #0            \n"
+                "vmov.s32    q10, #0           \n"
+                "vmov.s32    q11, #0           \n"
+                "vmov.s32    q12, #0           \n"
+                "vmov.s32    q13, #0           \n"
 
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                 "dup    v24.8b, %51.b[0]             \n" // k00
+                "lsr         r4, %12, #2       \n"// r4 = nn = inch >> 2
+                "cmp         r4, #0            \n"
+                "beq         1f                \n"
+                
+                "0:                            \n"// for(; nn != 0; nn--)
+                "pld         [%4, #128]        \n"
+                "vld1.s8     {d4-d7}, [%4]!    \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37    a(inch)(data)
+                "vmovl.s8    q5, d7            \n"// a30-a37
+                "vmovl.s8    q4, d6            \n"// a20-a27
+                "vmovl.s8    q3, d5            \n"// a10-a17
+                "vmovl.s8    q2, d4            \n"// a00-a07
 
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[1]             \n" // k01
+                "vld1.s8     {d0-d1}, [%5]!    \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33    k(outch)(inch)
+                "vmovl.s8    q1, d1            \n"// k02-k32,k03-k33
+                "vmovl.s8    q0, d0            \n"// k00-k30,k01-k31
 
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %51.b[2]             \n" // k02
+                "vmlal.s16   q6, d4, d0[0]     \n"// sum0 = (a00-a07) * k00
+                "vmlal.s16   q7, d5, d0[0]     \n"
+                "vmlal.s16   q8, d4, d0[1]     \n"// sum1 = (a00-a07) * k10
+                "vmlal.s16   q9, d5, d0[1]     \n"
+                "vmlal.s16   q10, d4, d0[2]    \n"// sum2 = (a00-a07) * k20
+                "vmlal.s16   q11, d5, d0[2]    \n"
+                "vmlal.s16   q12, d4, d0[3]    \n"// sum3 = (a00-a07) * k30
+                "vmlal.s16   q13, d5, d0[3]    \n"
 
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
+                "vmlal.s16   q6, d6, d1[0]     \n"// sum0 += (a10-a17) * k01
+                "vmlal.s16   q7, d7, d1[0]     \n"
+                "vmlal.s16   q8, d6, d1[1]     \n"// sum1 += (a10-a17) * k11
+                "vmlal.s16   q9, d7, d1[1]     \n"
+                "vmlal.s16   q10, d6, d1[2]    \n"// sum2 += (a10-a17) * k21
+                "vmlal.s16   q11, d7, d1[2]    \n"
+                "vmlal.s16   q12, d6, d1[3]    \n"// sum3 += (a10-a17) * k31
+                "vmlal.s16   q13, d7, d1[3]    \n"
 
-                "dup    v27.8b, %51.b[3]             \n" // k03
+                "vmlal.s16   q6, d8, d2[0]     \n"// sum0 += (a20-a27) * k02
+                "vmlal.s16   q7, d9, d2[0]     \n"
+                "vmlal.s16   q8, d8, d2[1]     \n"// sum1 += (a20-a27) * k12
+                "vmlal.s16   q9, d9, d2[1]     \n"
+                "vmlal.s16   q10, d8, d2[2]    \n"// sum2 += (a20-a27) * k22
+                "vmlal.s16   q11, d9, d2[2]    \n"
+                "vmlal.s16   q12, d8, d2[3]    \n"// sum3 += (a20-a27) * k32
+                "vmlal.s16   q13, d9, d2[3]    \n"  
 
-                "st1    {v29.4s, v30.4s}, [%1], #32  \n" // sum0
-                //########################################### 
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %51.b[4]             \n" // k04
+                "vmlal.s16   q6, d10, d3[0]    \n"// sum0 += (a30-a37) * k03
+                "vmlal.s16   q7, d11, d3[0]    \n"
+                "vmlal.s16   q8, d10, d3[1]    \n"// sum1 += (a30-a37) * k13
+                "vmlal.s16   q9, d11, d3[1]    \n"
+                "vmlal.s16   q10, d10, d3[2]   \n"// sum2 += (a30-a37) * k23
+                "vmlal.s16   q11, d11, d3[2]   \n"
+                "vmlal.s16   q12, d10, d3[3]   \n"// sum3 += (a30-a37) * k33
+                "vmlal.s16   q13, d11, d3[3]   \n"                  
 
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %51.b[5]             \n" // k05
+                "subs        r4, r4, #1        \n"
+                "bne         0b                \n"// end for
+ 
+                "1:                            \n"
+                // remain loop
+                "and         r4, %12, #3       \n"// r4 = remain = inch & 3
+                "cmp         r4, #0            \n"
+                "beq         3f                \n"
 
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %51.b[6]             \n" // k06
+                "2:                            \n"// for(; remain != 0; remain--)
+                "vld1.s8     {d2}, [%4]!       \n"// tmpr a00-a07    a(inch)(data)
+                "vld1.s8     {d0}, [%5]        \n"// kptr k00-k30    k(outch)(inch)
+                "vmovl.s8    q1, d2            \n"
+                "vmovl.s8    q0, d0            \n"
+                "add         %5, #4            \n"
 
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %51.b[7]             \n" // k07
+                "vmlal.s16   q6, d2, d0[0]     \n"// sum0 += (a00-a07) * k00
+                "vmlal.s16   q7, d3, d0[0]     \n"
+                "vmlal.s16   q8, d2, d0[1]     \n"// sum1 += (a00-a07) * k10
+                "vmlal.s16   q9, d3, d0[1]     \n"
+                "vmlal.s16   q10, d2, d0[2]    \n"// sum2 += (a00-a07) * k20
+                "vmlal.s16   q11, d3, d0[2]    \n"
+                "vmlal.s16   q12, d2, d0[3]    \n"// sum3 += (a00-a07) * k30
+                "vmlal.s16   q13, d3, d0[3]    \n"    
 
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%2]       \n" // sum1
+                "subs        r4, r4, #1        \n"
+                "bne         2b                \n"
 
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %51.b[8]             \n" // k08
+                "3:                            \n"// store the result to memory
+                "vst1.s32    {d12-d15}, [%0]!  \n"
+                "vst1.s32    {d16-d19}, [%1]!  \n"
+                "vst1.s32    {d20-d23}, [%2]!  \n"
+                "vst1.s32    {d24-d27}, [%3]!  \n"
 
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[9]             \n" // k09
+                : "=r"(outptr0), // %0
+                  "=r"(outptr1), // %1
+                  "=r"(outptr2), // %2
+                  "=r"(outptr3), // %3
+                  "=r"(tmpptr),  // %4
+                  "=r"(kptr)     // %5
+                : "0"(outptr0),
+                  "1"(outptr1),
+                  "2"(outptr2),
+                  "3"(outptr3),
+                  "4"(tmpptr),
+                  "5"(kptr),
+                  "r"(inch)      // %12  
+                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+            );
+#else
+            int sum0_0 = 0;
+            int sum0_1 = 0;
+            int sum0_2 = 0;
+            int sum0_3 = 0;
+            int sum0_4 = 0;
+            int sum0_5 = 0;
+            int sum0_6 = 0;
+            int sum0_7 = 0;
+
+            int sum1_0 = 0;
+            int sum1_1 = 0;
+            int sum1_2 = 0;
+            int sum1_3 = 0;
+            int sum1_4 = 0;
+            int sum1_5 = 0;
+            int sum1_6 = 0;
+            int sum1_7 = 0;
+
+            int sum2_0 = 0;
+            int sum2_1 = 0;
+            int sum2_2 = 0;
+            int sum2_3 = 0;
+            int sum2_4 = 0;
+            int sum2_5 = 0;
+            int sum2_6 = 0;
+            int sum2_7 = 0;
+
+            int sum3_0 = 0;
+            int sum3_1 = 0;
+            int sum3_2 = 0;
+            int sum3_3 = 0;
+            int sum3_4 = 0;
+            int sum3_5 = 0;
+            int sum3_6 = 0;
+            int sum3_7 = 0;
 
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %51.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %51.b[11]            \n" // k11
+            for (int q=0; q<inch; q++)
+            {
+                sum0_0 += tmpptr[0] * kptr[0];
+                sum0_1 += tmpptr[1] * kptr[0];
+                sum0_2 += tmpptr[2] * kptr[0];
+                sum0_3 += tmpptr[3] * kptr[0];
+                sum0_4 += tmpptr[4] * kptr[0];
+                sum0_5 += tmpptr[5] * kptr[0];
+                sum0_6 += tmpptr[6] * kptr[0];
+                sum0_7 += tmpptr[7] * kptr[0];
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %51.b[12]            \n" // k12
+                sum1_0 += tmpptr[0] * kptr[1];
+                sum1_1 += tmpptr[1] * kptr[1];
+                sum1_2 += tmpptr[2] * kptr[1];
+                sum1_3 += tmpptr[3] * kptr[1];
+                sum1_4 += tmpptr[4] * kptr[1];
+                sum1_5 += tmpptr[5] * kptr[1];
+                sum1_6 += tmpptr[6] * kptr[1];
+                sum1_7 += tmpptr[7] * kptr[1];
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[13]            \n" // k13
+                sum2_0 += tmpptr[0] * kptr[2];
+                sum2_1 += tmpptr[1] * kptr[2];
+                sum2_2 += tmpptr[2] * kptr[2];
+                sum2_3 += tmpptr[3] * kptr[2];
+                sum2_4 += tmpptr[4] * kptr[2];
+                sum2_5 += tmpptr[5] * kptr[2];
+                sum2_6 += tmpptr[6] * kptr[2];
+                sum2_7 += tmpptr[7] * kptr[2];
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %51.b[14]            \n" // k14
+                sum3_0 += tmpptr[0] * kptr[3];
+                sum3_1 += tmpptr[1] * kptr[3];
+                sum3_2 += tmpptr[2] * kptr[3];
+                sum3_3 += tmpptr[3] * kptr[3];
+                sum3_4 += tmpptr[4] * kptr[3];
+                sum3_5 += tmpptr[5] * kptr[3];
+                sum3_6 += tmpptr[6] * kptr[3];
+                sum3_7 += tmpptr[7] * kptr[3];
 
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %51.b[15]            \n" // k15
+                tmpptr += 8;
+                kptr += 4;
+            }
 
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[0]             \n" // k00
+            outptr0[0] = sum0_0;
+            outptr0[1] = sum0_1;
+            outptr0[2] = sum0_2;
+            outptr0[3] = sum0_3;
+            outptr0[4] = sum0_4;
+            outptr0[5] = sum0_5;
+            outptr0[6] = sum0_6;
+            outptr0[7] = sum0_7;
 
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[1]             \n" // k01
+            outptr1[0] = sum1_0;
+            outptr1[1] = sum1_1;
+            outptr1[2] = sum1_2;
+            outptr1[3] = sum1_3;
+            outptr1[4] = sum1_4;
+            outptr1[5] = sum1_5;
+            outptr1[6] = sum1_6;
+            outptr1[7] = sum1_7;
 
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
+            outptr2[0] = sum2_0;
+            outptr2[1] = sum2_1;
+            outptr2[2] = sum2_2;
+            outptr2[3] = sum2_3;
+            outptr2[4] = sum2_4;
+            outptr2[5] = sum2_5;
+            outptr2[6] = sum2_6;
+            outptr2[7] = sum2_7;
 
-                "dup    v26.8b, %52.b[2]             \n" // k02
-                "dup    v27.8b, %52.b[3]             \n" // k03  
+            outptr3[0] = sum3_0;
+            outptr3[1] = sum3_1;
+            outptr3[2] = sum3_2;
+            outptr3[3] = sum3_3;
+            outptr3[4] = sum3_4;
+            outptr3[5] = sum3_5;
+            outptr3[6] = sum3_6;
+            outptr3[7] = sum3_7;
 
-                "st1    {v29.4s, v30.4s}, [%2], #32  \n"
-                //########################################### // sum1
+            outptr0 += 8;
+            outptr1 += 8;
+            outptr2 += 8;
+            outptr3 += 8;
+#endif // __ARM_NEON            
+        }    
 
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %52.b[4]             \n" // k04
+        for (; i+3<size; i+=4)
+        {
+            const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4);
+            const signed char* kptr = kernel.channel(p/4);
+#if 0 //__ARM_NEON
+            asm volatile(
+                // inch loop
+                "vmov.s32    q6, #0            \n"
+                "vmov.s32    q7, #0            \n"
+                "vmov.s32    q8, #0            \n"
+                "vmov.s32    q9, #0            \n"
 
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %52.b[5]             \n" // k05
+                "lsr         r4, %12, #2       \n"// r4 = nn = inch >> 2
+                "cmp         r4, #0            \n"
+                "beq         1f                \n"
+                
+                "0:                            \n"// for(; nn != 0; nn--)
+                "pld         [%4, #128]        \n"
+                "vld1.s8     {d4-d5}, [%4]!    \n"// tmpr a00-a03,a10-a13,a20-a23,a30-a33    a(inch)(data)
+                "vmovl.s8    q3, d5            \n"// a20-a23,a30-a33
+                "vmovl.s8    q2, d4            \n"// a00-a03,a10-a13
 
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %52.b[6]             \n" // k06
+                "vld1.s8     {d0-d1}, [%5]!    \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33    k(outch)(inch)
+                "vmovl.s8    q1, d1            \n"// k02-k32,k03-k33
+                "vmovl.s8    q0, d0            \n"// k00-k30,k01-k31
 
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %52.b[7]             \n" // k07
+                "vmlal.s16   q6, d4, d0[0]     \n"// sum0 = (a00-a03) * k00
+                "vmlal.s16   q7, d4, d0[1]     \n"// sum1 = (a00-a03) * k10
+                "vmlal.s16   q8, d4, d0[2]     \n"// sum2 = (a00-a03) * k20
+                "vmlal.s16   q9, d4, d0[3]     \n"// sum3 = (a00-a03) * k30
 
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%3]       \n" // sum2 
+                "vmlal.s16   q6, d5, d1[0]     \n"// sum0 += (a10-a13) * k01
+                "vmlal.s16   q7, d5, d1[1]     \n"// sum1 += (a10-a13) * k11
+                "vmlal.s16   q8, d5, d1[2]     \n"// sum2 += (a10-a13) * k21
+                "vmlal.s16   q9, d5, d1[3]     \n"// sum3 += (a10-a13) * k31
 
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[8]             \n" // k08
+                "vmlal.s16   q6, d6, d2[0]     \n"// sum0 += (a20-a23) * k02
+                "vmlal.s16   q7, d6, d2[1]     \n"// sum1 += (a20-a23) * k12
+                "vmlal.s16   q8, d6, d2[2]     \n"// sum2 += (a20-a23) * k22
+                "vmlal.s16   q9, d6, d2[3]     \n"// sum3 += (a20-a23) * k32
 
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[9]             \n" // k09
+                "vmlal.s16   q6, d7, d3[0]     \n"// sum0 += (a30-a33) * k03
+                "vmlal.s16   q7, d7, d3[1]     \n"// sum1 += (a30-a33) * k13
+                "vmlal.s16   q8, d7, d3[2]     \n"// sum2 += (a30-a33) * k23
+                "vmlal.s16   q9, d7, d3[3]     \n"// sum3 += (a30-a33) * k33
 
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %52.b[10]            \n" // k10
+                "subs        r4, r4, #1        \n"
+                "bne         0b                \n"// end for
+ 
+                "1:                            \n"
+                // remain loop
+                "and         r4, %12, #3       \n"// r4 = remain = inch & 3
+                "cmp         r4, #0            \n"
+                "beq         3f                \n"
 
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %52.b[11]            \n" // k11
+                "2:                            \n"// for(; remain != 0; remain--)
+                "vld1.s8     {d2}, [%4]        \n"// tmpr a00-a03    a(inch)(data)
+                "vld1.s8     {d0}, [%5]        \n"// kptr k00-k30    k(outch)(inch)
+                "vmovl.s8    q1, d2            \n"
+                "vmovl.s8    q0, d0            \n"
+                "add         %4, #4            \n"
+                "add         %5, #4            \n"
 
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[12]            \n" // k12
+                "vmlal.s16   q6, d2, d0[0]     \n"// sum0 += (a00-a03) * k00
+                "vmlal.s16   q7, d2, d0[1]     \n"// sum1 += (a00-a03) * k10
+                "vmlal.s16   q8, d2, d0[2]     \n"// sum2 += (a00-a03) * k20
+                "vmlal.s16   q9, d2, d0[3]     \n"// sum3 += (a00-a03) * k30
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[13]            \n" // k13
+                "subs        r4, r4, #1        \n"
+                "bne         2b                \n"
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %52.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %52.b[15]            \n" // k15
+                "3:                            \n"// store the result to memory
+                "vst1.s32    {d12-d13}, [%0]!  \n"
+                "vst1.s32    {d14-d15}, [%1]!  \n"
+                "vst1.s32    {d16-d17}, [%2]!  \n"
+                "vst1.s32    {d18-d19}, [%3]!  \n"
 
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[0]             \n" // k00
+                : "=r"(outptr0), // %0
+                  "=r"(outptr1), // %1
+                  "=r"(outptr2), // %2
+                  "=r"(outptr3), // %3
+                  "=r"(tmpptr),  // %4
+                  "=r"(kptr)     // %5
+                : "0"(outptr0),
+                  "1"(outptr1),
+                  "2"(outptr2),
+                  "3"(outptr3),
+                  "4"(tmpptr),
+                  "5"(kptr),
+                  "r"(inch)      // %12  
+                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+            );
+#else
+            int sum0_0 = 0;
+            int sum0_1 = 0;
+            int sum0_2 = 0;
+            int sum0_3 = 0;
+
+            int sum1_0 = 0;
+            int sum1_1 = 0;
+            int sum1_2 = 0;
+            int sum1_3 = 0;
+
+            int sum2_0 = 0;
+            int sum2_1 = 0;
+            int sum2_2 = 0;
+            int sum2_3 = 0;
+
+            int sum3_0 = 0;
+            int sum3_1 = 0;
+            int sum3_2 = 0;
+            int sum3_3 = 0;
 
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[1]             \n" // k01
+            for (int q=0; q<inch; q++)
+            {
+                sum0_0 += tmpptr[0] * kptr[0];
+                sum0_1 += tmpptr[1] * kptr[0];
+                sum0_2 += tmpptr[2] * kptr[0];
+                sum0_3 += tmpptr[3] * kptr[0];
 
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.8b, %53.b[2]             \n" // k02
+                sum1_0 += tmpptr[0] * kptr[1];
+                sum1_1 += tmpptr[1] * kptr[1];
+                sum1_2 += tmpptr[2] * kptr[1];
+                sum1_3 += tmpptr[3] * kptr[1];
 
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v27.8b, %53.b[3]             \n" // k03
+                sum2_0 += tmpptr[0] * kptr[2];
+                sum2_1 += tmpptr[1] * kptr[2];
+                sum2_2 += tmpptr[2] * kptr[2];
+                sum2_3 += tmpptr[3] * kptr[2];
 
-                "st1    {v29.4s, v30.4s}, [%3], #32  \n"
-                //########################################### //sum 2
+                sum3_0 += tmpptr[0] * kptr[3];
+                sum3_1 += tmpptr[1] * kptr[3];
+                sum3_2 += tmpptr[2] * kptr[3];
+                sum3_3 += tmpptr[3] * kptr[3];
 
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %53.b[4]             \n" // k04
+                tmpptr += 4;
+                kptr += 4;
+            }
 
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %53.b[5]             \n" // k05
+            outptr0[0] = sum0_0;
+            outptr0[1] = sum0_1;
+            outptr0[2] = sum0_2;
+            outptr0[3] = sum0_3;
 
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %53.b[6]             \n" // k06
+            outptr1[0] = sum1_0;
+            outptr1[1] = sum1_1;
+            outptr1[2] = sum1_2;
+            outptr1[3] = sum1_3;
 
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %53.b[7]             \n" // k07
+            outptr2[0] = sum2_0;
+            outptr2[1] = sum2_1;
+            outptr2[2] = sum2_2;
+            outptr2[3] = sum2_3;
 
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%4]       \n" // sum3 
+            outptr3[0] = sum3_0;
+            outptr3[1] = sum3_1;
+            outptr3[2] = sum3_2;
+            outptr3[3] = sum3_3;
 
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[8]             \n" // k08
+            outptr0 += 4;
+            outptr1 += 4;
+            outptr2 += 4;
+            outptr3 += 4;
+#endif // __ARM_NEON            
+        }
 
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[9]             \n" // k09
+        for (; i<size; i++)
+        {
+            const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4 + i%4);
+            const signed char* kptr = kernel.channel(p/4);
+#if 0 //__ARM_NEON
+            asm volatile(
+                // inch loop
+                "veor        q6, q6, q6        \n"
+                "veor        q7, q7, q7        \n"
+                "veor        q8, q8, q8        \n"
+                "veor        q9, q9, q9        \n"
+                "vmov.s32    q10, #0           \n"
 
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %53.b[10]            \n" // k10
+                "lsr         r4, %12, #2       \n"// r4 = nn = inch >> 2
+                "cmp         r4, #0            \n"
+                "beq         1f                \n"
                 
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %53.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[12]            \n" // k12
+                "0:                            \n"// for(; nn != 0; nn--)
+                "pld         [%4, #128]        \n"
+                "vld1.s8     {d4}, [%4]        \n"// tmpr a00,a10,a20,a30    a(inch)(data)
+                "add         %4, #4            \n"
+                "vmovl.s8    q2, d4            \n"// a00,a10,a20,a30
 
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[13]            \n" // k13
+                "vld1.s8     {d0-d1}, [%5]!    \n"// kptr k00-k30,k01-k31,k02-k32,k03-k33    k(outch)(inch)
+                "vmovl.s8    q1, d1            \n"// k02-k32,k03-k33
+                "vmovl.s8    q0, d0            \n"// k00-k30,k01-k31
 
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %53.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %53.b[15]            \n" // k15
+                "vmlal.s16   q6, d0, d4[0]     \n"// (k00-k30) * a00
+                "vmlal.s16   q7, d1, d4[1]     \n"// (k01-k31) * a10
+                "vmlal.s16   q8, d2, d4[2]     \n"// (k02-k32) * a20
+                "vmlal.s16   q9, d3, d4[3]     \n"// (k03-k33) * a30
 
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[0]             \n" // k00
+                "subs        r4, r4, #1        \n"
+                "bne         0b                \n"// end for
 
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[1]             \n" // k01
+                "vadd.s32    q6, q6, q7        \n"
+                "vadd.s32    q9, q9, q8        \n"
+                "vadd.s32    q10, q6, q9       \n"
+ 
+                "1:                            \n"
+                // remain loop
+                "and         r4, %12, #3       \n"// r4 = remain = inch & 3
+                "cmp         r4, #0            \n"
+                "beq         3f                \n"
 
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.8b, %54.b[2]             \n" // k02
+                "2:                            \n"// for(; remain != 0; remain--)
+                "vld1.s8     {d2}, [%4]        \n"// tmpr a00        a(inch)(data)
+                "vld1.s8     {d0}, [%5]        \n"// kptr k00-k30    k(outch)(inch)
+                "vmovl.s8    q1, d2            \n"
+                "vmovl.s8    q0, d0            \n"
+                "add         %4, #1            \n"
+                "add         %5, #4            \n"
 
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
+                "vmlal.s16   q10, d0, d2[0]    \n"
 
-                "dup    v27.8b, %54.b[3]             \n" // k03
+                "subs        r4, r4, #1        \n"
+                "bne         2b                \n"
 
-                "st1    {v29.4s, v30.4s}, [%4], #32  \n"
-                //########################################### // sum3
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %54.b[4]             \n" // k04
+                "3:                            \n"// store the result to memory
+                "vst1.s32    {d20[0]}, [%0]!   \n"
+                "vst1.s32    {d20[1]}, [%1]!   \n"
+                "vst1.s32    {d21[0]}, [%2]!   \n"
+                "vst1.s32    {d21[1]}, [%3]!   \n"
 
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %54.b[5]             \n" // k05
+                : "=r"(outptr0), // %0
+                  "=r"(outptr1), // %1
+                  "=r"(outptr2), // %2
+                  "=r"(outptr3), // %3
+                  "=r"(tmpptr),  // %4
+                  "=r"(kptr)     // %5
+                : "0"(outptr0),
+                  "1"(outptr1),
+                  "2"(outptr2),
+                  "3"(outptr3),
+                  "4"(tmpptr),
+                  "5"(kptr),
+                  "r"(inch)      // %12  
+                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+            );    
+#else
+            int sum0 = 0;
+            int sum1 = 0;
+            int sum2 = 0;
+            int sum3 = 0;
 
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %54.b[6]             \n" // k06
+            for (int q=0; q<inch; q++)
+            {
+                sum0 += tmpptr[0] * kptr[0];
+                sum1 += tmpptr[0] * kptr[1];
+                sum2 += tmpptr[0] * kptr[2];
+                sum3 += tmpptr[0] * kptr[3];
 
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %54.b[7]             \n" // k07
+                tmpptr++;
+                kptr += 4;
+            }
 
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%5]       \n" // sum4
+            outptr0[0] = sum0;
+            outptr1[0] = sum1;
+            outptr2[0] = sum2;
+            outptr3[0] = sum3;
 
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[8]             \n" // k08
+            outptr0++;
+            outptr1++;
+            outptr2++;
+            outptr3++;  
+#endif // __ARM_NEON
+        }
+    }
 
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"    
-                "dup    v26.8b, %54.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %54.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %54.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %54.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[2]             \n" // k02 
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %55.b[3]             \n" // k03
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                
-                "st1    {v29.4s, v30.4s}, [%5], #32  \n"
-                //########################################### // sum4
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %55.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %55.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %55.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %55.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%6]       \n" // sum5  
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %55.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %55.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v26.8b, %56.b[2]             \n" // k02
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v27.8b, %56.b[3]             \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%6], #32  \n"
-                //########################################### // sum5
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %56.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %56.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %56.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %56.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%7]       \n" // sum6 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"        
-                "dup    v26.8b, %56.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %56.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %56.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %56.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %57.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v27.8b, %57.b[3]             \n" // k03
-
-                "st1    {v29.4s, v30.4s}, [%7], #32  \n"
-                //########################################### // sum6
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %57.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %57.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %57.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %57.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v29.4s, v30.4s}, [%8]       \n" // sum7 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %57.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %57.b[11]            \n" // k11
-                
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %57.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %57.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "prfm   pldl1keep, [%9, #128]        \n"
-                "prfm   pldl1keep, [%10, #128]       \n"
-                "ld1    {v8.8b}, [%9], #8            \n" // r0"
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "ld1    {v9.8b}, [%10], #8           \n" // r1"
-                "prfm   pldl1keep, [%11, #128]       \n"
-                "prfm   pldl1keep, [%12, #128]       \n"
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "ld1    {v10.8b}, [%11], #8          \n" // r2"
-                "ld1    {v11.8b}, [%12], #8          \n" // r3"
-                "dup    v24.8b, %50.b[0]             \n" // k00                     
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v25.8b, %50.b[1]             \n" // k01
-
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-                "dup    v26.8b, %50.b[2]             \n" // k02
-                "dup    v27.8b, %50.b[3]             \n" // k03                
-
-                "st1    {v29.4s, v30.4s}, [%8], #32  \n"   
-                //########################################### // sum7
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7),// %8
-                  "=r"(r0),     // %9
-                  "=r"(r1),     // %10
-                  "=r"(r2),     // %11
-                  "=r"(r3),     // %12
-                  "=r"(r4),     // %13
-                  "=r"(r5),     // %14
-                  "=r"(r6),     // %15
-                  "=r"(r7),     // %16
-                  "=r"(r8),     // %17
-                  "=r"(r9),     // %18
-                  "=r"(r10),    // %19
-                  "=r"(r11),    // %20
-                  "=r"(r12),    // %21
-                  "=r"(r13),    // %22
-                  "=r"(r14),    // %23
-                  "=r"(r15)     // %24
-                : "0"(nn),
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "9"(r0),
-                  "10"(r1),
-                  "11"(r2),
-                  "12"(r3),
-                  "13"(r4),
-                  "14"(r5),
-                  "15"(r6),
-                  "16"(r7),
-                  "17"(r8),
-                  "18"(r9),
-                  "19"(r10),
-                  "20"(r11),
-                  "21"(r12),
-                  "22"(r13),
-                  "23"(r14),
-                  "24"(r15),
-                  "w"(_k0),     // %50
-                  "w"(_k1),     // %51
-                  "w"(_k2),     // %52
-                  "w"(_k3),     // %53
-                  "w"(_k4),     // %54
-                  "w"(_k5),     // %55
-                  "w"(_k6),     // %56
-                  "w"(_k7)      // %57
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
-            );                             
-            }
-
-            if (remain >= 4)
-            {
-                remain -= 4;
-
-            asm volatile(
-                "prfm   pldl1keep, [%9, #128]        \n"
-                "prfm   pldl1keep, [%10, #128]       \n"
-                "prfm   pldl1keep, [%11, #128]       \n"
-                "prfm   pldl1keep, [%12, #128]       \n"
-                "ld1    {v8.8b}, [%9], #8            \n" // r0"
-                "ld1    {v9.8b}, [%10], #8           \n" // r1"
-                "ld1    {v10.8b}, [%11], #8          \n" // r2"
-                "ld1    {v11.8b}, [%12], #8          \n" // r3"
-
-                "dup    v24.8b, %50.b[0]             \n" // k00
-                "dup    v25.8b, %50.b[1]             \n" // k01
-                "dup    v26.8b, %50.b[2]             \n" // k02
-                "dup    v27.8b, %50.b[3]             \n" // k03
-
-                "smull  v28.8h, v8.8b, v24.8b        \n" // r0
-                "prfm   pldl1keep, [%13, #128]       \n"
-                "prfm   pldl1keep, [%14, #128]       \n"
-                "prfm   pldl1keep, [%15, #128]       \n"
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "prfm   pldl1keep, [%16, #128]       \n"
-                "ld1    {v12.8b}, [%13], #8          \n" // r4" 
-                "ld1    {v13.8b}, [%14], #8          \n" // r5"
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "ld1    {v14.8b}, [%15], #8          \n" // r6"
-                "ld1    {v15.8b}, [%16], #8          \n" // r7"                         
-                "dup    v24.8b, %50.b[4]             \n" // k04
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"
-                "dup    v25.8b, %50.b[5]             \n" // k05
-                "dup    v26.8b, %50.b[6]             \n" // k06
-                "dup    v27.8b, %50.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n" // r4
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v29.4s}, [%1]               \n" // sum0  
-                "prfm   pldl1keep, [%17, #128]       \n"
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "ld1    {v16.8b}, [%17], #8          \n" // r8" 
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "ld1    {v17.8b}, [%18], #8          \n" // r9"
-                "ld1    {v18.8b}, [%19], #8          \n" // r10"
-                "ld1    {v19.8b}, [%20], #8          \n" // r11"
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v24.8b, %50.b[8]             \n" // k08
-                "dup    v25.8b, %50.b[9]             \n" // k09
-                "dup    v26.8b, %50.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n" // r8
-                "dup    v27.8b, %50.b[11]            \n" // k11
-                "prfm   pldl1keep, [%21, #128]       \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "ld1    {v20.8b}, [%21], #8          \n" // r12"
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "ld1    {v21.8b}, [%22], #8          \n" // r13"
-                "ld1    {v22.8b}, [%23], #8          \n" // r14"
-                "ld1    {v23.8b}, [%24], #8          \n" // r15" 
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v24.8b, %50.b[12]            \n" // k12
-                "dup    v25.8b, %50.b[13]            \n" // k13
-                "dup    v26.8b, %50.b[14]            \n" // k14
-
-                "smlal  v28.8h, v20.8b, v24.8b       \n" // r12
-                "dup    v27.8b, %50.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %51.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %51.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %51.b[3]             \n" // k03
-
-                "st1    {v29.4s}, [%1], #16          \n" // sum0
-                //########################################### 
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %51.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %51.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %51.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %51.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v29.4s}, [%2]               \n" // sum1
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %51.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %51.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %51.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %51.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %51.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %51.b[14]            \n" // k14
-
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %51.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %52.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %52.b[3]             \n" // k03  
-
-                "st1    {v29.4s}, [%2], #16          \n"
-                //########################################### // sum1
-
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %52.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %52.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %52.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %52.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v29.4s}, [%3]               \n" // sum2 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %52.b[10]            \n" // k10
-
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %52.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %52.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %52.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %52.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %52.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %53.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %53.b[3]             \n" // k03
-
-                "st1    {v29.4s}, [%3], #16          \n"
-                //########################################### //sum 2
-
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %53.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %53.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %53.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %53.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v29.4s}, [%4]               \n" // sum3 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %53.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %53.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %53.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %53.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %53.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %53.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %54.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %54.b[3]             \n" // k03
-
-                "st1    {v29.4s}, [%4], #16          \n"
-                //########################################### // sum3
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %54.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %54.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %54.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %54.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v29.4s}, [%5]               \n" // sum4
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"    
-                "dup    v26.8b, %54.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %54.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %54.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %54.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %54.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %54.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[2]             \n" // k02 
-            
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %55.b[3]             \n" // k03
-                
-                "st1    {v29.4s}, [%5], #16          \n"
-                //########################################### // sum4
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %55.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %55.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %55.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %55.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v29.4s}, [%6]               \n" // sum5  
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %55.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %55.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %55.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %55.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %55.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %56.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "dup    v27.8b, %56.b[3]             \n" // k03
-
-                "st1    {v29.4s}, [%6], #16          \n"
-                //########################################### // sum5
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %56.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %56.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %56.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %56.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v29.4s}, [%7]               \n" // sum6 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"        
-                "dup    v26.8b, %56.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %56.b[11]            \n" // k11
-
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %56.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %56.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %56.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %56.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[0]             \n" // k00
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[1]             \n" // k01
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"                
-                "dup    v26.8b, %57.b[2]             \n" // k02
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "saddw2 v30.4s, v30.4s, v28.8h       \n"
-
-                "dup    v27.8b, %57.b[3]             \n" // k03
-
-                "st1    {v29.4s}, [%7], #16          \n"
-                //########################################### // sum6
-                "smull  v28.8h, v8.8b, v24.8b        \n"
-                "dup    v24.8b, %57.b[4]             \n" // k04
-
-                "smlal  v28.8h, v9.8b, v25.8b        \n"
-                "dup    v25.8b, %57.b[5]             \n" // k05
-
-                "smlal  v28.8h, v10.8b, v26.8b       \n"
-                "dup    v26.8b, %57.b[6]             \n" // k06
-
-                "smlal  v28.8h, v11.8b, v27.8b       \n"                
-                "dup    v27.8b, %57.b[7]             \n" // k07
-
-                "smlal  v28.8h, v12.8b, v24.8b       \n"
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v29.4s}, [%8]               \n" // sum7 
-
-                "smlal  v28.8h, v13.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[8]             \n" // k08
-
-                "smlal  v28.8h, v14.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[9]             \n" // k09
-
-                "smlal  v28.8h, v15.8b, v27.8b       \n"
-                "dup    v26.8b, %57.b[10]            \n" // k10
-                
-                "smlal  v28.8h, v16.8b, v24.8b       \n"
-                "dup    v27.8b, %57.b[11]            \n" // k11
-                
-                "smlal  v28.8h, v17.8b, v25.8b       \n"
-                "dup    v24.8b, %57.b[12]            \n" // k12
-
-                "smlal  v28.8h, v18.8b, v26.8b       \n"
-                "dup    v25.8b, %57.b[13]            \n" // k13
-
-                "smlal  v28.8h, v19.8b, v27.8b       \n"
-                "dup    v26.8b, %57.b[14]            \n" // k14
-                
-                "smlal  v28.8h, v20.8b, v24.8b       \n"
-                "dup    v27.8b, %57.b[15]            \n" // k15
-
-                "smlal  v28.8h, v21.8b, v25.8b       \n"
-                "sub    %9, %9, #4                   \n"
-
-                "smlal  v28.8h, v22.8b, v26.8b       \n"
-                "sub    %10, %10, #4                 \n"
-                "sub    %11, %11, #4                 \n"
-                "sub    %12, %12, #4                 \n"
-
-                "smlal  v28.8h, v23.8b, v27.8b       \n"    
-                "sub    %13, %13, #4                 \n"
-                "sub    %14, %14, #4                 \n"
-                "sub    %15, %15, #4                 \n"
-                "sub    %16, %16, #4                 \n"
-
-                "saddw  v29.4s, v29.4s, v28.4h       \n"
-                "sub    %17, %17, #4                 \n"
-                "sub    %18, %18, #4                 \n"
-                "sub    %19, %19, #4                 \n"
-                "sub    %20, %20, #4                 \n"
-
-                "st1    {v29.4s}, [%8], #16          \n"
-                //########################################### // sum7
-                "sub    %21, %21, #4                 \n"
-                "sub    %22, %22, #4                 \n"
-                "sub    %23, %23, #4                 \n"
-                "sub    %24, %24, #4                 \n" 
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7),// %8
-                  "=r"(r0),     // %9
-                  "=r"(r1),     // %10
-                  "=r"(r2),     // %11
-                  "=r"(r3),     // %12
-                  "=r"(r4),     // %13
-                  "=r"(r5),     // %14
-                  "=r"(r6),     // %15
-                  "=r"(r7),     // %16
-                  "=r"(r8),     // %17
-                  "=r"(r9),     // %18
-                  "=r"(r10),     // %19
-                  "=r"(r11),     // %20
-                  "=r"(r12),     // %21
-                  "=r"(r13),     // %22
-                  "=r"(r14),     // %23
-                  "=r"(r15)      // %24
-                : "0"(nn),
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "9"(r0),
-                  "10"(r1),
-                  "11"(r2),
-                  "12"(r3),
-                  "13"(r4),
-                  "14"(r5),
-                  "15"(r6),
-                  "16"(r7),
-                  "17"(r8),
-                  "18"(r9),
-                  "19"(r10),
-                  "20"(r11),
-                  "21"(r12),
-                  "22"(r13),
-                  "23"(r14),
-                  "24"(r15),
-                  "w"(_k0),     // %50
-                  "w"(_k1),     // %51
-                  "w"(_k2),     // %52
-                  "w"(_k3),     // %53
-                  "w"(_k4),     // %54
-                  "w"(_k5),     // %55
-                  "w"(_k6),     // %56
-                  "w"(_k7)      // %57
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
-            ); 
-            }
-
-            for (; remain>0; remain--) // scalar tail: handles one position per pass until remain reaches 0
-            {
-                // TODO neon optimize -- scalar fallback: sumN is the sum of (*r_i * kernelN[i]) over the 16 inputs r0..r15, one sum per output pointer
-                int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7] + *r8 * kernel0[8] + *r9 * kernel0[9] + *r10 * kernel0[10] + *r11 * kernel0[11] + *r12 * kernel0[12] + *r13 * kernel0[13] + *r14 * kernel0[14] + *r15 * kernel0[15];
-                int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7] + *r8 * kernel1[8] + *r9 * kernel1[9] + *r10 * kernel1[10] + *r11 * kernel1[11] + *r12 * kernel1[12] + *r13 * kernel1[13] + *r14 * kernel1[14] + *r15 * kernel1[15];
-                int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7] + *r8 * kernel2[8] + *r9 * kernel2[9] + *r10 * kernel2[10] + *r11 * kernel2[11] + *r12 * kernel2[12] + *r13 * kernel2[13] + *r14 * kernel2[14] + *r15 * kernel2[15];
-                int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7] + *r8 * kernel3[8] + *r9 * kernel3[9] + *r10 * kernel3[10] + *r11 * kernel3[11] + *r12 * kernel3[12] + *r13 * kernel3[13] + *r14 * kernel3[14] + *r15 * kernel3[15];
-                int sum4 = (int)*r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3] + *r4 * kernel4[4] + *r5 * kernel4[5] + *r6 * kernel4[6] + *r7 * kernel4[7] + *r8 * kernel4[8] + *r9 * kernel4[9] + *r10 * kernel4[10] + *r11 * kernel4[11] + *r12 * kernel4[12] + *r13 * kernel4[13] + *r14 * kernel4[14] + *r15 * kernel4[15];
-                int sum5 = (int)*r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3] + *r4 * kernel5[4] + *r5 * kernel5[5] + *r6 * kernel5[6] + *r7 * kernel5[7] + *r8 * kernel5[8] + *r9 * kernel5[9] + *r10 * kernel5[10] + *r11 * kernel5[11] + *r12 * kernel5[12] + *r13 * kernel5[13] + *r14 * kernel5[14] + *r15 * kernel5[15];
-                int sum6 = (int)*r0 * kernel6[0] + *r1 * kernel6[1] + *r2 * kernel6[2] + *r3 * kernel6[3] + *r4 * kernel6[4] + *r5 * kernel6[5] + *r6 * kernel6[6] + *r7 * kernel6[7] + *r8 * kernel6[8] + *r9 * kernel6[9] + *r10 * kernel6[10] + *r11 * kernel6[11] + *r12 * kernel6[12] + *r13 * kernel6[13] + *r14 * kernel6[14] + *r15 * kernel6[15];
-                int sum7 = (int)*r0 * kernel7[0] + *r1 * kernel7[1] + *r2 * kernel7[2] + *r3 * kernel7[3] + *r4 * kernel7[4] + *r5 * kernel7[5] + *r6 * kernel7[6] + *r7 * kernel7[7] + *r8 * kernel7[8] + *r9 * kernel7[9] + *r10 * kernel7[10] + *r11 * kernel7[11] + *r12 * kernel7[12] + *r13 * kernel7[13] + *r14 * kernel7[14] + *r15 * kernel7[15];
-
-                *outptr0 += sum0; // add each sum onto the value already stored at its output
-                *outptr1 += sum1;
-                *outptr2 += sum2;
-                *outptr3 += sum3;
-                *outptr4 += sum4;
-                *outptr5 += sum5;
-                *outptr6 += sum6;
-                *outptr7 += sum7;
-
-                r0++; // step all 16 input pointers to the next position
-                r1++;
-                r2++;
-                r3++;
-                r4++;
-                r5++;
-                r6++;
-                r7++;
-                r8++;
-                r9++;
-                r10++;
-                r11++;
-                r12++;
-                r13++;
-                r14++;
-                r15++;
-                outptr0++; // step all 8 output pointers to the next position
-                outptr1++;
-                outptr2++;
-                outptr3++;
-                outptr4++;
-                outptr5++;
-                outptr6++;
-                outptr7++;          
-            }
-        }
-#else // GCC limits the number of operands per asm statement to 30, so this variant is used instead
-        for (; q+7<inch; q+=8)
-        {
-            int* outptr0 = out0; // output cursors, restarted at out0..out7 on every pass of the q loop
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3;
-            int* outptr4 = out4;
-            int* outptr5 = out5;
-            int* outptr6 = out6;
-            int* outptr7 = out7;
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q; // weight rows p..p+7 of `kernel` (row stride inch), each starting at column q
-            const signed char* kernel1 = (const signed char*)kernel + (p+1)*inch + q;
-            const signed char* kernel2 = (const signed char*)kernel + (p+2)*inch + q;
-            const signed char* kernel3 = (const signed char*)kernel + (p+3)*inch + q;
-            const signed char* kernel4 = (const signed char*)kernel + (p+4)*inch + q;
-            const signed char* kernel5 = (const signed char*)kernel + (p+5)*inch + q;
-            const signed char* kernel6 = (const signed char*)kernel + (p+6)*inch + q;
-            const signed char* kernel7 = (const signed char*)kernel + (p+7)*inch + q;
-
-            const signed char* r0 = bottom_blob.channel(q); // read cursors for input channels q..q+7
-            const signed char* r1 = bottom_blob.channel(q+1);
-            const signed char* r2 = bottom_blob.channel(q+2);
-            const signed char* r3 = bottom_blob.channel(q+3);
-            const signed char* r4 = bottom_blob.channel(q+4);
-            const signed char* r5 = bottom_blob.channel(q+5);
-            const signed char* r6 = bottom_blob.channel(q+6);
-            const signed char* r7 = bottom_blob.channel(q+7);
-
-            int size = outw * outh; // number of positions to process per channel
-
-            int nn = size >> 4; // count of full 16-position groups
-            int remain = size & 15; // leftover positions (size mod 16)
-
-            asm volatile( // preload weights: 16 bytes from each of kernel0..kernel7 into v0..v7
-                "ld1    {v0.16b}, [%0]    \n" // v0 <- kernel0[0..15]
-                "ld1    {v1.16b}, [%1]    \n" // v1 <- kernel1[0..15]
-                "ld1    {v2.16b}, [%2]    \n" // v2 <- kernel2[0..15]
-                "ld1    {v3.16b}, [%3]    \n" // v3 <- kernel3[0..15]
-                "ld1    {v4.16b}, [%4]    \n" // v4 <- kernel4[0..15]
-                "ld1    {v5.16b}, [%5]    \n" // v5 <- kernel5[0..15]
-                "ld1    {v6.16b}, [%6]    \n" // v6 <- kernel6[0..15]
-                "ld1    {v7.16b}, [%7]    \n" // v7 <- kernel7[0..15]; NOTE(review): 16 bytes are read per row although q advances by 8 -- check this cannot run past the end of `kernel`
-                : // no output operands
-                : "r"(kernel0), // %0
-                  "r"(kernel1), // %1
-                  "r"(kernel2), // %2
-                  "r"(kernel3), // %3
-                  "r"(kernel4), // %4
-                  "r"(kernel5), // %5
-                  "r"(kernel6), // %6
-                  "r"(kernel7) // %7
-                : "cc", "memory" // NOTE(review): v0-v7 are written above but not listed as clobbers, and the next asm statement reads them; nothing here tells the compiler to preserve them in between -- confirm
-            );
-
-	    if (nn > 0)
-            {
-            asm volatile(
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "prfm   pldl1keep, [%21, #128]       \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "prfm   pldl1keep, [%25, #128]       \n"
-                "ld1    {v8.16b}, [%18], #16         \n" // r0"
-                "ld1    {v9.16b}, [%19], #16         \n" // r1"
-                "ld1    {v10.16b}, [%20], #16        \n" // r2"
-                "ld1    {v11.16b}, [%21], #16        \n" // r3"
-                "ld1    {v12.16b}, [%22], #16        \n" // r4"
-                "ld1    {v13.16b}, [%23], #16        \n" // r5"
-                "ld1    {v14.16b}, [%24], #16        \n" // r6"
-                "ld1    {v15.16b}, [%25], #16        \n" // r7"
-                
-                "0:                                  \n"
-
-                "dup    v16.16b, v0.16b[0]           \n" // k00
-                "dup    v17.16b, v0.16b[1]           \n" // k01
-                "dup    v18.16b, v0.16b[2]           \n" // k02
-                "dup    v19.16b, v0.16b[3]           \n" // k03
-                "dup    v20.16b, v0.16b[4]           \n" // k04
-                "dup    v21.16b, v0.16b[5]           \n" // k05
-                "dup    v22.16b, v0.16b[6]           \n" // k06
-                "dup    v23.16b, v0.16b[7]           \n" // k07				
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v1.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v1.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v1.16b[2]           \n" // k02
-
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%1]       \n" // sum0  
-                                    
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v1.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v1.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"
-                "dup    v21.16b, v1.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                
-                "st1    {v26.4s, v27.4s}, [%1], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%1]       \n" // sum0n
-                "dup    v22.16b, v1.16b[6]           \n" // k06
-                "dup    v23.16b, v1.16b[7]           \n" // k07	
-                
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n" 
-                "st1    {v28.4s, v29.4s}, [%1], #32  \n"
-                //###########################################
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n"
-                "dup    v16.16b, v2.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v2.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v2.16b[2]           \n" // k02
-
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%2]       \n" // sum1
-
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v2.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v2.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"  
-                "dup    v21.16b, v2.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-
-                "st1    {v26.4s, v27.4s}, [%2], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%2]       \n" // sum1n
-                "dup    v22.16b, v2.16b[6]           \n" // k06
-                "dup    v23.16b, v2.16b[7]           \n" // k07					
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n" 
-
-                "st1    {v28.4s, v29.4s}, [%2], #32  \n"
-                //###########################################
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v3.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v3.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v3.16b[2]           \n" // k02					
-
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%3]       \n" // sum2
-
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v3.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v3.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"  
-                "dup    v21.16b, v3.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%3], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%3]       \n" // sum2n
-                "dup    v22.16b, v3.16b[6]           \n" // k06
-                "dup    v23.16b, v3.16b[7]           \n" // k07					
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"
-                "st1    {v28.4s, v29.4s}, [%3], #32  \n"
-                //##########################################
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v4.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v4.16b[1]            \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v4.16b[2]           \n" // k02					
-
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%4]       \n" // sum3
-
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v4.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v4.16b[4]            \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n" 
-                "dup    v21.16b, v4.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%4], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%4]       \n" // sum3n
-                "dup    v22.16b, v4.16b[6]           \n" // k06
-                "dup    v23.16b, v4.16b[7]           \n" // k07					
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"
-                
-                "st1    {v28.4s, v29.4s}, [%4], #32  \n"
-                //##########################################	
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v5.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v5.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v5.16b[2]           \n" // k02
-
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%5]       \n" // sum4
-
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v5.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v5.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n" 
-                "dup    v21.16b, v5.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"	
-                "st1    {v26.4s, v27.4s}, [%5], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%5]       \n" // sum4n
-                "dup    v22.16b, v5.16b[6]           \n" // k06
-                "dup    v23.16b, v5.16b[7]           \n" // k07
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"	
-                "st1    {v28.4s, v29.4s}, [%5], #32  \n"
-                //##########################################	
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v6.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v6.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v6.16b[2]           \n" // k02
-                
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v6.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v6.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"  
-                "dup    v21.16b, v6.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%6]       \n" // sum5
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%6], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%6]       \n" // sum5n
-                "dup    v22.16b, v6.16b[6]           \n" // k06
-                "dup    v23.16b, v6.16b[7]           \n" // k07
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"
-                "st1    {v28.4s, v29.4s}, [%6], #32  \n"
-                //##########################################
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "dup    v16.16b, v7.16b[0]           \n" // k00
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "dup    v17.16b, v7.16b[1]           \n" // k01
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-                "dup    v18.16b, v7.16b[2]           \n" // k02					
-
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%7]       \n" // sum6
-
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "dup    v19.16b, v7.16b[3]           \n" // k03
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "dup    v20.16b, v7.16b[4]           \n" // k04
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"
-                "dup    v21.16b, v7.16b[5]           \n" // k05
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"      
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%7], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%7]       \n" // sum6n
-                "dup    v22.16b, v7.16b[6]           \n" // k06
-                "dup    v23.16b, v7.16b[7]           \n" // k07
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"
-                "st1    {v28.4s, v29.4s}, [%7], #32  \n"
-                //##########################################		
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smull2 v25.8h, v8.16b, v16.16b      \n" 
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal2  v25.8h, v9.16b, v17.16b     \n" 
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "prfm   pldl1keep, [%21, #128]       \n"
-                
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal2  v25.8h, v10.16b, v18.16b    \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal2  v25.8h, v11.16b, v19.16b    \n"
-
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%8]       \n" // sum7
-                                    
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal2  v25.8h, v12.16b, v20.16b    \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "prfm   pldl1keep, [%25, #128]       \n"
-                
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal2  v25.8h, v13.16b, v21.16b    \n"
-                "ld1    {v8.16b}, [%18], #16         \n" // r0"
-                "ld1    {v9.16b}, [%19], #16         \n" // r1"
-                
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal2  v25.8h, v14.16b, v22.16b    \n"  
-                "ld1    {v10.16b}, [%20], #16        \n" // r2"
-                "ld1    {v11.16b}, [%21], #16        \n" // r3"
-                
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                "smlal2  v25.8h, v15.16b, v23.16b    \n"   
-                "ld1    {v12.16b}, [%22], #16        \n" // r4"
-                "ld1    {v13.16b}, [%23], #16        \n" // r5"					
-                
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"				
-                "st1    {v26.4s, v27.4s}, [%8], #32  \n" 
-                
-                "ld1    {v28.4s, v29.4s}, [%8]       \n" // sum7n
-                "ld1    {v14.16b}, [%24], #16        \n" // r6"
-                "ld1    {v15.16b}, [%25], #16        \n" // r7"						
-                "saddw  v28.4s, v28.4s, v25.4h       \n"
-                "saddw2 v29.4s, v29.4s, v25.8h       \n"
-                "st1    {v28.4s, v29.4s}, [%8], #32  \n"
-                "subs   %w0, %w0, #1                 \n"
-                "bne    0b                           \n"
-                "sub    %18, %18, #16                \n"
-                "sub    %19, %19, #16                \n"
-                "sub    %20, %20, #16                \n"
-                "sub    %21, %21, #16                \n"
-                "sub    %22, %22, #16                \n"
-                "sub    %23, %23, #16                \n"
-                "sub    %24, %24, #16                \n"
-                "sub    %25, %25, #16                \n"
-                //##########################################					
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7) // %8
-                : "0"(nn),      
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "r"(r0),              // %18
-                  "r"(r1),		// %19
-                  "r"(r2),		// %20
-                  "r"(r3),		// %21
-                  "r"(r4),		// %22
-                  "r"(r5),		// %23
-                  "r"(r6),		// %24
-                  "r"(r7)		// %25
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
-            );
-			}
-
-            if (remain == 8)
-            {
-                remain -= 8;
-
-            asm volatile(
-                "prfm   pldl1keep, [%18, #128]       \n"
-                "prfm   pldl1keep, [%19, #128]       \n"
-                "prfm   pldl1keep, [%20, #128]       \n"
-                "prfm   pldl1keep, [%21, #128]       \n"
-                "prfm   pldl1keep, [%22, #128]       \n"
-                "prfm   pldl1keep, [%23, #128]       \n"
-                "prfm   pldl1keep, [%24, #128]       \n"
-                "prfm   pldl1keep, [%25, #128]       \n"				
-                "ld1    {v8.8b}, [%18], #8           \n" // r0"
-                "ld1    {v9.8b}, [%19], #8           \n" // r1"
-                "ld1    {v10.8b}, [%20], #8          \n" // r2"
-                "ld1    {v11.8b}, [%21], #8          \n" // r3"
-                "ld1    {v12.8b}, [%22], #8          \n" // r4"   
-                "ld1    {v13.8b}, [%23], #8          \n" // r5"	
-                "ld1    {v14.8b}, [%24], #8          \n" // r6"
-                "ld1    {v15.8b}, [%25], #8          \n" // r7" 
-
-                "dup    v16.8b, v0.16b[0]            \n" // k00
-                "dup    v17.8b, v0.16b[1]            \n" // k01
-                "dup    v18.8b, v0.16b[2]            \n" // k02
-                "dup    v19.8b, v0.16b[3]            \n" // k03
-                "dup    v20.8b, v0.16b[4]            \n" // k04
-                "dup    v21.8b, v0.16b[5]            \n" // k05
-                "dup    v22.8b, v0.16b[6]            \n" // k06
-                "dup    v23.8b, v0.16b[7]            \n" // k07				
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%1]       \n" // sum0  
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%1], #32  \n" 
-                //###########################################
-                "dup    v16.8b, v1.16b[0]            \n" // k00
-                "dup    v17.8b, v1.16b[1]            \n" // k01
-                "dup    v18.8b, v1.16b[2]            \n" // k02
-                "dup    v19.8b, v1.16b[3]            \n" // k03
-                "dup    v20.8b, v1.16b[4]            \n" // k04
-                "dup    v21.8b, v1.16b[5]            \n" // k05
-                "dup    v22.8b, v1.16b[6]            \n" // k06
-                "dup    v23.8b, v1.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%2]       \n" // sum1
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%2], #32  \n" 
-                //###########################################
-                "dup    v16.8b, v2.16b[0]            \n" // k00
-                "dup    v17.8b, v2.16b[1]            \n" // k01
-                "dup    v18.8b, v2.16b[2]            \n" // k02
-                "dup    v19.8b, v2.16b[3]            \n" // k03
-                "dup    v20.8b, v2.16b[4]            \n" // k04
-                "dup    v21.8b, v2.16b[5]            \n" // k05
-                "dup    v22.8b, v2.16b[6]            \n" // k06
-                "dup    v23.8b, v2.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%3]       \n" // sum2
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%3], #32  \n" 
-                //##########################################
-                "dup    v16.8b, v3.16b[0]            \n" // k00
-                "dup    v17.8b, v3.16b[1]            \n" // k01
-                "dup    v18.8b, v3.16b[2]            \n" // k02
-                "dup    v19.8b, v3.16b[3]            \n" // k03
-                "dup    v20.8b, v3.16b[4]            \n" // k04
-                "dup    v21.8b, v3.16b[5]            \n" // k05
-                "dup    v22.8b, v3.16b[6]            \n" // k06
-                "dup    v23.8b, v3.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%4]       \n" // sum3
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%4], #32  \n" 
-                //##########################################	
-                "dup    v16.8b, v4.16b[0]            \n" // k00
-                "dup    v17.8b, v4.16b[1]            \n" // k01
-                "dup    v18.8b, v4.16b[2]            \n" // k02
-                "dup    v19.8b, v4.16b[3]            \n" // k03
-                "dup    v20.8b, v4.16b[4]            \n" // k04
-                "dup    v21.8b, v4.16b[5]            \n" // k05
-                "dup    v22.8b, v4.16b[6]            \n" // k06
-                "dup    v23.8b, v4.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%5]       \n" // sum4
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%5], #32  \n" 
-                //##########################################	
-                "dup    v16.8b, v5.16b[0]            \n" // k00
-                "dup    v17.8b, v5.16b[1]            \n" // k01
-                "dup    v18.8b, v5.16b[2]            \n" // k02
-                "dup    v19.8b, v5.16b[3]            \n" // k03
-                "dup    v20.8b, v5.16b[4]            \n" // k04
-                "dup    v21.8b, v5.16b[5]            \n" // k05
-                "dup    v22.8b, v5.16b[6]            \n" // k06
-                "dup    v23.8b, v5.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%6]       \n" // sum5
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%6], #32  \n" 
-                //##########################################
-                "dup    v16.8b, v6.16b[0]            \n" // k00
-                "dup    v17.8b, v6.16b[1]            \n" // k01
-                "dup    v18.8b, v6.16b[2]            \n" // k02
-                "dup    v19.8b, v6.16b[3]            \n" // k03
-                "dup    v20.8b, v6.16b[4]            \n" // k04
-                "dup    v21.8b, v6.16b[5]            \n" // k05
-                "dup    v22.8b, v6.16b[6]            \n" // k06
-                "dup    v23.8b, v6.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%7]       \n" // sum6
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%7], #32  \n" 
-                //##########################################		
-                "dup    v16.8b, v7.16b[0]            \n" // k00
-                "dup    v17.8b, v7.16b[1]            \n" // k01
-                "dup    v18.8b, v7.16b[2]            \n" // k02
-                "dup    v19.8b, v7.16b[3]            \n" // k03
-                "dup    v20.8b, v7.16b[4]            \n" // k04
-                "dup    v21.8b, v7.16b[5]            \n" // k05
-                "dup    v22.8b, v7.16b[6]            \n" // k06
-                "dup    v23.8b, v7.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v26.4s, v27.4s}, [%8]       \n" // sum7
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "saddw2 v27.4s, v27.4s, v24.8h       \n"
-                "st1    {v26.4s, v27.4s}, [%8], #32  \n" 
-                //##########################################					
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7) // %8
-                : "0"(nn),      
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "r"(r0),              // %18
-                  "r"(r1),		// %19
-                  "r"(r2),		// %20
-                  "r"(r3),		// %21
-                  "r"(r4),		// %22
-                  "r"(r5),		// %23
-                  "r"(r6),		// %24
-                  "r"(r7)		// %25
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
-            );
-			}                               
-
-            if (remain == 4)
-            {
-                remain -= 4;
-
-            asm volatile(		
-                "ld1    {v8.8b}, [%18], #8           \n" // r0"
-                "ld1    {v9.8b}, [%19], #8           \n" // r1"
-                "ld1    {v10.8b}, [%20], #8          \n" // r2"
-                "ld1    {v11.8b}, [%21], #8          \n" // r3"
-                "ld1    {v12.8b}, [%22], #8          \n" // r4"   
-                "ld1    {v13.8b}, [%23], #8          \n" // r5"	
-                "ld1    {v14.8b}, [%24], #8          \n" // r6"
-                "ld1    {v15.8b}, [%25], #8          \n" // r7" 
-
-                "dup    v16.8b, v0.16b[0]            \n" // k00
-                "dup    v17.8b, v0.16b[1]            \n" // k01
-                "dup    v18.8b, v0.16b[2]            \n" // k02
-                "dup    v19.8b, v0.16b[3]            \n" // k03
-                "dup    v20.8b, v0.16b[4]            \n" // k04
-                "dup    v21.8b, v0.16b[5]            \n" // k05
-                "dup    v22.8b, v0.16b[6]            \n" // k06
-                "dup    v23.8b, v0.16b[7]            \n" // k07				
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%1, #128]        \n"
-                "ld1    {v26.4s}, [%1]               \n" // sum0  
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%1], #16  	     \n" 
-                //###########################################
-                "dup    v16.8b, v1.16b[0]            \n" // k00
-                "dup    v17.8b, v1.16b[1]            \n" // k01
-                "dup    v18.8b, v1.16b[2]            \n" // k02
-                "dup    v19.8b, v1.16b[3]            \n" // k03
-                "dup    v20.8b, v1.16b[4]            \n" // k04
-                "dup    v21.8b, v1.16b[5]            \n" // k05
-                "dup    v22.8b, v1.16b[6]            \n" // k06
-                "dup    v23.8b, v1.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%2, #128]        \n"
-                "ld1    {v26.4s}, [%2]               \n" // sum1
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%2], #16  	     \n" 
-                //###########################################
-                "dup    v16.8b, v2.16b[0]            \n" // k00
-                "dup    v17.8b, v2.16b[1]            \n" // k01
-                "dup    v18.8b, v2.16b[2]            \n" // k02
-                "dup    v19.8b, v2.16b[3]            \n" // k03
-                "dup    v20.8b, v2.16b[4]            \n" // k04
-                "dup    v21.8b, v2.16b[5]            \n" // k05
-                "dup    v22.8b, v2.16b[6]            \n" // k06
-                "dup    v23.8b, v2.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%3, #128]        \n"
-                "ld1    {v26.4s}, [%3]       	     \n" // sum2
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%3], #16  	     \n" 
-                //##########################################
-                "dup    v16.8b, v3.16b[0]            \n" // k00
-                "dup    v17.8b, v3.16b[1]            \n" // k01
-                "dup    v18.8b, v3.16b[2]            \n" // k02
-                "dup    v19.8b, v3.16b[3]            \n" // k03
-                "dup    v20.8b, v3.16b[4]            \n" // k04
-                "dup    v21.8b, v3.16b[5]            \n" // k05
-                "dup    v22.8b, v3.16b[6]            \n" // k06
-                "dup    v23.8b, v3.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%4, #128]        \n"
-                "ld1    {v26.4s}, [%4]       	     \n" // sum3
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%4], #16  	     \n" 
-                //##########################################	
-                "dup    v16.8b, v4.16b[0]            \n" // k00
-                "dup    v17.8b, v4.16b[1]            \n" // k01
-                "dup    v18.8b, v4.16b[2]            \n" // k02
-                "dup    v19.8b, v4.16b[3]            \n" // k03
-                "dup    v20.8b, v4.16b[4]            \n" // k04
-                "dup    v21.8b, v4.16b[5]            \n" // k05
-                "dup    v22.8b, v4.16b[6]            \n" // k06
-                "dup    v23.8b, v4.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%5, #128]        \n"
-                "ld1    {v26.4s}, [%5]       	     \n" // sum4
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%5], #16  	     \n" 
-                //##########################################	
-                "dup    v16.8b, v5.16b[0]            \n" // k00
-                "dup    v17.8b, v5.16b[1]            \n" // k01
-                "dup    v18.8b, v5.16b[2]            \n" // k02
-                "dup    v19.8b, v5.16b[3]            \n" // k03
-                "dup    v20.8b, v5.16b[4]            \n" // k04
-                "dup    v21.8b, v5.16b[5]            \n" // k05
-                "dup    v22.8b, v5.16b[6]            \n" // k06
-                "dup    v23.8b, v5.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%6, #128]        \n"
-                "ld1    {v26.4s}, [%6]       	     \n" // sum5
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%6], #16  	     \n" 
-                //##########################################
-                "dup    v16.8b, v6.16b[0]            \n" // k00
-                "dup    v17.8b, v6.16b[1]            \n" // k01
-                "dup    v18.8b, v6.16b[2]            \n" // k02
-                "dup    v19.8b, v6.16b[3]            \n" // k03
-                "dup    v20.8b, v6.16b[4]            \n" // k04
-                "dup    v21.8b, v6.16b[5]            \n" // k05
-                "dup    v22.8b, v6.16b[6]            \n" // k06
-                "dup    v23.8b, v6.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%7, #128]        \n"
-                "ld1    {v26.4s}, [%7]       	     \n" // sum6
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%7], #16  	     \n" 
-                //##########################################		
-                "dup    v16.8b, v7.16b[0]            \n" // k00
-                "dup    v17.8b, v7.16b[1]            \n" // k01
-                "dup    v18.8b, v7.16b[2]            \n" // k02
-                "dup    v19.8b, v7.16b[3]            \n" // k03
-                "dup    v20.8b, v7.16b[4]            \n" // k04
-                "dup    v21.8b, v7.16b[5]            \n" // k05
-                "dup    v22.8b, v7.16b[6]            \n" // k06
-                "dup    v23.8b, v7.16b[7]            \n" // k07
-
-                "smull  v24.8h, v8.8b, v16.8b        \n" // r0 * k0
-                "smlal  v24.8h, v9.8b, v17.8b        \n" // r0 * k1
-                "smlal  v24.8h, v10.8b, v18.8b       \n" // r0 * k2
-                "smlal  v24.8h, v11.8b, v19.8b       \n" // r0 * k3
-                "smlal  v24.8h, v12.8b, v20.8b       \n" // r0 * k4
-                "smlal  v24.8h, v13.8b, v21.8b       \n" // r0 * k5
-                "smlal  v24.8h, v14.8b, v22.8b       \n" // r0 * k6
-                "smlal  v24.8h, v15.8b, v23.8b       \n" // r0 * k7
-                
-                "prfm   pldl1keep, [%8, #128]        \n"
-                "ld1    {v26.4s}, [%8]       	     \n" // sum7
-                "saddw  v26.4s, v26.4s, v24.4h       \n"
-                "st1    {v26.4s}, [%8], #16  	     \n" 
-                "sub    %18, %18, #4                 \n"
-                "sub    %19, %19, #4                 \n"
-                "sub    %20, %20, #4                 \n"
-                "sub    %21, %21, #4                 \n"
-                "sub    %22, %22, #4                 \n"
-                "sub    %23, %23, #4                 \n"
-                "sub    %24, %24, #4                 \n"
-                "sub    %25, %25, #4                 \n"
-                //##########################################					
-                : "=r"(nn),     // %0
-                  "=r"(outptr0),// %1
-                  "=r"(outptr1),// %2
-                  "=r"(outptr2),// %3
-                  "=r"(outptr3),// %4
-                  "=r"(outptr4),// %5
-                  "=r"(outptr5),// %6
-                  "=r"(outptr6),// %7
-                  "=r"(outptr7) // %8
-                : "0"(nn),      
-                  "1"(outptr0),
-                  "2"(outptr1),
-                  "3"(outptr2),
-                  "4"(outptr3),
-                  "5"(outptr4),
-                  "6"(outptr5),
-                  "7"(outptr6),
-                  "8"(outptr7),
-                  "r"(r0),              // %18
-                  "r"(r1),		// %19
-                  "r"(r2),		// %20
-                  "r"(r3),		// %21
-                  "r"(r4),		// %22
-                  "r"(r5),		// %23
-                  "r"(r6),		// %24
-                  "r"(r7)		// %25
-                : "cc", "memory", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29"
-            );                
-            }
-	
-	    for (; remain>0; remain--)
-            {
-                // TODO neon optimize
-                int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7];
-                int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7];
-                int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7];
-                int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7];
-                int sum4 = (int)*r0 * kernel4[0] + *r1 * kernel4[1] + *r2 * kernel4[2] + *r3 * kernel4[3] + *r4 * kernel4[4] + *r5 * kernel4[5] + *r6 * kernel4[6] + *r7 * kernel4[7];
-                int sum5 = (int)*r0 * kernel5[0] + *r1 * kernel5[1] + *r2 * kernel5[2] + *r3 * kernel5[3] + *r4 * kernel5[4] + *r5 * kernel5[5] + *r6 * kernel5[6] + *r7 * kernel5[7];
-                int sum6 = (int)*r0 * kernel6[0] + *r1 * kernel6[1] + *r2 * kernel6[2] + *r3 * kernel6[3] + *r4 * kernel6[4] + *r5 * kernel6[5] + *r6 * kernel6[6] + *r7 * kernel6[7];
-                int sum7 = (int)*r0 * kernel7[0] + *r1 * kernel7[1] + *r2 * kernel7[2] + *r3 * kernel7[3] + *r4 * kernel7[4] + *r5 * kernel7[5] + *r6 * kernel7[6] + *r7 * kernel7[7];
-
-                *outptr0 += sum0;
-                *outptr1 += sum1;
-                *outptr2 += sum2;
-                *outptr3 += sum3;
-                *outptr4 += sum4;
-                *outptr5 += sum5;
-                *outptr6 += sum6;
-                *outptr7 += sum7;
-
-                r0++;
-                r1++;
-                r2++;
-                r3++;
-                r4++;
-                r5++;
-                r6++;
-                r7++;
-                outptr0++;
-                outptr1++;
-                outptr2++;
-                outptr3++;
-                outptr4++;
-                outptr5++;
-                outptr6++;
-                outptr7++;          
-            }
-        }       
-#endif
-
-        for (; q<inch; q++)
-        {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3;
-            int* outptr4 = out4;
-            int* outptr5 = out5;
-            int* outptr6 = out6;
-            int* outptr7 = out7;
-
-            const signed char* img0 = bottom_blob.channel(q);
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char* kernel1 = (const signed char*)kernel + (p+1)*inch + q;
-            const signed char* kernel2 = (const signed char*)kernel + (p+2)*inch + q;
-            const signed char* kernel3 = (const signed char*)kernel + (p+3)*inch + q;
-            const signed char* kernel4 = (const signed char*)kernel + (p+4)*inch + q;
-            const signed char* kernel5 = (const signed char*)kernel + (p+5)*inch + q;
-            const signed char* kernel6 = (const signed char*)kernel + (p+6)*inch + q;
-            const signed char* kernel7 = (const signed char*)kernel + (p+7)*inch + q;
-
-            const signed char k0 = kernel0[0];
-            const signed char k1 = kernel1[0];
-            const signed char k2 = kernel2[0];
-            const signed char k3 = kernel3[0];
-            const signed char k4 = kernel4[0];
-            const signed char k5 = kernel5[0];
-            const signed char k6 = kernel6[0];
-            const signed char k7 = kernel7[0];
-
-            const signed char* r0 = img0;
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            int8x8_t _k0 = vdup_n_s8(k0);
-            int8x8_t _k1 = vdup_n_s8(k1);
-            int8x8_t _k2 = vdup_n_s8(k2);
-            int8x8_t _k3 = vdup_n_s8(k3);
-            int8x8_t _k4 = vdup_n_s8(k4);
-            int8x8_t _k5 = vdup_n_s8(k5);
-            int8x8_t _k6 = vdup_n_s8(k6);
-            int8x8_t _k7 = vdup_n_s8(k7);
-
-            for (; nn>0; nn--)
-            {
-                int8x8_t _r0 = vld1_s8(r0);
-
-                int32x4_t _out0  = vld1q_s32(outptr0);
-                int32x4_t _out0n = vld1q_s32(outptr0+4);
-                int32x4_t _out1  = vld1q_s32(outptr1);
-                int32x4_t _out1n = vld1q_s32(outptr1+4);
-                int32x4_t _out2  = vld1q_s32(outptr2);
-                int32x4_t _out2n = vld1q_s32(outptr2+4);
-                int32x4_t _out3  = vld1q_s32(outptr3);
-                int32x4_t _out3n = vld1q_s32(outptr3+4);
-                int32x4_t _out4  = vld1q_s32(outptr4);
-                int32x4_t _out4n = vld1q_s32(outptr4+4);
-                int32x4_t _out5  = vld1q_s32(outptr5);
-                int32x4_t _out5n = vld1q_s32(outptr5+4);
-                int32x4_t _out6  = vld1q_s32(outptr6);
-                int32x4_t _out6n = vld1q_s32(outptr6+4);
-                int32x4_t _out7  = vld1q_s32(outptr7);
-                int32x4_t _out7n = vld1q_s32(outptr7+4);
-
-                int16x8_t _out0_s16 = vmull_s8(_r0, _k0);
-                int16x8_t _out1_s16 = vmull_s8(_r0, _k1);
-                int16x8_t _out2_s16 = vmull_s8(_r0, _k2);
-                int16x8_t _out3_s16 = vmull_s8(_r0, _k3);
-                int16x8_t _out4_s16 = vmull_s8(_r0, _k4);
-                int16x8_t _out5_s16 = vmull_s8(_r0, _k5);
-                int16x8_t _out6_s16 = vmull_s8(_r0, _k6);
-                int16x8_t _out7_s16 = vmull_s8(_r0, _k7);
-
-                _out0  = vaddw_s16(_out0, vget_low_s16(_out0_s16));
-                _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16));
-                _out1  = vaddw_s16(_out1, vget_low_s16(_out1_s16));
-                _out1n = vaddw_s16(_out1n, vget_high_s16(_out1_s16));
-                _out2  = vaddw_s16(_out2, vget_low_s16(_out2_s16));
-                _out2n = vaddw_s16(_out2n, vget_high_s16(_out2_s16));
-                _out3  = vaddw_s16(_out3, vget_low_s16(_out3_s16));
-                _out3n = vaddw_s16(_out3n, vget_high_s16(_out3_s16));
-                _out4  = vaddw_s16(_out4, vget_low_s16(_out4_s16));
-                _out4n = vaddw_s16(_out4n, vget_high_s16(_out4_s16));
-                _out5  = vaddw_s16(_out5, vget_low_s16(_out5_s16));
-                _out5n = vaddw_s16(_out5n, vget_high_s16(_out5_s16));
-                _out6  = vaddw_s16(_out6, vget_low_s16(_out6_s16));
-                _out6n = vaddw_s16(_out6n, vget_high_s16(_out6_s16));
-                _out7  = vaddw_s16(_out7, vget_low_s16(_out7_s16));
-                _out7n = vaddw_s16(_out7n, vget_high_s16(_out7_s16));
-
-                vst1q_s32(outptr0, _out0);
-                vst1q_s32(outptr0+4, _out0n);
-                vst1q_s32(outptr1, _out1);
-                vst1q_s32(outptr1+4, _out1n);
-                vst1q_s32(outptr2, _out2);
-                vst1q_s32(outptr2+4, _out2n);
-                vst1q_s32(outptr3, _out3);
-                vst1q_s32(outptr3+4, _out3n);
-                vst1q_s32(outptr4, _out4);
-                vst1q_s32(outptr4+4, _out4n);
-                vst1q_s32(outptr5, _out5);
-                vst1q_s32(outptr5+4, _out5n);
-                vst1q_s32(outptr6, _out6);
-                vst1q_s32(outptr6+4, _out6n);
-                vst1q_s32(outptr7, _out7);
-                vst1q_s32(outptr7+4, _out7n);
-
-                r0 += 8;
-                outptr0 += 8;
-                outptr1 += 8;
-                outptr2 += 8;
-                outptr3 += 8;
-                outptr4 += 8;
-                outptr5 += 8;
-                outptr6 += 8;
-                outptr7 += 8;
-            }
-            
-            for (; remain>0; remain--)
-            {
-                // TODO neon optimize
-                int sum0 = (int)*r0 * k0;
-                int sum1 = (int)*r0 * k1;
-                int sum2 = (int)*r0 * k2;
-                int sum3 = (int)*r0 * k3;
-                int sum4 = (int)*r0 * k4;
-                int sum5 = (int)*r0 * k5;
-                int sum6 = (int)*r0 * k6;
-                int sum7 = (int)*r0 * k7;
-
-                *outptr0 += sum0;
-                *outptr1 += sum1;
-                *outptr2 += sum2;
-                *outptr3 += sum3;
-                *outptr4 += sum4;
-                *outptr5 += sum5;
-                *outptr6 += sum6;
-                *outptr7 += sum7;
-
-                r0++;
-                outptr0++;
-                outptr1++;
-                outptr2++;
-                outptr3++;
-                outptr4++;
-                outptr5++;
-                outptr6++;
-                outptr7++;
-            }
-        }
-    }
+    remain_outch_start += nn_outch << 2;
 
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
-        Mat out = top_blob.channel(p);
-
-        out.fill(0);
-
-        int q = 0;
+        Mat out0 = top_blob.channel(p);
 
-        for (; q+7<inch; q+=8)
-        {
-            int* outptr = out;
-
-            const signed char* img0 = bottom_blob.channel(q);
-            const signed char* img1 = bottom_blob.channel(q+1);
-            const signed char* img2 = bottom_blob.channel(q+2);
-            const signed char* img3 = bottom_blob.channel(q+3);
-            const signed char* img4 = bottom_blob.channel(q+4);
-            const signed char* img5 = bottom_blob.channel(q+5);
-            const signed char* img6 = bottom_blob.channel(q+6);
-            const signed char* img7 = bottom_blob.channel(q+7);
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char k0 = kernel0[0];
-            const signed char k1 = kernel0[1];
-            const signed char k2 = kernel0[2];
-            const signed char k3 = kernel0[3];
-            const signed char k4 = kernel0[4];
-            const signed char k5 = kernel0[5];
-            const signed char k6 = kernel0[6];
-            const signed char k7 = kernel0[7];
-
-            const signed char* r0 = img0;
-            const signed char* r1 = img1;
-            const signed char* r2 = img2;
-            const signed char* r3 = img3;
-            const signed char* r4 = img4;
-            const signed char* r5 = img5;
-            const signed char* r6 = img6;
-            const signed char* r7 = img7;
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            int8x8_t _k0 = vdup_n_s8(k0);
-            int8x8_t _k1 = vdup_n_s8(k1);
-            int8x8_t _k2 = vdup_n_s8(k2);
-            int8x8_t _k3 = vdup_n_s8(k3);
-            int8x8_t _k4 = vdup_n_s8(k4);
-            int8x8_t _k5 = vdup_n_s8(k5);
-            int8x8_t _k6 = vdup_n_s8(k6);
-            int8x8_t _k7 = vdup_n_s8(k7);
-
-            for (; nn>0; nn--)
-            {
-                int8x8_t _r0 = vld1_s8(r0);
-                int8x8_t _r1 = vld1_s8(r1);
-                int8x8_t _r2 = vld1_s8(r2);
-                int8x8_t _r3 = vld1_s8(r3);
-                int8x8_t _r4 = vld1_s8(r4);
-                int8x8_t _r5 = vld1_s8(r5);
-                int8x8_t _r6 = vld1_s8(r6);
-                int8x8_t _r7 = vld1_s8(r7);
-
-                int32x4_t _out0 = vld1q_s32(outptr);
-                int32x4_t _out0n = vld1q_s32(outptr+4);
-
-                int16x8_t _out0_s16 = vmull_s8(_r0, _k0);
-                _out0_s16 = vmlal_s8(_out0_s16, _r1, _k1);
-                _out0_s16 = vmlal_s8(_out0_s16, _r2, _k2);
-                _out0_s16 = vmlal_s8(_out0_s16, _r3, _k3);
-                _out0_s16 = vmlal_s8(_out0_s16, _r4, _k4);
-                _out0_s16 = vmlal_s8(_out0_s16, _r5, _k5);
-                _out0_s16 = vmlal_s8(_out0_s16, _r6, _k6);
-                _out0_s16 = vmlal_s8(_out0_s16, _r7, _k7);
-
-                _out0 = vaddw_s16(_out0, vget_low_s16(_out0_s16));
-                _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16));
-
-                vst1q_s32(outptr, _out0);
-                vst1q_s32(outptr+4, _out0n);
-
-                r0 += 8;
-                r1 += 8;
-                r2 += 8;
-                r3 += 8;
-                r4 += 8;
-                r5 += 8;
-                r6 += 8;
-                r7 += 8;
-                outptr += 8;
-            }
+        const int bias0 = 0;
 
-            for (; remain>0; remain--)
-            {
-                int sum  = (int)*r0 * k0;
-                int sum1 = (int)*r1 * k1;
-                int sum2 = (int)*r2 * k2;
-                int sum3 = (int)*r3 * k3;
-                int sum4 = (int)*r4 * k4;
-                int sum5 = (int)*r5 * k5;
-                int sum6 = (int)*r6 * k6;
-                int sum7 = (int)*r7 * k7;
-
-                *outptr += sum + sum1 + sum2 + sum3 + sum4 + sum5 + sum6 + sum7;
-
-                r0++;
-                r1++;
-                r2++;
-                r3++;
-                r4++;
-                r5++;
-                r6++;
-                r7++;
-                outptr++;
-            }
+        int* outptr0 = out0;
 
-        }
+        int i = 0;
 
-        for (; q<inch; q++)
+        for (; i+7<size; i+=8)
         {
-            int* outptr = out;
-
-            const signed char* img0 = bottom_blob.channel(q);
-            const signed char* kernel0 = (const signed char*)kernel + p*inch  + q;
-            const signed char k0 = kernel0[0];
-            const signed char* r0 = img0;
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            int8x8_t _k0 = vdup_n_s8(k0);
-
-            for (; nn>0; nn--)
-            {
-                int8x8_t _r0 = vld1_s8(r0);
-
-                int32x4_t _out0 = vld1q_s32(outptr);
-                int32x4_t _out0n = vld1q_s32(outptr+4);
-
-                int16x8_t _out0_s16 = vmull_s8(_r0, _k0);
-
-                _out0 = vaddw_s16(_out0, vget_low_s16(_out0_s16));
-                _out0n = vaddw_s16(_out0n, vget_high_s16(_out0_s16));
-
-                vst1q_s32(outptr, _out0);
-                vst1q_s32(outptr+4, _out0n);
-
-                r0 += 8;
-                outptr += 8;
-            }
-
-            for (; remain>0; remain--)
-            {
-                int sum = (int)*r0 * k0;
+            const signed char* tmpptr = tmp.channel(i/8);
+            const signed char* kptr = kernel.channel(p/4 + p%4);
+#if 0 //__ARM_NEON
+            asm volatile(
+                // inch loop
+                "vmov.s32    q6, #0            \n"
+                "vmov.s32    q7, #0            \n"
 
-                *outptr += sum;
+                "lsr         r4, %6, #2        \n"// r4 = nn = inch >> 2
+                "cmp         r4, #0            \n"
+                "beq         1f                \n"
+                
+                "0:                            \n"// for(; nn != 0; nn--)
+                "pld         [%2, #128]        \n"
+                "vld1.s8     {d4-d7}, [%1]!    \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37    a(inch)(data)
+                "vmovl.s8    q5, d7            \n"// a30-a37
+                "vmovl.s8    q4, d6            \n"// a20-a27
+                "vmovl.s8    q3, d5            \n"// a10-a17
+                "vmovl.s8    q2, d4            \n"// a00-a07
 
-                r0++;
-                outptr++;
-            }
-        }
-    }    
-}
-#else // __aarch64__
-/*
- * Convolution 1x1 quantized with int8,unroll 8 x 4
- */
-static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
-{
-    int inch = bottom_blob.c;
+                "vld1.s8     {d0}, [%2]        \n"// kptr k00,k01,k02,k03    k(outch)(inch)
+                "vmovl.s8    q0, d0            \n"// k00,k01,k02,k03
+                "add         %2, #4            \n"
 
-    int outw = top_blob.w;
-    int outh = top_blob.h;
-    int outch = top_blob.c;
+                "vmlal.s16   q6, d4, d0[0]     \n"// (a00-a07) * k00
+                "vmlal.s16   q7, d5, d0[0]     \n"
+                "vmlal.s16   q6, d6, d0[1]     \n"// (a10-a17) * k01
+                "vmlal.s16   q7, d7, d0[1]     \n"
+                "vmlal.s16   q6, d8, d0[2]     \n"// (a20-a27) * k02
+                "vmlal.s16   q7, d9, d0[2]     \n"
+                "vmlal.s16   q6, d10, d0[3]    \n"// (a30-a37) * k03
+                "vmlal.s16   q7, d11, d0[3]    \n"
 
-    const signed char* kernel = _kernel;
+                "subs        r4, r4, #1        \n"
+                "bne         0b                \n"// end for
+ 
+                "1:                            \n"
+                // remain loop
+                "and         r4, %6, #3        \n"// r4 = remain = inch & 3
+                "cmp         r4, #0            \n"
+                "beq         3f                \n"
 
-    int nn_outch = outch >> 2;
-    int remain_outch_start = nn_outch << 2;
+                "2:                            \n"// for(; remain != 0; remain--)
+                "vld1.s8     {d2}, [%1]!       \n"// tmpr a00-a07    a(inch)(data)
+                "vld1.s8     {d0}, [%2]        \n"// kptr k00        k(outch)(inch)
+                "vmovl.s8    q1, d2            \n"
+                "vmovl.s8    q0, d0            \n"
+                "add         %2, #1            \n"
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp<nn_outch; pp++)
-    {
-        int p = pp * 4;
+                "vmlal.s16   q6, d2, d0[0]     \n"// (a00-a07) * k00
+                "vmlal.s16   q7, d3, d0[0]     \n"  
 
-        Mat out0 = top_blob.channel(p);
-        Mat out1 = top_blob.channel(p+1);
-        Mat out2 = top_blob.channel(p+2);
-        Mat out3 = top_blob.channel(p+3);
+                "subs        r4, r4, #1        \n"
+                "bne         2b                \n"
 
-        out0.fill(0);
-        out1.fill(0);
-        out2.fill(0);
-        out3.fill(0);
+                "3:                            \n"// store the result to memory
+                "vst1.s32    {d12-d15}, [%0]!  \n"
 
-        int q = 0;
+                : "=r"(outptr0), // %0
+                  "=r"(tmpptr),  // %1
+                  "=r"(kptr)     // %2
+                : "0"(outptr0),
+                  "1"(tmpptr),
+                  "2"(kptr),
+                  "r"(inch)      // %6  
+                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+            );
+#else
+            int sum0 = bias0;
+            int sum1 = bias0;
+            int sum2 = bias0;
+            int sum3 = bias0;
+            int sum4 = bias0;
+            int sum5 = bias0;
+            int sum6 = bias0;
+            int sum7 = bias0;
 
-        for (; q+7<inch; q+=8)
-        {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3;
-
-            const signed char* r0 = bottom_blob.channel(q);
-            const signed char* r1 = bottom_blob.channel(q+1);
-            const signed char* r2 = bottom_blob.channel(q+2);
-            const signed char* r3 = bottom_blob.channel(q+3);
-            const signed char* r4 = bottom_blob.channel(q+4);
-            const signed char* r5 = bottom_blob.channel(q+5);
-            const signed char* r6 = bottom_blob.channel(q+6);
-            const signed char* r7 = bottom_blob.channel(q+7);
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char* kernel1 = (const signed char*)kernel + (p+1)*inch + q;
-            const signed char* kernel2 = (const signed char*)kernel + (p+2)*inch + q;
-            const signed char* kernel3 = (const signed char*)kernel + (p+3)*inch + q;
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            if (nn > 0)
+            for (int q=0; q<inch; q++)
             {
-                asm volatile(
-                    "vld1.s8    d18, [%0]   \n"
-                    "vld1.s8    d19, [%1]   \n"
-                    "vld1.s8    d24, [%2]   \n"
-                    "vld1.s8    d25, [%3]   \n"
-                    : "=r"(kernel0), // %0
-                      "=r"(kernel1), // %1
-                      "=r"(kernel2), // %2
-                      "=r"(kernel3)  // %3
-                    : "0"(kernel0),
-                      "1"(kernel1),
-                      "2"(kernel2),
-                      "3"(kernel3)
-                    :
-                );
+                sum0 += tmpptr[0] * kptr[0];
+                sum1 += tmpptr[1] * kptr[0];
+                sum2 += tmpptr[2] * kptr[0];
+                sum3 += tmpptr[3] * kptr[0];
+                sum4 += tmpptr[4] * kptr[0];
+                sum5 += tmpptr[5] * kptr[0];
+                sum6 += tmpptr[6] * kptr[0];
+                sum7 += tmpptr[7] * kptr[0];
 
-                asm volatile(
-                    "0:                            \n"
-                    //ld r0-r7
-                    "pld        [%5, #64]          \n"
-                    "vld1.s8    {d0}, [%5 :64]!    \n" //r0
-
-                    "pld        [%6, #64]          \n"
-                    "vld1.s8    {d1}, [%6 :64]!    \n" //r1
-
-                    "pld        [%7, #64]          \n"
-                    "vld1.s8    {d2}, [%7 :64]!    \n" //r2
-
-                    "pld        [%8, #64]          \n"
-                    "vld1.s8    {d3}, [%8 :64]!    \n" //r3
-
-                    "pld        [%9, #64]          \n"
-                    "vld1.s8    {d4}, [%9 :64]!    \n" //r4
-
-                    "pld        [%10, #64]         \n"
-                    "vld1.s8    {d5}, [%10 :64]!   \n" //r5
-
-                    "pld        [%11, #64]         \n"
-                    "vld1.s8    {d6}, [%11 :64]!   \n" //r6
-
-                    "pld        [%12, #64]         \n"
-                    "vld1.s8    {d7}, [%12 :64]!   \n" //r7
-                    //###########################################
-                    //load inch kernel_0 k0-k7
-                    "vdup.s8    d8, d18[0]          \n"
-                    "vdup.s8    d9, d18[1]          \n"
-                    "vdup.s8    d10, d18[2]         \n"
-                    "vdup.s8    d11, d18[3]         \n"
-                    "vdup.s8    d12, d18[4]         \n"
-                    "vdup.s8    d13, d18[5]         \n"
-                    "vdup.s8    d14, d18[6]         \n"
-                    "vdup.s8    d15, d18[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr0_s32
-                    "pld        [%1, #256]          \n"
-                    "vld1.32    {d20-d23}, [%1:128] \n" //outptr0_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vaddw.s16   q11, q11, d17      \n"
-                    "vst1.32    {d20-d23}, [%1:128]!\n"
-                    //###########################################
-                    //load inch kernel_1 k0-k7
-                    "vdup.s8    d8, d19[0]          \n"
-                    "vdup.s8    d9, d19[1]          \n"
-                    "vdup.s8    d10, d19[2]         \n"
-                    "vdup.s8    d11, d19[3]         \n"
-                    "vdup.s8    d12, d19[4]         \n"
-                    "vdup.s8    d13, d19[5]         \n"
-                    "vdup.s8    d14, d19[6]         \n"
-                    "vdup.s8    d15, d19[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr1_s32
-                    "pld        [%2, #256]          \n"
-                    "vld1.32    {d20-d23}, [%2:128] \n" //outptr1_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vaddw.s16   q11, q11, d17      \n"
-                    "vst1.32    {d20-d23}, [%2:128]!\n"
-                    //############################################
-                    //load inch kernel_2 k0-k7
-                    "vdup.s8    d8, d24[0]          \n"
-                    "vdup.s8    d9, d24[1]          \n"
-                    "vdup.s8    d10, d24[2]         \n"
-                    "vdup.s8    d11, d24[3]         \n"
-                    "vdup.s8    d12, d24[4]         \n"
-                    "vdup.s8    d13, d24[5]         \n"
-                    "vdup.s8    d14, d24[6]         \n"
-                    "vdup.s8    d15, d24[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr2_s32
-                    "pld        [%3, #256]          \n"
-                    "vld1.32    {d20-d23}, [%3:128] \n" //outptr2_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vaddw.s16   q11, q11, d17      \n"
-                    "vst1.32    {d20-d23}, [%3:128]!\n"
-                    //#############################################
-                    //load inch kernel_3 k0-k7
-                    "vdup.s8    d8, d25[0]          \n"
-                    "vdup.s8    d9, d25[1]          \n"
-                    "vdup.s8    d10, d25[2]         \n"
-                    "vdup.s8    d11, d25[3]         \n"
-                    "vdup.s8    d12, d25[4]         \n"
-                    "vdup.s8    d13, d25[5]         \n"
-                    "vdup.s8    d14, d25[6]         \n"
-                    "vdup.s8    d15, d25[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr3_s32
-                    "pld        [%4, #256]          \n"
-                    "vld1.32    {d20-d23}, [%4:128] \n" //outptr3_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vaddw.s16   q11, q11, d17      \n"
-                    "vst1.32    {d20-d23}, [%4:128]!\n"
-
-                    //next
-                    "subs       %0, #1              \n"
-                    "bne        0b                  \n"
-                    : "=r"(nn),          // %0
-                      "=r"(outptr0),     // %1
-                      "=r"(outptr1),     // %2
-                      "=r"(outptr2),     // %3
-                      "=r"(outptr3),     // %4
-                      "=r"(r0),          // %5
-                      "=r"(r1),          // %6
-                      "=r"(r2),          // %7
-                      "=r"(r3),          // %8
-                      "=r"(r4),          // %9
-                      "=r"(r5),          // %10
-                      "=r"(r6),          // %11
-                      "=r"(r7)           // %12
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(outptr1),
-                      "3"(outptr2),
-                      "4"(outptr3),
-                      "5"(r0),
-                      "6"(r1),
-                      "7"(r2),
-                      "8"(r3),
-                      "9"(r4),
-                      "10"(r5),
-                      "11"(r6),
-                      "12"(r7)
-                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q10", "q11", "q13", "q14", "q15"
-                );
+                tmpptr += 8;
+                kptr++;
             }
 
-            if (remain >= 4)
-            {
-                remain -= 4;
-
-                asm volatile(
-                    "0:                            \n"
-                    //ld r0-r7
-                    "pld        [%5, #64]          \n"
-                    "vld1.s8    {d0}, [%5 :64]     \n"  //r0
-
-                    "pld        [%6, #64]          \n"
-                    "vld1.s8    {d1}, [%6 :64]     \n"  //r1
-
-                    "pld        [%7, #64]          \n"
-                    "vld1.s8    {d2}, [%7 :64]     \n"  //r2
-
-                    "pld        [%8, #64]          \n"
-                    "vld1.s8    {d3}, [%8 :64]     \n"  //r3
-
-                    "pld        [%9, #64]          \n"
-                    "vld1.s8    {d4}, [%9 :64]     \n"  //r4
-
-                    "pld        [%10, #64]         \n"
-                    "vld1.s8    {d5}, [%10 :64]    \n"  //r5
-
-                    "pld        [%11, #64]         \n"
-                    "vld1.s8    {d6}, [%11 :64]    \n"  //r6
-
-                    "pld        [%12, #64]         \n"
-                    "vld1.s8    {d7}, [%12 :64]    \n"  //r7
-
-                    "add        %5, #4             \n"
-                    "add        %6, #4             \n"
-                    "add        %7, #4             \n"
-                    "add        %8, #4             \n"
-                    "add        %9, #4             \n"
-                    "add        %10, #4            \n"
-                    "add        %11, #4            \n"
-                    "add        %12, #4            \n"
-                    //###########################################
-                    //load inch kernel_0 k0-k7
-                    "vdup.s8    d8, d18[0]          \n"
-                    "vdup.s8    d9, d18[1]          \n"
-                    "vdup.s8    d10, d18[2]         \n"
-                    "vdup.s8    d11, d18[3]         \n"
-                    "vdup.s8    d12, d18[4]         \n"
-                    "vdup.s8    d13, d18[5]         \n"
-                    "vdup.s8    d14, d18[6]         \n"
-                    "vdup.s8    d15, d18[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr0_s32
-                    "pld        [%1, #128]          \n"
-                    "vld1.32    {d20-d21}, [%1:128] \n" //outptr0_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vst1.32    {d20-d21}, [%1:128]!\n"
-                    //###########################################
-                    //load inch kernel_1 k0-k7
-                    "vdup.s8    d8, d19[0]          \n"
-                    "vdup.s8    d9, d19[1]          \n"
-                    "vdup.s8    d10, d19[2]         \n"
-                    "vdup.s8    d11, d19[3]         \n"
-                    "vdup.s8    d12, d19[4]         \n"
-                    "vdup.s8    d13, d19[5]         \n"
-                    "vdup.s8    d14, d19[6]         \n"
-                    "vdup.s8    d15, d19[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr1_s32
-                    "pld        [%2, #128]          \n"
-                    "vld1.32    {d20-d21}, [%2:128] \n" //outptr1_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vst1.32    {d20-d21}, [%2:128]!\n"
-                    //############################################
-                    //load inch kernel_2 k0-k7
-                    "vdup.s8    d8, d24[0]          \n"
-                    "vdup.s8    d9, d24[1]          \n"
-                    "vdup.s8    d10, d24[2]         \n"
-                    "vdup.s8    d11, d24[3]         \n"
-                    "vdup.s8    d12, d24[4]         \n"
-                    "vdup.s8    d13, d24[5]         \n"
-                    "vdup.s8    d14, d24[6]         \n"
-                    "vdup.s8    d15, d24[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr2_s32
-                    "pld        [%3, #256]          \n"
-                    "vld1.32    {d20-d21}, [%3:128] \n" //outptr2_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vst1.32    {d20-d21}, [%3:128]!\n"
-                    //#############################################
-                    //load inch kernel_3 k0-k7
-                    "vdup.s8    d8, d25[0]          \n"
-                    "vdup.s8    d9, d25[1]          \n"
-                    "vdup.s8    d10, d25[2]         \n"
-                    "vdup.s8    d11, d25[3]         \n"
-                    "vdup.s8    d12, d25[4]         \n"
-                    "vdup.s8    d13, d25[5]         \n"
-                    "vdup.s8    d14, d25[6]         \n"
-                    "vdup.s8    d15, d25[7]         \n"
-
-                    //mla
-                    "vmull.s8   q8, d0, d8          \n"
-                    "vmlal.s8   q8, d1, d9          \n"
-                    "vmlal.s8   q8, d2, d10         \n"
-                    "vmlal.s8   q8, d3, d11         \n"
-                    "vmlal.s8   q8, d4, d12         \n"
-                    "vmlal.s8   q8, d5, d13         \n"
-                    "vmlal.s8   q8, d6, d14         \n"
-                    "vmlal.s8   q8, d7, d15         \n"
-
-                    //outptr3_s32
-                    "pld        [%4, #256]          \n"
-                    "vld1.32    {d20-d21}, [%4:128] \n" //outptr3_s32
-                    "vaddw.s16   q10, q10, d16      \n"
-                    "vst1.32    {d20-d21}, [%4:128]!\n"
-                    : "=r"(nn),          // %0
-                      "=r"(outptr0),     // %1
-                      "=r"(outptr1),     // %2
-                      "=r"(outptr2),     // %3
-                      "=r"(outptr3),     // %4
-                      "=r"(r0),          // %5
-                      "=r"(r1),          // %6
-                      "=r"(r2),          // %7
-                      "=r"(r3),          // %8
-                      "=r"(r4),          // %9
-                      "=r"(r5),          // %10
-                      "=r"(r6),          // %11
-                      "=r"(r7)           // %12
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(outptr1),
-                      "3"(outptr2),
-                      "4"(outptr3),
-                      "5"(r0),
-                      "6"(r1),
-                      "7"(r2),
-                      "8"(r3),
-                      "9"(r4),
-                      "10"(r5),
-                      "11"(r6),
-                      "12"(r7)
-                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q10", "q11"
-                );                
-            }
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+            outptr0[4] = sum4;
+            outptr0[5] = sum5;
+            outptr0[6] = sum6;
+            outptr0[7] = sum7;
 
-            for (; remain>0; remain--)
-            {
-                //ToDo Neon
-                int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7];
-                int sum1 = (int)*r0 * kernel1[0] + *r1 * kernel1[1] + *r2 * kernel1[2] + *r3 * kernel1[3] + *r4 * kernel1[4] + *r5 * kernel1[5] + *r6 * kernel1[6] + *r7 * kernel1[7];
-                int sum2 = (int)*r0 * kernel2[0] + *r1 * kernel2[1] + *r2 * kernel2[2] + *r3 * kernel2[3] + *r4 * kernel2[4] + *r5 * kernel2[5] + *r6 * kernel2[6] + *r7 * kernel2[7];
-                int sum3 = (int)*r0 * kernel3[0] + *r1 * kernel3[1] + *r2 * kernel3[2] + *r3 * kernel3[3] + *r4 * kernel3[4] + *r5 * kernel3[5] + *r6 * kernel3[6] + *r7 * kernel3[7];
-
-                *outptr0 += sum0;
-                *outptr1 += sum1;
-                *outptr2 += sum2;
-                *outptr3 += sum3;
-
-                r0++;
-                r1++;
-                r2++;
-                r3++;
-                r4++;
-                r5++;
-                r6++;
-                r7++;
-                outptr0++;
-                outptr1++;
-                outptr2++;
-                outptr3++;
-            }
-        }
+            outptr0 += 8;
+#endif // __ARM_NEON            
+        }   
 
-        for (; q<inch; q++)
+        for (; i+3<size; i+=4)
         {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3;
-
-            const signed char* img0_s8 = bottom_blob.channel(q);
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char* kernel1 = (const signed char*)kernel + (p+1)*inch + q;
-            const signed char* kernel2 = (const signed char*)kernel + (p+2)*inch + q;
-            const signed char* kernel3 = (const signed char*)kernel + (p+3)*inch + q;
-
-            const signed char k0 = kernel0[0];
-            const signed char k1 = kernel1[0];
-            const signed char k2 = kernel2[0];
-            const signed char k3 = kernel3[0];
-
-            const signed char* r0 = img0_s8;
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
-
-            int8x8_t _k0 = vdup_n_s8(k0);
-            int8x8_t _k1 = vdup_n_s8(k1);
-            int8x8_t _k2 = vdup_n_s8(k2);
-            int8x8_t _k3 = vdup_n_s8(k3);
-
-            if (nn > 0)
-            {
-                asm volatile(
-                    "0:                             \n"
-                    //load r0
-                    "pld        [%5, #64]           \n"
-                    "vld1.s8    {d8}, [%5 :64]!     \n"
-
-                    //mla
-                    "vmull.s8   q5, d8, %12         \n"
-                    //outptr0_s32
-                    "pld        [%1, #256]          \n"
-                    "vld1.32    {d12-d15}, [%1]     \n"
-                    "vmovl.s16  q8, d10             \n"
-                    "vmovl.s16  q9, d11             \n"
-                    "vadd.s32   q6, q8              \n"
-                    "vadd.s32   q7, q9              \n"
-                    "vst1.32    {d12-d15}, [%1]!    \n"
-
-                    //mla
-                    "vmull.s8   q5, d8, %13         \n"
-                    //outptr1_s32
-                    "pld        [%2, #256]          \n"
-                    "vld1.32    {d12-d15}, [%2]     \n"
-                    "vaddw.s16   q6, q6, d10        \n"
-                    "vaddw.s16   q7, q7, d11        \n"
-                    "vst1.32    {d12-d15}, [%2]!    \n"
-
-                    //mla
-                    "vmull.s8   q5, d8, %14         \n"
-                    //outptr0_s32
-                    "pld        [%3, #256]          \n"
-                    "vld1.32    {d12-d15}, [%3]     \n"
-                    "vaddw.s16   q6, q6, d10        \n"
-                    "vaddw.s16   q7, q7, d11        \n"
-                    "vst1.32    {d12-d15}, [%3]!    \n"
-
-                    //mla
-                    "vmull.s8   q5, d8, %15         \n"
-                    //outptr0_s32
-                    "pld        [%4, #256]          \n"
-                    "vld1.32    {d12-d15}, [%4]     \n"
-                    "vaddw.s16   q6, q6, d10        \n"
-                    "vaddw.s16   q7, q7, d11        \n"
-                    "vst1.32    {d12-d15}, [%4]!    \n"
-
-                    "subs       %0, #1              \n"
-                    "bne        0b                  \n"
-                    : "=r"(nn),             // %0
-                      "=r"(outptr0),        // %1
-                      "=r"(outptr1),        // %2
-                      "=r"(outptr2),        // %3
-                      "=r"(outptr3),        // %4
-                      "=r"(r0)              // %5
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(outptr1),
-                      "3"(outptr2),
-                      "4"(outptr3),
-                      "5"(r0),
-                      "w"(_k0),             // %12
-                      "w"(_k1),             // %13
-                      "w"(_k2),             // %14
-                      "w"(_k3)              // %15
-                    : "cc", "memory", "q4", "q5", "q6", "q7", "q8", "q9"
-                );
-            }
+            const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4);   
+            const signed char* kptr = kernel.channel(p/4 + p%4);    
+#if 0 //__ARM_NEON
+            asm volatile(
+                // inch loop
+                "vmov.s32    q6, #0            \n"
 
-            for (; remain>0; remain--)
-            {
-                // TODO neon optimize
-                int sum0 = (int)*r0 * k0;
-                int sum1 = (int)*r0 * k1;
-                int sum2 = (int)*r0 * k2;
-                int sum3 = (int)*r0 * k3;
-
-                *outptr0 += sum0;
-                *outptr1 += sum1;
-                *outptr2 += sum2;
-                *outptr3 += sum3;
-
-                r0++;
-                outptr0++;
-                outptr1++;
-                outptr2++;
-                outptr3++;
-            }
-        }
-    }
+                "lsr         r4, %6, #2        \n"// r4 = nn = inch >> 2
+                "cmp         r4, #0            \n"
+                "beq         1f                \n"
+                
+                "0:                            \n"// for(; nn != 0; nn--)
+                "pld         [%2, #128]        \n"
+                "vld1.s8     {d4-d5}, [%1]!    \n"// tmpr a00-a03,a10-a13,a20-a23,a30-a33    a(inch)(data)
+                "vmovl.s8    q3, d5            \n"// a20-a23,a30-a33
+                "vmovl.s8    q2, d4            \n"// a00-a03,a10-a13
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p=remain_outch_start; p<outch; p++)
-    {
-        Mat out0 = top_blob.channel(p);
+                "vld1.s8     {d0}, [%2]        \n"// kptr k00,k01,k02,k03    k(outch)(inch)
+                "vmovl.s8    q0, d0            \n"// k00,k01,k02,k03
+                "add         %2, #4            \n"
 
-        out0.fill(0);
+                "vmlal.s16   q6, d4, d0[0]     \n"// (a00-a03) * k00
+                "vmlal.s16   q6, d5, d0[1]     \n"// (a10-a13) * k01
+                "vmlal.s16   q6, d6, d0[2]     \n"// (a20-a23) * k02
+                "vmlal.s16   q6, d7, d0[3]     \n"// (a30-a33) * k03
 
-        int q = 0;
+                "subs        r4, r4, #1        \n"
+                "bne         0b                \n"// end for
+ 
+                "1:                            \n"
+                // remain loop
+                "and         r4, %6, #3        \n"// r4 = remain = inch & 3
+                "cmp         r4, #0            \n"
+                "beq         3f                \n"
 
-        for (; q+7<inch; q+=8)
-        {
-            int* outptr0 = out0;
+                "2:                            \n"// for(; remain != 0; remain--)
+                "vld1.s8     {d2}, [%1]        \n"// tmpr a00-a03    a(inch)(data)
+                "vld1.s8     {d0}, [%2]        \n"// kptr k00        k(outch)(inch)
+                "vmovl.s8    q1, d2            \n"
+                "vmovl.s8    q0, d0            \n"
+                "add         %1, #4            \n"
+                "add         %2, #1            \n"
 
-            const signed char* r0 = bottom_blob.channel(q);
-            const signed char* r1 = bottom_blob.channel(q+1);
-            const signed char* r2 = bottom_blob.channel(q+2);
-            const signed char* r3 = bottom_blob.channel(q+3);
-            const signed char* r4 = bottom_blob.channel(q+4);
-            const signed char* r5 = bottom_blob.channel(q+5);
-            const signed char* r6 = bottom_blob.channel(q+6);
-            const signed char* r7 = bottom_blob.channel(q+7);
+                "vmlal.s16   q6, d2, d0[0]     \n"// (a00-a03) * k00
 
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
+                "subs        r4, r4, #1        \n"
+                "bne         2b                \n"
 
-            int size = outw * outh;
+                "3:                            \n"// store the result to memory
+                "vst1.s32    {d12-d13}, [%0]!  \n"
 
-            int nn = size >> 3;
-            int remain = size & 7;
+                : "=r"(outptr0), // %0
+                  "=r"(tmpptr),  // %1
+                  "=r"(kptr)     // %2
+                : "0"(outptr0),
+                  "1"(tmpptr),
+                  "2"(kptr),
+                  "r"(inch)      // %6  
+                : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6"
+            );
+#else
+            int sum0 = bias0;
+            int sum1 = bias0;
+            int sum2 = bias0;
+            int sum3 = bias0;
 
-            if (nn > 0)
+            for (int q=0; q<inch; q++)
             {
-                //load inch kernel_0 k0-k7
-                asm volatile(
-                    "vld1.s8    d18, [%0]   \n"
-                    : "=r"(kernel0) // %0
-                    : "0" (kernel0)
-                    :
-                );
+                sum0 += tmpptr[0] * kptr[0];
+                sum1 += tmpptr[1] * kptr[0];
+                sum2 += tmpptr[2] * kptr[0];
+                sum3 += tmpptr[3] * kptr[0];
 
-                asm volatile(
-                    "0:                            \n"
-                    //ld r0-r7
-                    "pld        [%2, #64]          \n"
-                    "vld1.s8    {d0}, [%2 :64]!    \n"  //r0
-                    "pld        [%3, #64]          \n"
-                    "vld1.s8    {d1}, [%3 :64]!    \n"  //r1
-                    "pld        [%4, #64]          \n"
-                    "vld1.s8    {d2}, [%4 :64]!    \n"  //r2
-                    "pld        [%5, #64]          \n"
-                    "vld1.s8    {d3}, [%5 :64]!    \n"  //r3
-                    "pld        [%6, #64]          \n"
-                    "vld1.s8    {d4}, [%6 :64]!    \n"  //r4
-                    "pld        [%7, #64]          \n"
-                    "vld1.s8    {d5}, [%7 :64]!    \n"  //r5
-                    "pld        [%8, #64]          \n"
-                    "vld1.s8    {d6}, [%8 :64]!    \n"  //r6
-                    "pld        [%9, #64]          \n"
-                    "vld1.s8    {d7}, [%9 :64]!    \n"  //r7
-
-                    //load inch kernel_0 k0-k7
-                    "vdup.s8    d8, d18[0]          \n"
-                    "vdup.s8    d9, d18[1]          \n"
-                    "vdup.s8    d10, d18[2]         \n"
-                    "vdup.s8    d11, d18[3]         \n"
-                    "vdup.s8    d12, d18[4]         \n"
-                    "vdup.s8    d13, d18[5]         \n"
-                    "vdup.s8    d14, d18[6]         \n"
-                    "vdup.s8    d15, d18[7]         \n"
-
-                    //mla
-                    "vmull.s8   q14, d0,    d8          \n"
-                    "vmlal.s8   q14, d1,    d9          \n"
-                    "vmlal.s8   q14, d2,    d10         \n"
-                    "vmlal.s8   q14, d3,    d11         \n"
-                    "vmlal.s8   q14, d4,    d12         \n"
-                    "vmlal.s8   q14, d5,    d13         \n"
-                    "vmlal.s8   q14, d6,    d14         \n"
-                    "vmlal.s8   q14, d7,    d15         \n"
-
-                    //outptr0_s32
-                    "pld        [%1, #256]          \n"
-                    "vld1.32    {d20-d23}, [%1]     \n" //outptr0_s32
-                    "vaddw.s16   q10, q10, d28      \n"
-                    "vaddw.s16   q11, q11, d29      \n"
-                    "vst1.32    {d20-d23}, [%1]!    \n"
-
-                    //next
-                    "subs       %0, #1              \n"
-                    "bne        0b                  \n"
-                    : "=r"(nn),          // %0
-                      "=r"(outptr0),     // %1
-                      "=r"(r0),          // %2
-                      "=r"(r1),          // %3
-                      "=r"(r2),          // %4
-                      "=r"(r3),          // %5
-                      "=r"(r4),          // %6
-                      "=r"(r5),          // %7
-                      "=r"(r6),          // %8
-                      "=r"(r7)           // %9
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(r0),
-                      "3"(r1),
-                      "4"(r2),
-                      "5"(r3),
-                      "6"(r4),
-                      "7"(r5),
-                      "8"(r6),
-                      "9"(r7)
-                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q10", "q11", "q12", "q13", "q14"
-                );
+                tmpptr += 4;
+                kptr++;
             }
 
-            for (; remain>0; remain--)
-            {
-                //ToDo Neon
-                int sum0 = (int)*r0 * kernel0[0] + *r1 * kernel0[1] + *r2 * kernel0[2] + *r3 * kernel0[3] + *r4 * kernel0[4] + *r5 * kernel0[5] + *r6 * kernel0[6] + *r7 * kernel0[7];
-
-                *outptr0 += sum0;
-
-                r0++;
-                r1++;
-                r2++;
-                r3++;
-                r4++;
-                r5++;
-                r6++;
-                r7++;
-                outptr0++;
-            }
+            outptr0[0] = sum0;
+            outptr0[1] = sum1;
+            outptr0[2] = sum2;
+            outptr0[3] = sum3;
+
+            outptr0 += 4;
+#endif // __ARM_NEON
         }
 
-        for (; q<inch; q++)
+        for (; i<size; i++)
         {
-            int* outptr0 = out0;
-
-            const signed char* img0_s8 = bottom_blob.channel(q);
-            const signed char* r0 = img0_s8;
-
-            const signed char* kernel0 = (const signed char*)kernel + p*inch + q;
-            const signed char k0 = kernel0[0];
-
-            int size = outw * outh;
-
-            int nn = size >> 3;
-            int remain = size & 7;
+            const signed char* tmpptr = tmp.channel(i/8 + (i%8)/4 + i%4);   
+            const signed char* kptr = kernel.channel(p/4 + p%4);
 
-            int8x8_t _k0 = vdup_n_s8(k0);
+            int q = 0;            
+            int sum0 = bias0;
 
-            if (nn > 0)
+            for (; q<inch; q++)
             {
-                asm volatile(
-                    "0:                             \n"
-                    //load r0
-                    "pld        [%2, #64]           \n"
-                    "vld1.s8    {d8}, [%2 :64]!     \n"
-
-                    //mla
-                    "vmull.s8   q10, d8, %6         \n"
-                    //outptr0_s32
-                    "pld        [%1, #256]          \n"
-                    "vld1.32    {d12-d15}, [%1]     \n"
-                    "vaddw.s16   q6, q6, d20        \n"
-                    "vaddw.s16   q7, q7, d21        \n"
-                    "vst1.32    {d12-d15}, [%1]!    \n"
-
-                    "subs       %0, #1              \n"
-                    "bne        0b                  \n"
-                    : "=r"(nn),             // %0
-                      "=r"(outptr0),        // %1
-                      "=r"(r0)              // %2
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(r0),
-                      "w"(_k0)              // %6
-                    : "cc", "memory", "q4", "q10", "q7", "q8", "q9"
-                );
+                sum0 += tmpptr[0] * kptr[0];
+                tmpptr++;
+                kptr++;
             }
 
-            for (; remain>0; remain--)
-            {
-                int sum0 = (int)*r0 * k0;
-
-                *outptr0 += sum0;
+            outptr0[0] = sum0;
 
-                r0++;
-                outptr0++;
-            }
+            outptr0++;
         }
-    }
-}
+    }  
 
+//     // NOTE sgemm int8
+//     for (; p<outch; p++)
+//     {
+//         Mat out0 = top_blob.channel(p);
+//
+//         int* outptr0 = out0;
+//
+//         for (int i=0; i<size; i++)
+//         {
+//             int sum = 0;
+//
+//             const signed char* kptr = _kernel.channel(p/8 + p%8);
+//
+//             for (int q=0; q<inch; q++)
+//             {
+//                 const signed char* img0 = bottom_blob.channel(q);
+//
+//                 sum += img0[i] * kptr[0];
+//                 kptr ++;
+//             }
+//
+//             outptr0[i] = sum;
+//         }
+//     }
+}
+#else // __aarch64__
+/*
+ * 1x1 convolution on quantized int8 data, computed as an sgemm-style matrix multiply
+ */
 static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Option& opt)
 {
     int w = bottom_blob.w;
@@ -4166,7 +965,7 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
     const int size = w * h;
 
     // interleave
-    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 1u);
+    Mat tmp(8*4, inch/4+inch%4, size/8 + (size%8)/4 + size%4, 1u, opt.workspace_allocator);
     {
         int nn_size = size >> 3;
         int remain_size_start = nn_size << 3;
@@ -4390,41 +1189,41 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
                 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
             );
 #else
-            int sum0_0 = biasptr[0];
-            int sum0_1 = biasptr[0];
-            int sum0_2 = biasptr[0];
-            int sum0_3 = biasptr[0];
-            int sum0_4 = biasptr[0];
-            int sum0_5 = biasptr[0];
-            int sum0_6 = biasptr[0];
-            int sum0_7 = biasptr[0];
-
-            int sum1_0 = biasptr[1];
-            int sum1_1 = biasptr[1];
-            int sum1_2 = biasptr[1];
-            int sum1_3 = biasptr[1];
-            int sum1_4 = biasptr[1];
-            int sum1_5 = biasptr[1];
-            int sum1_6 = biasptr[1];
-            int sum1_7 = biasptr[1];
-
-            int sum2_0 = biasptr[2];
-            int sum2_1 = biasptr[2];
-            int sum2_2 = biasptr[2];
-            int sum2_3 = biasptr[2];
-            int sum2_4 = biasptr[2];
-            int sum2_5 = biasptr[2];
-            int sum2_6 = biasptr[2];
-            int sum2_7 = biasptr[2];
-
-            int sum3_0 = biasptr[3];
-            int sum3_1 = biasptr[3];
-            int sum3_2 = biasptr[3];
-            int sum3_3 = biasptr[3];
-            int sum3_4 = biasptr[3];
-            int sum3_5 = biasptr[3];
-            int sum3_6 = biasptr[3];
-            int sum3_7 = biasptr[3];
+            int sum0_0 = 0;
+            int sum0_1 = 0;
+            int sum0_2 = 0;
+            int sum0_3 = 0;
+            int sum0_4 = 0;
+            int sum0_5 = 0;
+            int sum0_6 = 0;
+            int sum0_7 = 0;
+
+            int sum1_0 = 0;
+            int sum1_1 = 0;
+            int sum1_2 = 0;
+            int sum1_3 = 0;
+            int sum1_4 = 0;
+            int sum1_5 = 0;
+            int sum1_6 = 0;
+            int sum1_7 = 0;
+
+            int sum2_0 = 0;
+            int sum2_1 = 0;
+            int sum2_2 = 0;
+            int sum2_3 = 0;
+            int sum2_4 = 0;
+            int sum2_5 = 0;
+            int sum2_6 = 0;
+            int sum2_7 = 0;
+
+            int sum3_0 = 0;
+            int sum3_1 = 0;
+            int sum3_2 = 0;
+            int sum3_3 = 0;
+            int sum3_4 = 0;
+            int sum3_5 = 0;
+            int sum3_6 = 0;
+            int sum3_7 = 0;
 
             for (int q=0; q<inch; q++)
             {
@@ -4604,25 +1403,25 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
                 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
             );
 #else
-            int sum0_0 = biasptr[0];
-            int sum0_1 = biasptr[0];
-            int sum0_2 = biasptr[0];
-            int sum0_3 = biasptr[0];
-
-            int sum1_0 = biasptr[1];
-            int sum1_1 = biasptr[1];
-            int sum1_2 = biasptr[1];
-            int sum1_3 = biasptr[1];
-
-            int sum2_0 = biasptr[2];
-            int sum2_1 = biasptr[2];
-            int sum2_2 = biasptr[2];
-            int sum2_3 = biasptr[2];
-
-            int sum3_0 = biasptr[3];
-            int sum3_1 = biasptr[3];
-            int sum3_2 = biasptr[3];
-            int sum3_3 = biasptr[3];
+            int sum0_0 = 0;
+            int sum0_1 = 0;
+            int sum0_2 = 0;
+            int sum0_3 = 0;
+
+            int sum1_0 = 0;
+            int sum1_1 = 0;
+            int sum1_2 = 0;
+            int sum1_3 = 0;
+
+            int sum2_0 = 0;
+            int sum2_1 = 0;
+            int sum2_2 = 0;
+            int sum2_3 = 0;
+
+            int sum3_0 = 0;
+            int sum3_1 = 0;
+            int sum3_2 = 0;
+            int sum3_3 = 0;
 
             for (int q=0; q<inch; q++)
             {
@@ -4757,10 +1556,10 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
                 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
             );    
 #else
-            int sum0 = biasptr[0];
-            int sum1 = biasptr[1];
-            int sum2 = biasptr[2];
-            int sum3 = biasptr[3];
+            int sum0 = 0;
+            int sum1 = 0;
+            int sum2 = 0;
+            int sum3 = 0;
 
             for (int q=0; q<inch; q++)
             {
@@ -4814,7 +1613,7 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
                 "beq         1f                \n"
                 
                 "0:                            \n"// for(; nn != 0; nn--)
-                "pld         [%2, #128]        \n"
+                "pld         [%1, #128]        \n"
                 "vld1.s8     {d4-d7}, [%1]!    \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37    a(inch)(data)
                 "vmovl.s8    q5, d7            \n"// a30-a37
                 "vmovl.s8    q4, d6            \n"// a20-a27
@@ -5042,103 +1841,24 @@ static void conv1x1s1_sgemm_int8_neon(const Mat& bottom_blob, Mat& top_blob, con
 }
 #endif // __aarch64__
 
-static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
-    int w = bottom_blob.w;
-    int inch = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
-    int outch = top_blob.c;
-
-    const int tailstep = w - 2*outw + w;
-    const signed char *kernel = _kernel;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p = 0; p < outch; p++)
-    {
-        Mat out0 = top_blob.channel(p);
-
-        out0.fill(0);
-
-        int q = 0;
-
-        for (; q+7<inch; q+=8)
-        {
-            int* outptr0 = out0;
-
-            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+    int kernel_w = 1; // thin wrapper: fixes a 1x1 kernel and stride 1, then forwards all arguments unchanged
+    int kernel_h = 1;
 
-            const signed char *r0 = bottom_blob.channel(q);
-            const signed char *r1 = bottom_blob.channel(q + 1);
-            const signed char *r2 = bottom_blob.channel(q + 2);
-            const signed char *r3 = bottom_blob.channel(q + 3);
-            const signed char *r4 = bottom_blob.channel(q + 4);
-            const signed char *r5 = bottom_blob.channel(q + 5);
-            const signed char *r6 = bottom_blob.channel(q + 6);
-            const signed char *r7 = bottom_blob.channel(q + 7);
+    int stride_w = 1; // stride 1 in both directions
+    int stride_h = 1;
 
-            for(int i = 0; i < outh; i++)
-            {
-                int remain = outw;
-
-                for (; remain > 0; remain--)
-                {
-                    //ToDo Neon
-                    int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
-                            (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
-                            (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
-                            (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
-
-                    *outptr0 += sum0;
-
-                    r0 += 2;
-                    r1 += 2;
-                    r2 += 2;
-                    r3 += 2;
-                    r4 += 2;
-                    r5 += 2;
-                    r6 += 2;
-                    r7 += 2;
-                    outptr0++;
-                }
-
-                r0 += tailstep;
-                r1 += tailstep;
-                r2 += tailstep;
-                r3 += tailstep;
-                r4 += tailstep;
-                r5 += tailstep;
-                r6 += tailstep;
-                r7 += tailstep;
-            }
-        }
-
-        for (; q<inch; q++)
-        {
-            int* outptr0 = out0;
-
-            const signed char *r0 = bottom_blob.channel(q);
-
-            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
-
-            for(int i = 0; i < outh; i++)
-            {
-                int remain = outw;
-
-                for (; remain > 0; remain--)
-                {
-                    //ToDo Neon
-                    int sum0 = (int)*r0 * (int)kernel0[0];
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); // helper is defined outside this hunk; NOTE(review): _kernel presumably must already be in the layout that helper expects - confirm at the call site
+}
 
-                    *outptr0 += sum0;
+static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{ // thin wrapper: fixes a 1x1 kernel and stride 2, then forwards all arguments unchanged
+    int kernel_w = 1;
+    int kernel_h = 1;
 
-                    r0 += 2;
-                    outptr0++;
-                }
+    int stride_w = 2; // stride 2 in both directions
+    int stride_h = 2;
 
-                r0 += tailstep;
-            }
-        }
-    }
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel, kernel_w, kernel_h, stride_w, stride_h, opt); // helper is defined outside this hunk; NOTE(review): _kernel presumably must already be in the layout that helper expects - confirm at the call site
 }
diff --git a/src/layer/arm/convolution_3x3_int8.h b/src/layer/arm/convolution_3x3_int8.h
index 759b2eade..857053c51 100644
--- a/src/layer/arm/convolution_3x3_int8.h
+++ b/src/layer/arm/convolution_3x3_int8.h
@@ -69,6 +69,185 @@ static void conv3x3s1_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel
     }
 }
 
+static void conv3x3s1_winograd23_transform_kernel_int8_neon(const Mat& kernel, std::vector<Mat> &kernel_tm2, int inch, int outch)
+{ // Pre-transforms each 3x3 int8 kernel into a 4x4 int16 tile, then appends four repacked Mats (one per r in 0..3) to kernel_tm2.
+    Mat kernel_tm(4*4, inch, outch, 2ul);  // scratch: 16 two-byte values per (output channel p, input channel q)
+    // kernel is read as a flat signed char array of outch*inch*9 weights; inch/outch are the input/output channel counts.
+    // G: 4x3 integer matrix applied along both kernel axes below. NOTE(review): looks like 2x the usual Winograd F(2,3) G (integer-only) - confirm
+    const short ktm[4][3] = {
+        {   2,     0,     0},
+        {   1,     1,     1},
+        {   1,    -1,     1},
+        {   0,     0,     2}
+    };
+
+    #pragma omp parallel for // no num_threads clause here: this function receives no Option
+    for (int p = 0; p<outch; p++)
+    {
+        for (int q = 0; q<inch; q++)
+        {
+            const signed char* kernel0 = (const signed char*)kernel + p*inch * 9 + q * 9; // the 9 weights of kernel (p, q)
+            short* kernel_tm0 = kernel_tm.channel(p).row<short>(q); // destination: the 16 shorts for (p, q)
+
+            // transform kernel: k0/k1/k2 point at the three rows of the 3x3 kernel
+            const signed char* k0 = kernel0;
+            const signed char* k1 = kernel0 + 3;
+            const signed char* k2 = kernel0 + 6;
+
+            // h: tmp[i][c] = dot(ktm row i, kernel row c)
+            short tmp[4][3];
+            for (int i=0; i<4; i++)
+            {
+                tmp[i][0] = (short)k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                tmp[i][1] = (short)k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                tmp[i][2] = (short)k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+            }
+
+            // U: kernel_tm0[j*4 + i] = dot(tmp row j, ktm row i), i.e. the finished 4x4 tile stored row by row
+            for (int j=0; j<4; j++)
+            {
+                short* tmpp = &tmp[j][0];
+
+                for (int i=0; i<4; i++)
+                {
+                    kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                }
+            }
+        }
+    }
+    // Repack: for each r, gather shorts r*4..r*4+3 of every tile, interleaving output channels in blocks of 8, then 4, then 1.
+    for (int r=0; r<4; r++)
+    {
+        Mat kernel_tm_test(4*8, inch, outch/8 + (outch%8)/4 + outch%4, 2u); // one channel per block of 8 output channels, per block of 4, and per leftover channel
+
+        int p = 0;
+        for (; p+7<outch; p+=8)
+        {
+            const short* kernel0 = (const short*)kernel_tm + (p+0)*inch*16; // flat indexing with inch*16 shorts per output channel; assumes kernel_tm channels are contiguous - TODO confirm
+            const short* kernel1 = (const short*)kernel_tm + (p+1)*inch*16;
+            const short* kernel2 = (const short*)kernel_tm + (p+2)*inch*16;
+            const short* kernel3 = (const short*)kernel_tm + (p+3)*inch*16;
+            const short* kernel4 = (const short*)kernel_tm + (p+4)*inch*16;
+            const short* kernel5 = (const short*)kernel_tm + (p+5)*inch*16;
+            const short* kernel6 = (const short*)kernel_tm + (p+6)*inch*16;
+            const short* kernel7 = (const short*)kernel_tm + (p+7)*inch*16;
+
+            short* ktmp = kernel_tm_test.channel(p/8);
+
+            for (int q=0; q<inch; q++) // per input channel: 8 x 4 shorts, output channels p..p+7 back to back
+            {
+                ktmp[0] = kernel0[r*4+0];
+                ktmp[1] = kernel0[r*4+1];
+                ktmp[2] = kernel0[r*4+2];
+                ktmp[3] = kernel0[r*4+3];
+
+                ktmp[4] = kernel1[r*4+0];
+                ktmp[5] = kernel1[r*4+1];
+                ktmp[6] = kernel1[r*4+2];
+                ktmp[7] = kernel1[r*4+3];
+
+                ktmp[8] = kernel2[r*4+0];
+                ktmp[9] = kernel2[r*4+1];
+                ktmp[10] = kernel2[r*4+2];
+                ktmp[11] = kernel2[r*4+3];
+
+                ktmp[12] = kernel3[r*4+0];
+                ktmp[13] = kernel3[r*4+1];
+                ktmp[14] = kernel3[r*4+2];
+                ktmp[15] = kernel3[r*4+3];
+
+                ktmp[16] = kernel4[r*4+0];
+                ktmp[17] = kernel4[r*4+1];
+                ktmp[18] = kernel4[r*4+2];
+                ktmp[19] = kernel4[r*4+3];
+
+                ktmp[20] = kernel5[r*4+0];
+                ktmp[21] = kernel5[r*4+1];
+                ktmp[22] = kernel5[r*4+2];
+                ktmp[23] = kernel5[r*4+3];
+
+                ktmp[24] = kernel6[r*4+0];
+                ktmp[25] = kernel6[r*4+1];
+                ktmp[26] = kernel6[r*4+2];
+                ktmp[27] = kernel6[r*4+3];
+
+                ktmp[28] = kernel7[r*4+0];
+                ktmp[29] = kernel7[r*4+1];
+                ktmp[30] = kernel7[r*4+2];
+                ktmp[31] = kernel7[r*4+3];
+
+                ktmp += 32;
+                kernel0 += 16; // step every source pointer to the next input channel's 16-short tile
+                kernel1 += 16;
+                kernel2 += 16;
+                kernel3 += 16;
+                kernel4 += 16;
+                kernel5 += 16;
+                kernel6 += 16;
+                kernel7 += 16;
+            }
+        }
+
+        for (; p+3<outch; p+=4) // remaining blocks of 4 output channels: 16 shorts per input channel
+        {
+            const short* kernel0 = (const short*)kernel_tm + (p+0)*inch*16;
+            const short* kernel1 = (const short*)kernel_tm + (p+1)*inch*16;
+            const short* kernel2 = (const short*)kernel_tm + (p+2)*inch*16;
+            const short* kernel3 = (const short*)kernel_tm + (p+3)*inch*16;
+
+            short* ktmp = kernel_tm_test.channel(p/8 + (p%8)/4); // channel index continues after the 8-wide blocks
+
+            for (int q=0; q<inch; q++)
+            {
+                ktmp[0] = kernel0[r*4+0];
+                ktmp[1] = kernel0[r*4+1];
+                ktmp[2] = kernel0[r*4+2];
+                ktmp[3] = kernel0[r*4+3];
+
+                ktmp[4] = kernel1[r*4+0];
+                ktmp[5] = kernel1[r*4+1];
+                ktmp[6] = kernel1[r*4+2];
+                ktmp[7] = kernel1[r*4+3];
+
+                ktmp[8] = kernel2[r*4+0];
+                ktmp[9] = kernel2[r*4+1];
+                ktmp[10] = kernel2[r*4+2];
+                ktmp[11] = kernel2[r*4+3];
+
+                ktmp[12] = kernel3[r*4+0];
+                ktmp[13] = kernel3[r*4+1];
+                ktmp[14] = kernel3[r*4+2];
+                ktmp[15] = kernel3[r*4+3];                             
+
+                ktmp += 16;
+                kernel0 += 16;
+                kernel1 += 16;
+                kernel2 += 16;
+                kernel3 += 16;                
+            }
+        }
+
+        for (; p<outch; p++) // leftover output channels, one at a time: 4 shorts per input channel
+        {
+            const short* kernel0 = (const short*)kernel_tm + p*inch*16;
+
+            short* ktmp = kernel_tm_test.channel(p/8 + (p%8)/4 + p%4); // channel index continues after the 8-wide and 4-wide blocks
+
+            for (int q=0; q<inch; q++)
+            {
+                ktmp[0] = kernel0[r*4+0];
+                ktmp[1] = kernel0[r*4+1];
+                ktmp[2] = kernel0[r*4+2];
+                ktmp[3] = kernel0[r*4+3];
+
+                ktmp += 4;
+                kernel0 += 16;
+            }        
+        }
+        kernel_tm2.push_back(kernel_tm_test); // appended at index r only if kernel_tm2 was empty on entry; it is not cleared here - verify callers
+    }
+}
+
 static void conv3x3s2_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel_tm, int inch, int outch)
 {
     kernel_tm.create(8*9, inch, outch/8 + outch%8, (size_t)1u);
@@ -134,1220 +313,1433 @@ static void conv3x3s2_transform_kernel_int8_neon(const Mat& _kernel, Mat& kernel
 }
 
 #if __aarch64__
-static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+static void conv3x3s1_winograd23_int8_neon(const Mat& bottom_blob, Mat& top_blob, const std::vector<Mat> &kernel_tm_test, const Option& opt)
 {
     int w = bottom_blob.w;
+    int h = bottom_blob.h;
     int inch = bottom_blob.c;
 
     int outw = top_blob.w;
     int outh = top_blob.h;
     int outch = top_blob.c;
 
-    const signed char* kernel = _kernel;
+    // pad to 2n+2, winograd F(2,3)
+    Mat bottom_blob_bordered = bottom_blob;
 
-    int nn_outch = outch >> 1;
-    int remain_outch_start = nn_outch << 1; 
+    outw = (outw + 1) / 2 * 2;
+    outh = (outh + 1) / 2 * 2;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp < nn_outch; pp++)
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);  
+    
+    // double start = ncnn::get_current_time();
+    // BEGIN transform input
+    Mat bottom_blob_tm;
     {
-        int p = pp * 2;
-
-        Mat out0 = top_blob.channel(p);
-        Mat out1 = top_blob.channel(p+1);
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-        out0.fill(0);
-        out1.fill(0);
-
-        const signed char* kernel0 = (const signed char *)kernel + p * inch * 9;
-        const signed char* kernel1 = (const signed char *)kernel + (p + 1) * inch * 9;
-        
-        for (int q=0; q<inch; q++)
-        {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr0n = outptr0 + outw;
-            int* outptr1n = outptr1 + outw;
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-            const signed char* img0 = bottom_blob.channel(q);
+        const int tiles = nColBlocks * nRowBlocks;
 
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
-            const signed char* r3 = img0 + w * 3;
+        bottom_blob_tm.create(4, inch, tiles*4, 2u, opt.workspace_allocator);
 
-            int i = 0;
+        // BT
+        // const float itm[4][4] = {
+        //     {1.0f,  0.0f, -1.0f,  0.0f},
+        //     {0.0f,  1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  0.00f, 1.0f}
+        // };        
 
-            int8x8_t _k00 = vdup_n_s8(kernel0[0]);
-            int8x8_t _k01 = vdup_n_s8(kernel0[1]);
-            int8x8_t _k02 = vdup_n_s8(kernel0[2]);
-            int8x8_t _k03 = vdup_n_s8(kernel0[3]);
-            int8x8_t _k04 = vdup_n_s8(kernel0[4]);
-            int8x8_t _k05 = vdup_n_s8(kernel0[5]);
-            int8x8_t _k06 = vdup_n_s8(kernel0[6]);
-            int8x8_t _k07 = vdup_n_s8(kernel0[7]);
-            int8x8_t _k08 = vdup_n_s8(kernel0[8]);
-
-            int8x8_t _k10 = vdup_n_s8(kernel1[0]);
-            int8x8_t _k11 = vdup_n_s8(kernel1[1]);
-            int8x8_t _k12 = vdup_n_s8(kernel1[2]);
-            int8x8_t _k13 = vdup_n_s8(kernel1[3]);
-            int8x8_t _k14 = vdup_n_s8(kernel1[4]);
-            int8x8_t _k15 = vdup_n_s8(kernel1[5]);
-            int8x8_t _k16 = vdup_n_s8(kernel1[6]);
-            int8x8_t _k17 = vdup_n_s8(kernel1[7]);
-            int8x8_t _k18 = vdup_n_s8(kernel1[8]); 
+        for (int q=0; q<inch; q++)
+        {
+            const signed char* img = bottom_blob_bordered.channel(q);
 
-            for (; i+1 < outh; i+=2)
+            for (int j=0; j<nColBlocks; j++)
             {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+                const signed char* r0 = img + w * j * 2;
+                const signed char* r1 = r0 + w;
+                const signed char* r2 = r1 + w;
+                const signed char* r3 = r2 + w;
 
-                for (; nn > 0; nn--)
+                for (int i = 0; i<nRowBlocks; i++)
                 {
-                    // outch 0
-                    int8x8_t _r0 = vld1_s8(r0);
-                    int8x8_t _r0n = vld1_s8(r0+8);
-                    int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
-                    int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
-
-                    int16x8_t _sum0 = vmull_s8(_r0, _k00);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k01);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k02);
-
-                    int8x8_t _r1 = vld1_s8(r1);
-                    int8x8_t _r1n = vld1_s8(r1+8);
-                    int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
-                    int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r1, _k03);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k04);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k05);
-
-                    int16x8_t _sum1 = vmull_s8(_r1, _k00);
-                    _sum1 = vmlal_s8(_sum1, _r11, _k01);
-                    _sum1 = vmlal_s8(_sum1, _r12, _k02);
-
-                    int8x8_t _r2 = vld1_s8(r2);
-                    int8x8_t _r2n = vld1_s8(r2+8);
-                    int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
-                    int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r2, _k06);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k07);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k08);
-
-                    _sum1 = vmlal_s8(_sum1, _r2, _k03);
-                    _sum1 = vmlal_s8(_sum1, _r21, _k04);
-                    _sum1 = vmlal_s8(_sum1, _r22, _k05);
-
-                    int8x8_t _r3 = vld1_s8(r3);
-                    int8x8_t _r3n = vld1_s8(r3+8);
-                    int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
-                    int8x8_t _r32 = vext_s8(_r3, _r3n, 2);
-                    _sum1 = vmlal_s8(_sum1, _r3, _k06);
-                    _sum1 = vmlal_s8(_sum1, _r31, _k07);
-                    _sum1 = vmlal_s8(_sum1, _r32, _k08);
-
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    int32x4_t sum0n_s32 = vld1q_s32(outptr0+4);
-
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0));
-
-                    vst1q_s32(outptr0, sum0_s32);
-                    vst1q_s32(outptr0+4, sum0n_s32);
-
-                    int32x4_t sum1_s32 = vld1q_s32(outptr0n);
-                    int32x4_t sum1n_s32 = vld1q_s32(outptr0n+4);
-
-                    sum1_s32 = vaddw_s16(sum1_s32, vget_low_s16(_sum1));
-                    sum1n_s32 = vaddw_s16(sum1n_s32, vget_high_s16(_sum1));
+                    short* out_tm0 = bottom_blob_tm.channel(tiles*0+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm1 = bottom_blob_tm.channel(tiles*1+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm2 = bottom_blob_tm.channel(tiles*2+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm3 = bottom_blob_tm.channel(tiles*3+j*nRowBlocks+i).row<short>(q);
+
+                    short d0[4],d1[4],d2[4],d3[4];
+                    short w0[4],w1[4],w2[4],w3[4];
+                    short t0[4],t1[4],t2[4],t3[4];
+                    // load 
+                    for (int n = 0; n < 4; n++)
+                    {
+                        d0[n] = r0[n];
+                        d1[n] = r1[n];
+                        d2[n] = r2[n];
+                        d3[n] = r3[n];
+                    }
+                    // w = B_t * d
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        w0[n] = d0[n] - d2[n];
+                        w1[n] = d1[n] + d2[n];
+                        w2[n] = d2[n] - d1[n];
+                        w3[n] = d3[n] - d1[n];
+                    } 
+                    // transpose d to d_t
+                    {
+                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
+                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
+                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
+                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
+                    }
+                    // U = B_t * d_t
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        d0[n] = t0[n] - t2[n];
+                        d1[n] = t1[n] + t2[n];
+                        d2[n] = t2[n] - t1[n];
+                        d3[n] = t3[n] - t1[n];
+                    }                
+                    // save to out_tm
+                    for (int n = 0; n < 4; n++)
+                    {
+                        out_tm0[n] = d0[n];
+                        out_tm1[n] = d1[n];
+                        out_tm2[n] = d2[n];
+                        out_tm3[n] = d3[n];
+                    }
+                        
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;    
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat();
 
-                    vst1q_s32(outptr0n, sum1_s32);
-                    vst1q_s32(outptr0n+4, sum1n_s32);
+    // double end = ncnn::get_current_time();
+    // printf("trans A : %.3f ms\n", end - start);
+    // start = ncnn::get_current_time();
 
-                    // outch 1
-                    _sum0 = vmull_s8(_r0, _k10);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k11);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k12);
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-                    _sum0 = vmlal_s8(_sum0, _r1, _k13);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k14);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k15);
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-                    _sum0 = vmlal_s8(_sum0, _r2, _k16);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k17);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k18);
+        const int tiles = nColBlocks * nRowBlocks; 
 
-                    _sum1 = vmull_s8(_r1, _k10);
-                    _sum1 = vmlal_s8(_sum1, _r11, _k11);
-                    _sum1 = vmlal_s8(_sum1, _r12, _k12);
+        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
 
-                    _sum1 = vmlal_s8(_sum1, _r2, _k13);
-                    _sum1 = vmlal_s8(_sum1, _r21, _k14);
-                    _sum1 = vmlal_s8(_sum1, _r22, _k15);
+        for (int r=0; r<4; r++)
+        {
+            int nn_outch = 0;
+            int remain_outch_start = 0;
 
-                    _sum1 = vmlal_s8(_sum1, _r3, _k16);
-                    _sum1 = vmlal_s8(_sum1, _r31, _k17);
-                    _sum1 = vmlal_s8(_sum1, _r32, _k18);
+            nn_outch = outch >> 3;
+            remain_outch_start = nn_outch << 3;
 
-                    sum0_s32 = vld1q_s32(outptr1);
-                    sum0n_s32 = vld1q_s32(outptr1+4);
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int pp=0; pp<nn_outch; pp++)
+            {
+                int p = pp * 8;
+
+                int* output0_tm = top_blob_tm.channel(p);
+                int* output1_tm = top_blob_tm.channel(p+1);
+                int* output2_tm = top_blob_tm.channel(p+2);
+                int* output3_tm = top_blob_tm.channel(p+3);
+                int* output4_tm = top_blob_tm.channel(p+4);
+                int* output5_tm = top_blob_tm.channel(p+5);
+                int* output6_tm = top_blob_tm.channel(p+6);
+                int* output7_tm = top_blob_tm.channel(p+7);
+
+                output0_tm = output0_tm + r*4;
+                output1_tm = output1_tm + r*4;
+                output2_tm = output2_tm + r*4;
+                output3_tm = output3_tm + r*4;
+                output4_tm = output4_tm + r*4;
+                output5_tm = output5_tm + r*4;
+                output6_tm = output6_tm + r*4;
+                output7_tm = output7_tm + r*4;
+
+                for (int i=0; i<tiles; i++)
+                {
+                    const short* kptr = kernel_tm_test[r].channel(p/8);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
+
+                    int sum0[4] = {0};
+                    int sum1[4] = {0};
+                    int sum2[4] = {0};
+                    int sum3[4] = {0};
+                    int sum4[4] = {0};
+                    int sum5[4] = {0};
+                    int sum6[4] = {0};
+                    int sum7[4] = {0};
+
+                    for (int q=0; q<inch; q++)
+                    {
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                            sum1[n] += (int)r0[n] * kptr[n+4];
+                            sum2[n] += (int)r0[n] * kptr[n+8];
+                            sum3[n] += (int)r0[n] * kptr[n+12];
+                            sum4[n] += (int)r0[n] * kptr[n+16];
+                            sum5[n] += (int)r0[n] * kptr[n+20];
+                            sum6[n] += (int)r0[n] * kptr[n+24];
+                            sum7[n] += (int)r0[n] * kptr[n+28];
+                        }
+                        kptr += 32;
+                        r0 += 4;
+                    }
 
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0));
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                        output1_tm[n] = sum1[n];
+                        output2_tm[n] = sum2[n];
+                        output3_tm[n] = sum3[n];
+                        output4_tm[n] = sum4[n];
+                        output5_tm[n] = sum5[n];
+                        output6_tm[n] = sum6[n];
+                        output7_tm[n] = sum7[n];
+                    }
 
-                    vst1q_s32(outptr1, sum0_s32);
-                    vst1q_s32(outptr1+4, sum0n_s32);
+                    output0_tm += 16;
+                    output1_tm += 16;
+                    output2_tm += 16;
+                    output3_tm += 16;
+                    output4_tm += 16;
+                    output5_tm += 16;
+                    output6_tm += 16;
+                    output7_tm += 16;
+                }
+            }
 
-                    sum1_s32 = vld1q_s32(outptr1n);
-                    sum1n_s32 = vld1q_s32(outptr1n+4);
+            nn_outch = (outch - remain_outch_start) >> 2;
 
-                    sum1_s32 = vaddw_s16(sum1_s32, vget_low_s16(_sum1));
-                    sum1n_s32 = vaddw_s16(sum1n_s32, vget_high_s16(_sum1));
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int pp=0; pp<nn_outch; pp++)
+            {
+                int p = remain_outch_start + pp * 4;
 
-                    vst1q_s32(outptr1n, sum1_s32);
-                    vst1q_s32(outptr1n+4, sum1n_s32);
+                int* output0_tm = top_blob_tm.channel(p);
+                int* output1_tm = top_blob_tm.channel(p+1);
+                int* output2_tm = top_blob_tm.channel(p+2);
+                int* output3_tm = top_blob_tm.channel(p+3);
 
-                    r0 += 8;
-                    r1 += 8;
-                    r2 += 8;
-                    r3 += 8;
-                    outptr0 += 8;
-                    outptr1 += 8;
-                    outptr0n += 8;
-                    outptr1n += 8;
-                }
+                output0_tm = output0_tm + r*4;
+                output1_tm = output1_tm + r*4;
+                output2_tm = output2_tm + r*4;
+                output3_tm = output3_tm + r*4;
 
-                for (; remain>0; remain--)
+                for (int i=0; i<tiles; i++)
                 {
-                    int sum0 = 0;
-                    int sum0n = 0;
-                    int sum1 = 0;
-                    int sum1n = 0;
-
-                    //ToDo Neon
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
-
-                    sum0n += (int)r1[0] * kernel0[0];
-                    sum0n += (int)r1[1] * kernel0[1];
-                    sum0n += (int)r1[2] * kernel0[2];
-                    sum0n += (int)r2[0] * kernel0[3];
-                    sum0n += (int)r2[1] * kernel0[4];
-                    sum0n += (int)r2[2] * kernel0[5];
-                    sum0n += (int)r3[0] * kernel0[6];
-                    sum0n += (int)r3[1] * kernel0[7];
-                    sum0n += (int)r3[2] * kernel0[8];
-
-                    sum1n += (int)r1[0] * kernel1[0];
-                    sum1n += (int)r1[1] * kernel1[1];
-                    sum1n += (int)r1[2] * kernel1[2];
-                    sum1n += (int)r2[0] * kernel1[3];
-                    sum1n += (int)r2[1] * kernel1[4];
-                    sum1n += (int)r2[2] * kernel1[5];
-                    sum1n += (int)r3[0] * kernel1[6];
-                    sum1n += (int)r3[1] * kernel1[7];
-                    sum1n += (int)r3[2] * kernel1[8];
+                    const short* kptr = kernel_tm_test[r].channel(p/8 + (p%8)/4);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
+
+                    int sum0[4] = {0};
+                    int sum1[4] = {0};
+                    int sum2[4] = {0};
+                    int sum3[4] = {0};
+
+                    for (int q=0; q<inch; q++)
+                    {   
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                            sum1[n] += (int)r0[n] * kptr[n+4];
+                            sum2[n] += (int)r0[n] * kptr[n+8];
+                            sum3[n] += (int)r0[n] * kptr[n+12];
+                        }
+                        kptr += 16;
+                        r0 += 4;
+                    }
 
-                    *outptr0 += sum0;
-                    *outptr1 += sum1;
-                    *outptr0n += sum0n;
-                    *outptr1n += sum1n;
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                        output1_tm[n] = sum1[n];
+                        output2_tm[n] = sum2[n];
+                        output3_tm[n] = sum3[n];
+                    }
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    r3++;
-                    outptr0++;
-                    outptr1++;
-                    outptr0n++;
-                    outptr1n++;
+                    output0_tm += 16;
+                    output1_tm += 16;
+                    output2_tm += 16;
+                    output3_tm += 16;
                 }
-
-                r0 += 2 + w;
-                r1 += 2 + w;
-                r2 += 2 + w;
-                r3 += 2 + w;
-
-                outptr0 += outw;
-                outptr1 += outw;
-                outptr0n += outw;
-                outptr1n += outw;
             }
 
-            for (; i < outh; i++)
-            {
-                int nn = outw >> 3;
-                int remain = outw & 7;
-
-                for (; nn > 0; nn--)
-                {
-                    // outch 0
-                    int8x8_t _r0 = vld1_s8(r0);
-                    int8x8_t _r0n = vld1_s8(r0+8);
-                    int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
-                    int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
-
-                    int16x8_t _sum0 = vmull_s8(_r0, _k00);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k01);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k02);
-
-                    int8x8_t _r1 = vld1_s8(r1);
-                    int8x8_t _r1n = vld1_s8(r1+8);
-                    int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
-                    int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r1, _k03);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k04);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k05);
-
-                    int8x8_t _r2 = vld1_s8(r2);
-                    int8x8_t _r2n = vld1_s8(r2+8);
-                    int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
-                    int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r2, _k06);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k07);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k08);
-
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    int32x4_t sum0n_s32 = vld1q_s32(outptr0+4);
+            remain_outch_start += nn_outch << 2;
 
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0));
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=remain_outch_start; p<outch; p++)
+            {
+                int* output0_tm = top_blob_tm.channel(p);
 
-                    vst1q_s32(outptr0, sum0_s32);
-                    vst1q_s32(outptr0+4, sum0n_s32);
+                output0_tm = output0_tm + r*4;
 
-                    // outch 1
-                    _sum0 = vmull_s8(_r0, _k10);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k11);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k12);
+                for (int i=0; i<tiles; i++)
+                {
+                    const short* kptr = kernel_tm_test[r].channel(p/8 + (p%8)/4 + p%4);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
 
-                    _sum0 = vmlal_s8(_sum0, _r1, _k13);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k14);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k15);
+                    int sum0[4] = {0};
 
-                    _sum0 = vmlal_s8(_sum0, _r2, _k16);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k17);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k18);
+                    for (int q=0; q<inch; q++)
+                    {
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                        }
+                        kptr += 4; 
+                        r0 += 4;
+                    }
 
-                    sum0_s32 = vld1q_s32(outptr1);
-                    sum0n_s32 = vld1q_s32(outptr1+4);
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                    }
+                    output0_tm += 16;
+                }
+            }
+        }   
+    }
+    bottom_blob_tm = Mat();
+    // END dot    
+
+    // end = ncnn::get_current_time();
+    // printf("dot B   : %.3f ms\n", end - start);
+    // start = ncnn::get_current_time();
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
+    {
+        // AT
+        // const float itm[2][4] = {
+        //     {1.0f,  1.0f,  1.0f,  0.0f},
+        //     {0.0f,  1.0f, -1.0f,  1.0f}
+        // }; 
 
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0));
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-                    vst1q_s32(outptr1, sum0_s32);
-                    vst1q_s32(outptr1+4, sum0n_s32);
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-                    r0 += 8;
-                    r1 += 8;
-                    r2 += 8;
-                    outptr0 += 8;
-                    outptr1 += 8;
-                }
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<outch; p++)
+        {
+            int* out_tile = top_blob_tm.channel(p);
+            int* outRow0 = top_blob_bordered.channel(p);
+            int* outRow1 = outRow0 + outw;     
 
-                for (; remain>0; remain--)
+            for (int j=0; j<nColBlocks; j++)
+            {
+                for(int i=0; i<nRowBlocks; i++)
                 {
-                    int sum0 = 0;
-                    int sum1 = 0;
-
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
+                    int s0[4],s1[4],s2[4],s3[4];
+                    int w0[4],w1[4];
+                    int d0[2],d1[2],d2[2],d3[2];
+                    int o0[2],o1[2];
+                    // load
+                    for (int n = 0; n < 4; n++)
+                    {
+                        s0[n] = out_tile[n];
+                        s1[n] = out_tile[n+ 4];
+                        s2[n] = out_tile[n+ 8];
+                        s3[n] = out_tile[n+12];
+                    }
+                    // w = A_T * W
+                    for (int n = 0; n < 4; n++)
+                    {
+                        w0[n] = s0[n] + s1[n] + s2[n];
+                        w1[n] = s1[n] - s2[n] + s3[n];
+                    }
+                    // transpose w to w_t
+                    {
+                        d0[0] = w0[0]; d0[1] = w1[0];
+                        d1[0] = w0[1]; d1[1] = w1[1];
+                        d2[0] = w0[2]; d2[1] = w1[2];
+                        d3[0] = w0[3]; d3[1] = w1[3];
+                    }
+                    // Y = A_T * w_t
+                    for (int n = 0; n < 2; n++)
+                    {
+                        o0[n] = d0[n] + d1[n] + d2[n];
+                        o1[n] = d1[n] - d2[n] + d3[n];
+                    }
+                    // store the 2x2 result into the bordered output; shift right by 2 because G' = G*2
+                    outRow0[0] = o0[0] >> 2;
+                    outRow0[1] = o0[1] >> 2;
+                    outRow1[0] = o1[0] >> 2;
+                    outRow1[1] = o1[1] >> 2;
 
-                    *outptr0 += sum0;
-                    *outptr1 += sum1;
+                    out_tile += 16;
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    outptr0++;
-                    outptr1++;
+                    outRow0 += 2;
+                    outRow1 += 2;
                 }
 
-                r0 += 2;
-                r1 += 2;
-                r2 += 2;
+                outRow0 += outw;
+                outRow1 += outw;
             }
-
-            kernel0 += 9;
-            kernel1 += 9;
-        }
+        }        
     }
+    // END transform output 
+    // end = ncnn::get_current_time();
+    // printf("trans C : %.3f ms\n", end - start);
+    
+    // cut result pad
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);  
+}
+
+static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    int nn_outch = outch >> 2;
+    int remain_outch_start = nn_outch << 2;
 
     #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p=remain_outch_start; p<outch; p++)
+    for (int pp=0; pp < nn_outch; pp++)
     {
-        Mat out0 = top_blob.channel(p);
+        int p = pp * 4;
+
+        Mat out0 = top_blob.channel(p+0);
+        Mat out1 = top_blob.channel(p+1);
+        Mat out2 = top_blob.channel(p+2);
+        Mat out3 = top_blob.channel(p+3);
 
         out0.fill(0);
+        out1.fill(0);
+        out2.fill(0);
+        out3.fill(0);
 
-        const signed char* kernel0 = (const signed char *)kernel + p * inch * 9;
+        const signed char* ktmp = _kernel.channel(p/4);
 
-        for (int q=0; q<inch; q++)
-        {                   
+        for (int q = 0; q < inch; q++)
+        {
             int* outptr0 = out0;
-            int* outptr0n = outptr0 + outw;
-        
-            const signed char* img0 = bottom_blob.channel(q);
-            
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
-            const signed char* r3 = img0 + w * 3;
+            int* outptr1 = out1;
+            int* outptr2 = out2;
+            int* outptr3 = out3;
 
-            int i = 0;
+            const signed char *img0 = bottom_blob.channel(q);
+
+            const signed char *r0 = img0;
+            const signed char *r1 = img0 + w;
+            const signed char *r2 = img0 + w * 2;
+
+            int i = 0;    
+
+            for (; i < outh; i++)
+            {
+#if 0 //__ARM_NEON
+                int nn = outw >> 3;
+                int remain = outw & 7;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if 0 //__ARM_NEON
+                if (nn > 0)
+                {
+                asm volatile(
+                    "0:                         \n"
+                    "vld1.s8    {d0-d3}, [%8]!  \n"// d0=(k00-k30 k01-k31) d1=(k02-k32 k03-k33) d2=(k04-k34 k05-k35) d3=(k06-k36 k07-k37)
+                    // r0
+                    "pld        [%5, #128]      \n"
+                    "vld1.s8    {d8-d9}, [%5]   \n"// d8=r00-r07 d9=r08-r015 q4
+                    "add        %5, #8          \n"
+                    "pld        [%1, #128]      \n"
+                    "vld1.s32   {d12-d15}, [%1] \n"// sum00-sum07 q6 q7
+                    "pld        [%2, #128]      \n"
+                    "vld1.s32   {d16-d19}, [%2] \n"// sum10-sum17 q8 q9
+                    "pld        [%3, #128]      \n"
+                    "vld1.s32   {d20-d23}, [%3] \n"// sum20-sum27 q10 q11
+                    "pld        [%4, #128]      \n"
+                    "vld1.s32   {d24-d27}, [%4] \n"// sum30-sum37 q12 q13
+                    
+                    "vmovl.s8   q3, d3          \n"// d6(k06-k36) d7(k07-k37) 
+                    "vmovl.s8   q2, d2          \n"// d4(k04-k34) d5(k05-k35)
+                    "vmovl.s8   q1, d1          \n"// d2(k02-k32) d3(k03-k33)
+                    "vmovl.s8   q0, d0          \n"// d0(k00-k30) d1(k01-k31)
+                    "vmovl.s8   q5, d8          \n"// d10(r00-r03) d11(r04-r07)
+                    
+                    "vmlal.s16  q6, d10, d0[0]  \n"// sum(00-07) += (r00-r07) * k00
+                    "vmlal.s16  q7, d11, d0[0]  \n"
+                    "vmlal.s16  q8, d10, d0[1]  \n"// sum(10-17) += (r00-r07) * k10
+                    "vmlal.s16  q9, d11, d0[1]  \n"     
+                    "vmlal.s16  q10, d10, d0[2] \n"// sum(20-27) += (r00-r07) * k20
+                    "vmlal.s16  q11, d11, d0[2] \n"
+                    "vmlal.s16  q12, d10, d0[3] \n"// sum(30-37) += (r00-r07) * k30
+                    "vmlal.s16  q13, d11, d0[3] \n"
+
+                    "vext.s8    q4, q4, #1      \n"// d8=r01-r08 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r01-r04) d11(r05-r08)
+                    
+                    "vmlal.s16  q6, d10, d1[0]  \n"// sum(00-07) += (r01-r08) * k01
+                    "vmlal.s16  q7, d11, d1[0]  \n"
+                    "vmlal.s16  q8, d10, d1[1]  \n"// sum(10-17) += (r01-r08) * k11
+                    "vmlal.s16  q9, d11, d1[1]  \n"     
+                    "vmlal.s16  q10, d10, d1[2] \n"// sum(20-27) += (r01-r08) * k21
+                    "vmlal.s16  q11, d11, d1[2] \n"
+                    "vmlal.s16  q12, d10, d1[3] \n"// sum(30-37) += (r01-r08) * k31
+                    "vmlal.s16  q13, d11, d1[3] \n"
+                    
+                    "vext.s8    q4, q4, #1      \n"// d8=r02-r09 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r02-r05) d11(r06-r09)
+                    
+                    "vmlal.s16  q6, d10, d2[0]  \n"// sum(00-07) += (r02-r09) * k02
+                    "vmlal.s16  q7, d11, d2[0]  \n"
+                    "vmlal.s16  q8, d10, d2[1]  \n"// sum(10-17) += (r02-r09) * k12
+                    "vmlal.s16  q9, d11, d2[1]  \n"     
+                    "vmlal.s16  q10, d10, d2[2] \n"// sum(20-27) += (r02-r09) * k22
+                    "vmlal.s16  q11, d11, d2[2] \n"
+                    "vmlal.s16  q12, d10, d2[3] \n"// sum(30-37) += (r02-r09) * k32
+                    "vmlal.s16  q13, d11, d2[3] \n"                 
+                    
+                    // r1
+                    "pld        [%6, #128]      \n"
+                    "vld1.s8    {d8-d9}, [%6]   \n"// d8=r10-r17 d9=r18-r115 q4
+                    "add        %6, #8          \n"
+                    "vmovl.s8   q5, d8          \n"// d10(r10-r13) d11(r14-r17)
+                    
+                    "vmlal.s16  q6, d10, d3[0]  \n"// sum(00-07) += (r10-r17) * k03
+                    "vmlal.s16  q7, d11, d3[0]  \n"
+                    "vmlal.s16  q8, d10, d3[1]  \n"// sum(10-17) += (r10-r17) * k13
+                    "vmlal.s16  q9, d11, d3[1]  \n"     
+                    "vmlal.s16  q10, d10, d3[2] \n"// sum(20-27) += (r10-r17) * k23
+                    "vmlal.s16  q11, d11, d3[2] \n"
+                    "vmlal.s16  q12, d10, d3[3] \n"// sum(30-37) += (r10-r17) * k33
+                    "vmlal.s16  q13, d11, d3[3] \n"
+                    
+                    "vext.s8    q4, q4, #1      \n"// d8=r11-r18 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r11-r14) d11(r15-r18)
+
+                    "vmlal.s16  q6, d10, d4[0]  \n"// sum(00-07) += (r11-r18) * k04
+                    "vmlal.s16  q7, d11, d4[0]  \n"
+                    "vmlal.s16  q8, d10, d4[1]  \n"// sum(10-17) += (r11-r18) * k14
+                    "vmlal.s16  q9, d11, d4[1]  \n"     
+                    "vmlal.s16  q10, d10, d4[2] \n"// sum(20-27) += (r11-r18) * k24
+                    "vmlal.s16  q11, d11, d4[2] \n"
+                    "vmlal.s16  q12, d10, d4[3] \n"// sum(30-37) += (r11-r18) * k34
+                    "vmlal.s16  q13, d11, d4[3] \n"
+                    
+                    "vext.s8    q4, q4, #1      \n"// d8=r12-r19 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r12-r15) d11(r16-r19)
+
+                    "vmlal.s16  q6, d10, d5[0]  \n"// sum(00-07) += (r12-r19) * k05
+                    "vmlal.s16  q7, d11, d5[0]  \n"
+                    "vmlal.s16  q8, d10, d5[1]  \n"// sum(10-17) += (r12-r19) * k15
+                    "vmlal.s16  q9, d11, d5[1]  \n"     
+                    "vmlal.s16  q10, d10, d5[2] \n"// sum(20-27) += (r12-r19) * k25
+                    "vmlal.s16  q11, d11, d5[2] \n"
+                    "vmlal.s16  q12, d10, d5[3] \n"// sum(30-37) += (r12-r19) * k35
+                    "vmlal.s16  q13, d11, d5[3] \n"
+
+                    // r2
+                    "pld        [%7, #128]      \n"
+                    "vld1.s8    {d8-d9}, [%7]   \n"// d8=r20-r27 d9=r28-r215 q4
+                    "add        %7, #8          \n"
+                    "vmovl.s8   q5, d8          \n"// d10(r20-r23) d11(r24-r27)
+
+                    "vmlal.s16  q6, d10, d6[0]  \n"// sum(00-07) += (r20-r27) * k06
+                    "vmlal.s16  q7, d11, d6[0]  \n"
+                    "vmlal.s16  q8, d10, d6[1]  \n"// sum(10-17) += (r20-r27) * k16
+                    "vmlal.s16  q9, d11, d6[1]  \n"     
+                    "vmlal.s16  q10, d10, d6[2] \n"// sum(20-27) += (r20-r27) * k26
+                    "vmlal.s16  q11, d11, d6[2] \n"
+                    "vmlal.s16  q12, d10, d6[3] \n"// sum(30-37) += (r20-r27) * k36
+                    "vmlal.s16  q13, d11, d6[3] \n"
+                    
+                    "vext.s8    q4, q4, #1      \n"// d8=r21-r28 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r21-r24) d11(r25-r28)
+
+                    "vmlal.s16  q6, d10, d7[0]  \n"// sum(00-07) += (r21-r28) * k07
+                    "vmlal.s16  q7, d11, d7[0]  \n"
+                    "vmlal.s16  q8, d10, d7[1]  \n"// sum(10-17) += (r21-r28) * k17
+                    "vmlal.s16  q9, d11, d7[1]  \n"     
+                    "vmlal.s16  q10, d10, d7[2] \n"// sum(20-27) += (r21-r28) * k27
+                    "vmlal.s16  q11, d11, d7[2] \n"
+                    "vmlal.s16  q12, d10, d7[3] \n"// sum(30-37) += (r21-r28) * k37
+                    "vmlal.s16  q13, d11, d7[3] \n"
+                    
+                    "vld1.s8    {d0}, [%8]      \n"// d0(k08-k38 xx-xx)
+                    "add        %8, #4          \n"
+                    "vmovl.s8   q0, d0          \n"// d0(k08-k38) d1(xx-xx)
+
+                    "vext.s8    q4, q4, #1      \n"// d8=r22-r29 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r22-r25) d11(r26-r29)
+
+                    "vmlal.s16  q6, d10, d0[0]  \n"// sum(00-07) += (r22-r29) * k08
+                    "vmlal.s16  q7, d11, d0[0]  \n"
+                    "vmlal.s16  q8, d10, d0[1]  \n"// sum(10-17) += (r22-r29) * k18
+                    "vmlal.s16  q9, d11, d0[1]  \n"     
+                    "vmlal.s16  q10, d10, d0[2] \n"// sum(20-27) += (r22-r29) * k28
+                    "vmlal.s16  q11, d11, d0[2] \n"
+                    "vmlal.s16  q12, d10, d0[3] \n"// sum(30-37) += (r22-r29) * k38
+                    "vmlal.s16  q13, d11, d0[3] \n"
+                    
+                    "vst1.s32   {d12-d15}, [%1]! \n"// sum00-sum07 q6 q7
+                    "vst1.s32   {d16-d19}, [%2]! \n"// sum10-sum17 q8 q9
+                    "vst1.s32   {d20-d23}, [%3]! \n"// sum20-sum27 q10 q11
+                    "vst1.s32   {d24-d27}, [%4]! \n"// sum30-sum37 q12 q13
+
+                    "sub        %8, #36          \n"
+                    "subs       %0, #1           \n"
+
+                    "bne        0b               \n"
+
+                    : "=r"(nn),         // %0
+                      "=r"(outptr0),    // %1
+                      "=r"(outptr1),    // %2
+                      "=r"(outptr2),    // %3
+                      "=r"(outptr3),    // %4
+                      "=r"(r0),         // %5
+                      "=r"(r1),         // %6
+                      "=r"(r2),         // %7
+                      "=r"(ktmp)        // %8
+                    : "0"(nn),
+                      "1"(outptr0),
+                      "2"(outptr1),
+                      "3"(outptr2),
+                      "4"(outptr3),
+                      "5"(r0),
+                      "6"(r1),
+                      "7"(r2),
+                      "8"(ktmp)
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" //q14 q15 not be used...
+                );
+                }
+#endif
+#if 0 //__ARM_NEON
+                if (remain >= 4)
+                {
+                    remain -= 4;
+                asm volatile(
+                    "vld1.s8    {d0-d3}, [%7]!  \n"// d0=(k00-k30 k01-k31) d1=(k02-k32 k03-k33) d2=(k04-k34 k05-k35) d3=(k06-k36 k07-k37)
+                    // r0
+                    "vld1.s8    {d8}, [%4]      \n"// d8=r00-r07
+                    "add        %4, #4          \n"
+                    "vld1.s32   {d12-d13}, [%0] \n"// sum00-sum03 q6
+                    "vld1.s32   {d16-d17}, [%1] \n"// sum10-sum13 q8
+                    "vld1.s32   {d20-d21}, [%2] \n"// sum20-sum23 q10
+                    "vld1.s32   {d24-d25}, [%3] \n"// sum30-sum33 q12
+                    
+                    "vmovl.s8   q3, d3          \n"// d6(k06-k36) d7(k07-k37) 
+                    "vmovl.s8   q2, d2          \n"// d4(k04-k34) d5(k05-k35)
+                    "vmovl.s8   q1, d1          \n"// d2(k02-k32) d3(k03-k33)
+                    "vmovl.s8   q0, d0          \n"// d0(k00-k30) d1(k01-k31)
+                    "vmovl.s8   q5, d8          \n"// d10(r00-r03)
+                    
+                    "vmlal.s16  q6, d10, d0[0]  \n"// sum(00-03) += (r00-r03) * k00
+                    "vmlal.s16  q8, d10, d0[1]  \n"// sum(10-13) += (r00-r03) * k10
+                    "vmlal.s16  q10, d10, d0[2] \n"// sum(20-23) += (r00-r03) * k20
+                    "vmlal.s16  q12, d10, d0[3] \n"// sum(30-33) += (r00-r03) * k30
+
+                    "vext.s8    d8, d8, #1      \n"// d8=r01-r08
+                    "vmovl.s8   q5, d8          \n"// d10(r01-r04)
+                    
+                    "vmlal.s16  q6, d10, d1[0]  \n"// sum(00-03) += (r01-r04) * k01
+                    "vmlal.s16  q8, d10, d1[1]  \n"// sum(10-13) += (r01-r04) * k11
+                    "vmlal.s16  q10, d10, d1[2] \n"// sum(20-23) += (r01-r04) * k21
+                    "vmlal.s16  q12, d10, d1[3] \n"// sum(30-33) += (r01-r04) * k31
+                    
+                    "vext.s8    d8, d8, #1      \n"// d8=r02-r09
+                    "vmovl.s8   q5, d8          \n"// d10(r02-r05)
+                    
+                    "vmlal.s16  q6, d10, d2[0]  \n"// sum(00-03) += (r02-r05) * k02
+                    "vmlal.s16  q8, d10, d2[1]  \n"// sum(10-13) += (r02-r05) * k12
+                    "vmlal.s16  q10, d10, d2[2] \n"// sum(20-23) += (r02-r05) * k22
+                    "vmlal.s16  q12, d10, d2[3] \n"// sum(30-33) += (r02-r05) * k32
+                    
+                    // r1
+                    "vld1.s8    {d8}, [%5]      \n"// d8=r10-r17
+                    "add        %5, #4          \n"
+                    "vmovl.s8   q5, d8          \n"// d10(r10-r13)
+                    
+                    "vmlal.s16  q6, d10, d3[0]  \n"// sum(00-03) += (r10-r13) * k03
+                    "vmlal.s16  q8, d10, d3[1]  \n"// sum(10-13) += (r10-r13) * k13
+                    "vmlal.s16  q10, d10, d3[2] \n"// sum(20-23) += (r10-r13) * k23
+                    "vmlal.s16  q12, d10, d3[3] \n"// sum(30-33) += (r10-r13) * k33
+                    
+                    "vext.s8    d8, d8, #1      \n"// d8=r11-r18
+                    "vmovl.s8   q5, d8          \n"// d10(r11-r14)
+
+                    "vmlal.s16  q6, d10, d4[0]  \n"// sum(00-03) += (r11-r14) * k04
+                    "vmlal.s16  q8, d10, d4[1]  \n"// sum(10-13) += (r11-r14) * k14
+                    "vmlal.s16  q10, d10, d4[2] \n"// sum(20-23) += (r11-r14) * k24
+                    "vmlal.s16  q12, d10, d4[3] \n"// sum(30-33) += (r11-r14) * k34
+                    
+                    "vext.s8    d8, d8, #1      \n"// d8=r12-r19 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r12-r15)
+
+                    "vmlal.s16  q6, d10, d5[0]  \n"// sum(00-03) += (r12-r15) * k05
+                    "vmlal.s16  q8, d10, d5[1]  \n"// sum(10-13) += (r12-r15) * k15
+                    "vmlal.s16  q10, d10, d5[2] \n"// sum(20-23) += (r12-r15) * k25
+                    "vmlal.s16  q12, d10, d5[3] \n"// sum(30-33) += (r12-r15) * k35
+
+                    // r2
+                    "vld1.s8    {d8}, [%6]      \n"// d8=r20-r27
+                    "add        %6, #4          \n"
+                    "vmovl.s8   q5, d8          \n"// d10(r20-r23)
+
+                    "vmlal.s16  q6, d10, d6[0]  \n"// sum(00-03) += (r20-r23) * k06
+                    "vmlal.s16  q8, d10, d6[1]  \n"// sum(10-13) += (r20-r23) * k16
+                    "vmlal.s16  q10, d10, d6[2] \n"// sum(20-23) += (r20-r23) * k26
+                    "vmlal.s16  q12, d10, d6[3] \n"// sum(30-33) += (r20-r23) * k36
+                    
+                    "vext.s8    q4, q4, #1      \n"// d8=r21-r28 q4
+                    "vmovl.s8   q5, d8          \n"// d10(r21-r24)
+
+                    "vmlal.s16  q6, d10, d7[0]  \n"// sum(00-03) += (r21-r24) * k07
+                    "vmlal.s16  q8, d10, d7[1]  \n"// sum(10-13) += (r21-r24) * k17
+                    "vmlal.s16  q10, d10, d7[2] \n"// sum(20-23) += (r21-r24) * k27
+                    "vmlal.s16  q12, d10, d7[3] \n"// sum(30-33) += (r21-r24) * k37
+                    
+                    "vld1.s8    {d0}, [%7]      \n"// d0(k08-k38 xx-xx)
+                    "add        %7, #4          \n"
+                    "vmovl.s8   q0, d0          \n"// d0(k08-k38) d1(xx-xx)
+
+                    "vext.s8    d8, d8, #1      \n"// d8=r22-r25
+                    "vmovl.s8   q5, d8          \n"// d10(r22-r25)
+
+                    "vmlal.s16  q6, d10, d0[0]  \n"// sum(00-03) += (r22-r25) * k08
+                    "vmlal.s16  q8, d10, d0[1]  \n"// sum(10-13) += (r22-r25) * k18
+                    "vmlal.s16  q10, d10, d0[2] \n"// sum(20-23) += (r22-r25) * k28
+                    "vmlal.s16  q12, d10, d0[3] \n"// sum(30-33) += (r22-r25) * k38
+                    
+                    "vst1.s32   {d12-d13}, [%0]! \n"// sum00-sum03 q6
+                    "vst1.s32   {d16-d17}, [%1]! \n"// sum10-sum13 q8
+                    "vst1.s32   {d20-d21}, [%2]! \n"// sum20-sum23 q10
+                    "vst1.s32   {d24-d25}, [%3]! \n"// sum30-sum33 q12 
+
+                    "sub        %7, #36          \n"
+
+                    : "=r"(outptr0),    // %0
+                      "=r"(outptr1),    // %1
+                      "=r"(outptr2),    // %2
+                      "=r"(outptr3),    // %3
+                      "=r"(r0),         // %4
+                      "=r"(r1),         // %5
+                      "=r"(r2),         // %6
+                      "=r"(ktmp)        // %7
+                    : "0"(outptr0),
+                      "1"(outptr1),
+                      "2"(outptr2),
+                      "3"(outptr3),
+                      "4"(r0),
+                      "5"(r1),
+                      "6"(r2),
+                      "7"(ktmp)
+                    : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13" //q14 q15 not be used...
+                );
+                }
+#endif
+                for (; remain>0; remain--)
+                {
+#if 0 //__ARM_NEON
+                    asm volatile(
+                        "vld1.s8    {d0[]}, [%4]!       \n"// d0(r00)
+                        "vld1.s8    {d1[]}, [%4]!       \n"// d1(r01)
+
+                        "vld1.s8    {d4-d7}, [%7]!      \n"// d4(k00-k30 k01-k31) d5(k02-k32 k03-k33) d6(k04-k34 k05-k35) d7(k06-k36 k07-k37)
+
+                        "vsli.64    d0, d1, #32         \n"// d0(r00 r00 r00 r00 r01 r01 r01 r01)
+
+                        "vld1.s8    {d2[]}, [%4]        \n"// d2(r02 r02 r02 r02 r02 r02 r02 r02)
+                        "sub        %4, %4, #2          \n"
+                        "vld1.s8    {d3[]}, [%5]!       \n"// d3(r10 r10 r10 r10 r10 r10 r10 r10)
+                        
+                        "vmovl.s8   q5, d7              \n"// d10(k06-k36) d11(k07-k37)
+                        "vmovl.s8   q4, d6              \n"// d8(k04-k34) d9(k05-k35)
+                        "vmovl.s8   q3, d5              \n"// d6(k02-k32) d7(k03-k33)
+                        "vmovl.s8   q2, d4              \n"// d4(k00-k30) d5(k01-k31)
+                        
+                        "vmovl.s8   q0, d0              \n"// d0(r00 r00 r00 r00) d1(r01 r01 r01 r01)
+
+                        "vsli.64    d2, d3, #32         \n"// d2(r02 r02 r02 r02 r10 r10 r10 r10)
+                        
+                        "vmull.s16  q8, d0, d4          \n"// (r00) * (k00-k30)
+                        "vmull.s16  q9, d1, d5          \n"// (r01) * (k01-k31)
+                        
+                        "vmovl.s8   q10, d2             \n"// d20(r02 r02 r02 r02) d21(r10 r10 r10 r10)
+
+                        "vld1.s8    {d0[]}, [%5]!       \n"// d0(r11 r11 r11 r11 r11 r11 r11 r11)
+                        "vld1.s8    {d1[]}, [%5]        \n"// d1(r12 r12 r12 r12 r12 r12 r12 r12)
+                        "sub        %5, %5, #2          \n"
+
+                        "vsli.64    d0, d1, #32         \n"// d0(r11 r11 r11 r11 r12 r12 r12 r12)
+
+                        "vmlal.s16  q8, d20, d6         \n"// (r02) * (k02-k32)
+                        "vmlal.s16  q9, d21, d7         \n"// (r10) * (k03-k33)
+                        
+                        "vmovl.s8   q0, d0              \n"// d0(r11 r11 r11 r11 ) d1(r12 r12 r12 r12)
+
+                        "vld1.s8    {d2[]}, [%6]!       \n"// d2(r20 r20 r20 r20 r20 r20 r20 r20)
+                        "vld1.s8    {d3[]}, [%6]!       \n"// d3(r21 r21 r21 r21 r21 r21 r21 r21)
+
+                        "vsli.64    d2, d3, #32         \n"// d2(r20 r20 r20 r20 r21 r21 r21 r21)
+
+                        "vmlal.s16  q8, d0, d8          \n"// (r11) * (k04-k34)
+                        "vmlal.s16  q9, d1, d9          \n"// (r12) * (k05-k35)     
+
+                        "vmovl.s8   q2, d2              \n"// d4(r20 r20 r20 r20) d5(r21 r21 r21 r21)
+
+                        "vld1.s8    {d0[]}, [%6]        \n"// d0(r22 r22 r22 r22 r22 r22 r22 r22)
+                        "sub        %6, %6, #2          \n"
+                        "veor       d1, d1, d1          \n"// d1 = 0
+
+                        "vld1.s8    {d6}, [%7]          \n"// d6 = k08-k38 xxxx
+                        "sub        %7, #32             \n"
+
+                        "vsli.64    d0, d1, #32         \n"// d0(r22 r22 r22 r22 0 0 0 0)
+                        "vmovl.s8   q4, d6              \n"// d8(k08-k38)
+                        "vmovl.s8   q0, d0              \n"// d0(r22 r22 r22 r22) d1(0 0 0 0)
+
+                        "vmlal.s16  q8, d4, d10         \n"// (r20) * (k06-k36)
+                        "vmlal.s16  q9, d5, d11         \n"// (r21) * (k07-k37)
+
+                        "vld1.s32   {d20[0]}, [%0]      \n"
+
+                        "vmlal.s16  q8, d0, d8          \n"// (r22) * (k08-k38)
+
+                        "vld1.s32   {d20[1]}, [%1]      \n"
+
+                        "vadd.s32   q8, q8, q9          \n"
+
+                        "vld1.s32   {d21[0]}, [%2]      \n"
+                        "vld1.s32   {d21[1]}, [%3]      \n"
+
+                        "vadd.s32   q10, q10, q8        \n"
+
+                        "vst1.s32   {d20[0]}, [%0]!     \n"
+                        "vst1.s32   {d20[1]}, [%1]!     \n"
+                        "vst1.s32   {d21[0]}, [%2]!     \n"
+                        "vst1.s32   {d21[1]}, [%3]!     \n"
+
+                        : "=r"(outptr0),    // %0
+                          "=r"(outptr1),    // %1
+                          "=r"(outptr2),    // %2
+                          "=r"(outptr3),    // %3
+                          "=r"(r0),         // %4
+                          "=r"(r1),         // %5
+                          "=r"(r2),         // %6
+                          "=r"(ktmp)        // %7
+                        : "0"(outptr0),
+                          "1"(outptr1),
+                          "2"(outptr2),
+                          "3"(outptr3),
+                          "4"(r0),
+                          "5"(r1),
+                          "6"(r2),
+                          "7"(ktmp)
+                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q8", "q9", "q10"
+                    );
+#else
+                    int sum0 = 0;
+                    int sum1 = 0;
+                    int sum2 = 0;
+                    int sum3 = 0;
+
+                    sum0 += r0[0] * ktmp[0];
+                    sum1 += r0[0] * ktmp[1];
+                    sum2 += r0[0] * ktmp[2];
+                    sum3 += r0[0] * ktmp[3];
+
+                    sum0 += r0[1] * ktmp[4];
+                    sum1 += r0[1] * ktmp[5];
+                    sum2 += r0[1] * ktmp[6];
+                    sum3 += r0[1] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += r0[2] * ktmp[0];
+                    sum1 += r0[2] * ktmp[1];
+                    sum2 += r0[2] * ktmp[2];
+                    sum3 += r0[2] * ktmp[3];
+
+                    sum0 += r1[0] * ktmp[4];
+                    sum1 += r1[0] * ktmp[5];
+                    sum2 += r1[0] * ktmp[6];
+                    sum3 += r1[0] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += r1[1] * ktmp[0];
+                    sum1 += r1[1] * ktmp[1];
+                    sum2 += r1[1] * ktmp[2];
+                    sum3 += r1[1] * ktmp[3];
+
+                    sum0 += r1[2] * ktmp[4];
+                    sum1 += r1[2] * ktmp[5];
+                    sum2 += r1[2] * ktmp[6];
+                    sum3 += r1[2] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += r2[0] * ktmp[0];
+                    sum1 += r2[0] * ktmp[1];
+                    sum2 += r2[0] * ktmp[2];
+                    sum3 += r2[0] * ktmp[3];
+
+                    sum0 += r2[1] * ktmp[4];
+                    sum1 += r2[1] * ktmp[5];
+                    sum2 += r2[1] * ktmp[6];
+                    sum3 += r2[1] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += r2[2] * ktmp[0];
+                    sum1 += r2[2] * ktmp[1];
+                    sum2 += r2[2] * ktmp[2];
+                    sum3 += r2[2] * ktmp[3];
+                    ktmp += 8;
+
+                    *outptr0 += sum0;
+                    *outptr1 += sum1;
+                    *outptr2 += sum2;
+                    *outptr3 += sum3;
+
+                    ktmp -= 8*5;
+
+                    outptr0++;
+                    outptr1++;
+                    outptr2++;
+                    outptr3++;
+#endif
+                    r0++;
+                    r1++;
+                    r2++;
+                }
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+            }
+
+            ktmp += 4*9;
+        }
+    }
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p=remain_outch_start; p<outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        const signed char* ktmp = _kernel.channel(p/4 + p%4);
+
+        for (int q=0; q<inch; q++)
+        {
+            int* outptr0 = out0;
+            int* outptr0n = outptr0 + outw;
+
+            const signed char* img0 = bottom_blob.channel(q);
+
+            const signed char* r0 = img0;
+            const signed char* r1 = img0 + w;
+            const signed char* r2 = img0 + w * 2;
+            const signed char* r3 = img0 + w * 3;
+
+            int i = 0;
+
+#if 0 //__ARM_NEON
+            int8x16_t _k0123456789x = vld1q_s8(ktmp);
+            int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k0123456789x));
+            int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k0123456789x));
+
+            int16x4_t _k0123 = vget_low_s16(_k_s16);
+            int16x4_t _k4567 = vget_high_s16(_k_s16);
+            int16x4_t _k8xxx = vget_low_s16(_kn_s16);
+#endif // __ARM_NEON
+
+            for (; i+1 < outh; i+=2)
+            {
+#if 0 //__ARM_NEON
+                int nn = outw >> 3;
+                int remain = outw & 7;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
+
+#if 0 //__ARM_NEON
+                for (; nn >0; nn--)
+                {
+                    // r0
+                    int8x8_t _r0 = vld1_s8(r0);
+                    int8x8_t _r0n = vld1_s8(r0+8);
+                    int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
+                    int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
+                    int16x8_t _r0_s16 = vmovl_s8(_r0);   // r00 - r07
+                    int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08 
+                    int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09
+
+                    int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r07) * k00
+                    int32x4_t _sum0n = vmull_lane_s16(vget_high_s16(_r0_s16), _k0123, 0);
+
+                    int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r08) * k01
+                    int32x4_t _sum1n = vmull_lane_s16(vget_high_s16(_r01_s16), _k0123, 1);
+
+                    int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r09) * k02
+                    int32x4_t _sum2n = vmull_lane_s16(vget_high_s16(_r02_s16), _k0123, 2);
+
+                    // r1
+                    int8x8_t _r1 = vld1_s8(r1);
+                    int8x8_t _r1n = vld1_s8(r1+8);
+                    int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
+                    int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
+                    int16x8_t _r1_s16 = vmovl_s8(_r1);   // r10 - r17
+                    int16x8_t _r11_s16 = vmovl_s8(_r11); // r11 - r18
+                    int16x8_t _r12_s16 = vmovl_s8(_r12); // r12 - r19
+
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r1_s16), _k0123, 3); // (r10 - r17) * k03
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r1_s16), _k0123, 3);
+
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r11_s16), _k4567, 0); // (r11 - r18) * k04
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r11_s16), _k4567, 0);
+
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r12_s16), _k4567, 1); // (r12 - r19) * k05
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r12_s16), _k4567, 1); 
+
+                    int32x4_t _sum4 = vmull_lane_s16(vget_low_s16(_r1_s16), _k0123, 0); // (r10 - r17) * k00
+                    int32x4_t _sum4n = vmull_lane_s16(vget_high_s16(_r1_s16), _k0123, 0);
+
+                    int32x4_t _sum5 = vmull_lane_s16(vget_low_s16(_r11_s16), _k0123, 1); // (r11 - r18) * k01
+                    int32x4_t _sum5n = vmull_lane_s16(vget_high_s16(_r11_s16), _k0123, 1);
+
+                    int32x4_t _sum6 = vmull_lane_s16(vget_low_s16(_r12_s16), _k0123, 2); // (r12 - r19) * k02
+                    int32x4_t _sum6n = vmull_lane_s16(vget_high_s16(_r12_s16), _k0123, 2);
+
+                    // r2
+                    int8x8_t _r2 = vld1_s8(r2);
+                    int8x8_t _r2n = vld1_s8(r2+8);
+                    int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
+                    int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
+                    int16x8_t _r2_s16 = vmovl_s8(_r2);   // r20 - r27
+                    int16x8_t _r21_s16 = vmovl_s8(_r21); // r21 - r28
+                    int16x8_t _r22_s16 = vmovl_s8(_r22); // r22 - r29
+
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r2_s16), _k4567, 2); // (r20 - r27) * k06
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r2_s16), _k4567, 2);
+
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r21_s16), _k4567, 3); // (r21 - r28) * k07
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r21_s16), _k4567, 3);
+
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r22_s16), _k8xxx, 0); // (r22 - r29) * k08
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r22_s16), _k8xxx, 0);
+
+                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r2_s16), _k0123, 3); // (r20 - r27) * k03
+                    _sum4n = vmlal_lane_s16(_sum4n, vget_high_s16(_r2_s16), _k0123, 3);
+
+                    _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r21_s16), _k4567, 0); // (r21 - r28) * k04
+                    _sum5n = vmlal_lane_s16(_sum5n, vget_high_s16(_r21_s16), _k4567, 0);
+
+                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r22_s16), _k4567, 1); // (r22 - r29) * k05
+                    _sum6n = vmlal_lane_s16(_sum6n, vget_high_s16(_r22_s16), _k4567, 1);
+
+                    // load output sum0 sum0n
+                    int32x4_t _out00 = vld1q_s32(outptr0);
+                    int32x4_t _out01 = vld1q_s32(outptr0+4);
+                    int32x4_t _out10 = vld1q_s32(outptr0n);
+                    int32x4_t _out11 = vld1q_s32(outptr0n+4);
+
+                    // r3
+                    int8x8_t _r3 = vld1_s8(r3);
+                    int8x8_t _r3n = vld1_s8(r3+8);
+                    int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
+                    int8x8_t _r32 = vext_s8(_r3, _r3n, 2);
+                    int16x8_t _r3_s16 = vmovl_s8(_r3);   // r30 - r37
+                    int16x8_t _r31_s16 = vmovl_s8(_r31); // r31 - r38
+                    int16x8_t _r32_s16 = vmovl_s8(_r32); // r32 - r39
+
+                    _sum0 = vaddq_s32(_sum0, _sum1);
+                    _sum0n = vaddq_s32(_sum0n, _sum1n);
+                    _sum2 = vaddq_s32(_sum2, _sum0);
+                    _sum2n = vaddq_s32(_sum2n, _sum0n);
+
+                    _out00 = vaddq_s32(_out00, _sum2);
+                    _out01 = vaddq_s32(_out01, _sum2n);
+
+                    vst1q_s32(outptr0, _out00);
+                    vst1q_s32(outptr0+4, _out01);
+
+                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r3_s16), _k4567, 2); // (r30 - r37) * k06
+                    _sum4n = vmlal_lane_s16(_sum4n, vget_high_s16(_r3_s16), _k4567, 2);
+
+                    _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r31_s16), _k4567, 3); // (r31 - r38) * k07
+                    _sum5n = vmlal_lane_s16(_sum5n, vget_high_s16(_r31_s16), _k4567, 3);
 
-            int8x8_t _k00 = vdup_n_s8(kernel0[0]);
-            int8x8_t _k01 = vdup_n_s8(kernel0[1]);
-            int8x8_t _k02 = vdup_n_s8(kernel0[2]);
-            int8x8_t _k03 = vdup_n_s8(kernel0[3]);
-            int8x8_t _k04 = vdup_n_s8(kernel0[4]);
-            int8x8_t _k05 = vdup_n_s8(kernel0[5]);
-            int8x8_t _k06 = vdup_n_s8(kernel0[6]);
-            int8x8_t _k07 = vdup_n_s8(kernel0[7]);
-            int8x8_t _k08 = vdup_n_s8(kernel0[8]);
+                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r32_s16), _k8xxx, 0); // (r32 - r39) * k08
+                    _sum6n = vmlal_lane_s16(_sum6n, vget_high_s16(_r32_s16), _k8xxx, 0); 
 
-            for (; i+1 < outh; i+=2)
-            {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+                    _sum4 = vaddq_s32(_sum4, _sum5);
+                    _sum4n = vaddq_s32(_sum4n, _sum5n);
+                    _sum6 = vaddq_s32(_sum6, _sum4);
+                    _sum6n = vaddq_s32(_sum6n, _sum4n);
+
+                    _out10 = vaddq_s32(_out10, _sum6);
+                    _out11 = vaddq_s32(_out11, _sum6n);
+
+                    vst1q_s32(outptr0n, _out10);
+                    vst1q_s32(outptr0n+4, _out11);
 
-                for (; nn > 0; nn--)
+                    r0 += 8;
+                    r1 += 8;
+                    r2 += 8;
+                    r3 += 8;
+                    outptr0 += 8;
+                    outptr0n += 8;
+                }
+#endif
+#if 0 //__ARM_NEON
+                if (remain >= 4)
                 {
+                    remain -= 4;
+
+                    // r0
                     int8x8_t _r0 = vld1_s8(r0);
                     int8x8_t _r0n = vld1_s8(r0+8);
                     int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
                     int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
+                    int16x8_t _r0_s16 = vmovl_s8(_r0);   // r00 - r07
+                    int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08
+                    int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09
 
-                    int16x8_t _sum0 = vmull_s8(_r0, _k00);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k01);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k02);
+                    int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r03) * k00
+                    int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r04) * k01
+                    int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r05) * k02
 
+                    // r1
                     int8x8_t _r1 = vld1_s8(r1);
                     int8x8_t _r1n = vld1_s8(r1+8);
                     int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
                     int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r1, _k03);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k04);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k05);
+                    int16x8_t _r1_s16 = vmovl_s8(_r1);   // r10 - r17
+                    int16x8_t _r11_s16 = vmovl_s8(_r11); // r11 - r18
+                    int16x8_t _r12_s16 = vmovl_s8(_r12); // r12 - r19
+
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r1_s16), _k0123, 3); // (r10 - r13) * k03
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r11_s16), _k4567, 0); // (r11 - r14) * k04
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r12_s16), _k4567, 1); // (r12 - r15) * k05
 
-                    int16x8_t _sum1 = vmull_s8(_r1, _k00);
-                    _sum1 = vmlal_s8(_sum1, _r11, _k01);
-                    _sum1 = vmlal_s8(_sum1, _r12, _k02);
+                    int32x4_t _sum4 = vmull_lane_s16(vget_low_s16(_r1_s16), _k0123, 0); // (r10 - r13) * k00
+                    int32x4_t _sum5 = vmull_lane_s16(vget_low_s16(_r11_s16), _k0123, 1); // (r11 - r14) * k01
+                    int32x4_t _sum6 = vmull_lane_s16(vget_low_s16(_r12_s16), _k0123, 2); // (r12 - r15) * k02
 
+                    // r2
                     int8x8_t _r2 = vld1_s8(r2);
                     int8x8_t _r2n = vld1_s8(r2+8);
                     int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
                     int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r2, _k06);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k07);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k08);
+                    int16x8_t _r2_s16 = vmovl_s8(_r2);   // r20 - r27
+                    int16x8_t _r21_s16 = vmovl_s8(_r21); // r21 - r28
+                    int16x8_t _r22_s16 = vmovl_s8(_r22); // r22 - r29
+
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r2_s16), _k4567, 2); // (r20 - r23) * k06
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r21_s16), _k4567, 3); // (r21 - r24) * k07
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r22_s16), _k8xxx, 0); // (r22 - r25) * k08
+
+                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r2_s16), _k0123, 3); // (r20 - r23) * k03
+                    _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r21_s16), _k4567, 0); // (r21 - r24) * k04
+                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r22_s16), _k4567, 1); // (r22 - r25) * k05
 
-                    _sum1 = vmlal_s8(_sum1, _r2, _k03);
-                    _sum1 = vmlal_s8(_sum1, _r21, _k04);
-                    _sum1 = vmlal_s8(_sum1, _r22, _k05);
+                    // load output sum0 sum0n
+                    int32x4_t _out00 = vld1q_s32(outptr0);
+                    int32x4_t _out10 = vld1q_s32(outptr0n);
 
+                    // r3
                     int8x8_t _r3 = vld1_s8(r3);
                     int8x8_t _r3n = vld1_s8(r3+8);
                     int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
                     int8x8_t _r32 = vext_s8(_r3, _r3n, 2);
-                    _sum1 = vmlal_s8(_sum1, _r3, _k06);
-                    _sum1 = vmlal_s8(_sum1, _r31, _k07);
-                    _sum1 = vmlal_s8(_sum1, _r32, _k08);
+                    int16x8_t _r3_s16 = vmovl_s8(_r3);   // r30 - r37
+                    int16x8_t _r31_s16 = vmovl_s8(_r31); // r31 - r38
+                    int16x8_t _r32_s16 = vmovl_s8(_r32); // r32 - r39
 
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    int32x4_t sum0n_s32 = vld1q_s32(outptr0+4);
+                    _sum0 = vaddq_s32(_sum0, _sum1);
+                    _sum2 = vaddq_s32(_sum2, _sum0);
+                    _out00 = vaddq_s32(_out00, _sum2);
 
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0)); 
+                    vst1q_s32(outptr0, _out00);
 
-                    vst1q_s32(outptr0, sum0_s32);
-                    vst1q_s32(outptr0+4, sum0n_s32);
+                    _sum4 = vmlal_lane_s16(_sum4, vget_low_s16(_r3_s16), _k4567, 2); // (r30 - r33) * k06
+                    _sum5 = vmlal_lane_s16(_sum5, vget_low_s16(_r31_s16), _k4567, 3); // (r31 - r34) * k07
+                    _sum6 = vmlal_lane_s16(_sum6, vget_low_s16(_r32_s16), _k8xxx, 0); // (r32 - r35) * k08
 
-                    int32x4_t sum1_s32 = vld1q_s32(outptr0n);
-                    int32x4_t sum1n_s32 = vld1q_s32(outptr0n+4);
+                    _sum4 = vaddq_s32(_sum4, _sum5);
+                    _sum6 = vaddq_s32(_sum6, _sum4);
 
-                    sum1_s32 = vaddw_s16(sum1_s32, vget_low_s16(_sum1));
-                    sum1n_s32 = vaddw_s16(sum1n_s32, vget_high_s16(_sum1));
+                    _out10 = vaddq_s32(_out10, _sum6);
 
-                    vst1q_s32(outptr0n, sum1_s32);
-                    vst1q_s32(outptr0n+4, sum1n_s32);
+                    vst1q_s32(outptr0n, _out10);
 
-                    r0 += 8;
-                    r1 += 8;
-                    r2 += 8;
-                    r3 += 8;
-                    outptr0 += 8;
-                    outptr0n += 8;
+                    r0 += 4;
+                    r1 += 4;
+                    r2 += 4;
+                    r3 += 4;
+                    outptr0 += 4;
+                    outptr0n += 4;
                 }
-
+#endif
                 for (; remain>0; remain--)
                 {
-                    // Todo neon
-                    int sum0 = 0;
-                    int sum0n = 0;
+#if 0 //__ARM_NEON
+                    asm volatile(
+                        "vld1.s8    {d0[0]}, [%2]!  \n"
+                        "vld1.s8    {d0[1]}, [%2]!  \n"
+                        "vld1.s8    {d0[2]}, [%2]   \n"
+                        "sub        %2, #2          \n"
 
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum0n += (int)r1[0] * kernel0[0];
-                    sum0n += (int)r1[1] * kernel0[1];
-                    sum0n += (int)r1[2] * kernel0[2];
-                    sum0n += (int)r2[0] * kernel0[3];
-                    sum0n += (int)r2[1] * kernel0[4];
-                    sum0n += (int)r2[2] * kernel0[5];
-                    sum0n += (int)r3[0] * kernel0[6];
-                    sum0n += (int)r3[1] * kernel0[7];
-                    sum0n += (int)r3[2] * kernel0[8];
+                        "vld1.s8    {d0[3]}, [%3]!  \n"
+                        "vld1.s8    {d0[4]}, [%3]!  \n"
+                        "vld1.s8    {d0[5]}, [%3]   \n"
+                        "sub        %3, #2          \n"
 
-                    *outptr0 += sum0;
-                    *outptr0n += sum0n;
+                        "vld1.s8    {d0[6]}, [%4]!  \n"
+                        "vld1.s8    {d0[7]}, [%4]!  \n"// d0(r00 r01 r02 r10 r11 r12 r20 r21)
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    r3++;
-                    outptr0++;
-                    outptr0n++;
-                }
+                        "vld1.s8    {d4[]}, [%4]    \n"// d4(r22 r22 r22 r22 r22 r22 r22 r22) 
+                        "sub        %4, #2          \n"
 
-                r0 += 2 + w;
-                r1 += 2 + w;
-                r2 += 2 + w;
-                r3 += 2 + w;
+                        "vext.s8    d1, d0, d4, #3  \n"// d1(r10 r11 r12 r20 r21 r22 r22 r22)
 
-                outptr0 += outw;
-                outptr0n += outw;
-            }
+                        "vld1.s8    {d1[6]}, [%5]!  \n"
+                        "vld1.s8    {d1[7]}, [%5]!  \n"// d1(r10 r11 r12 r20 r21 r22 r30 r31)
 
-            for (; i < outh; i++)
-            {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+                        "vld1.s8    {d2}, [%6]!     \n"// d2(k00 k01 k02 k10 k11 k12 k20 k21)
 
-                for (; nn > 0; nn--)
-                {
-                    int8x8_t _r0 = vld1_s8(r0);
-                    int8x8_t _r0n = vld1_s8(r0+8);
-                    int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
-                    int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
+                        "vld1.s8    {d5[]}, [%5]    \n"// d5(r32 r32 r32 r32 r32 r32 r32 r32)
+                        "sub        %5, #2          \n"
 
-                    int16x8_t _sum0 = vmull_s8(_r0, _k00);
-                    _sum0 = vmlal_s8(_sum0, _r01, _k01);
-                    _sum0 = vmlal_s8(_sum0, _r02, _k02);
+                        "veor       d3, d1, d1      \n"// d3(00 00 00 00 00 00 00 00)
 
-                    int8x8_t _r1 = vld1_s8(r1);
-                    int8x8_t _r1n = vld1_s8(r1+8);
-                    int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
-                    int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r1, _k03);
-                    _sum0 = vmlal_s8(_sum0, _r11, _k04);
-                    _sum0 = vmlal_s8(_sum0, _r12, _k05);
+                        "vmull.s8   q8, d0, d2      \n"// sum0 = (r00 - r21) * (k00 - k21)
+                        "vmull.s8   q9, d1, d2      \n"// sum1 = (r10 - r31) * (k00 - k21)
 
-                    int8x8_t _r2 = vld1_s8(r2);
-                    int8x8_t _r2n = vld1_s8(r2+8);
-                    int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
-                    int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                    _sum0 = vmlal_s8(_sum0, _r2, _k06);
-                    _sum0 = vmlal_s8(_sum0, _r21, _k07);
-                    _sum0 = vmlal_s8(_sum0, _r22, _k08);
+                        "vld1.s8    {d3[0]}, [%6]   \n"// d3(k22 00 00 00 00 00 00 00)
+                        "sub        %6, #8          \n"
 
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    int32x4_t sum0n_s32 = vld1q_s32(outptr0+4);
+                        "vmull.s8   q10, d4, d3     \n"// r22 * k22
+                        "vmull.s8   q11, d5, d3     \n"// r32 * k22
 
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum0));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum0));
+                        "vld1.s32   {d6[0]}, [%0]   \n"
 
-                    vst1q_s32(outptr0, sum0_s32);
-                    vst1q_s32(outptr0+4, sum0n_s32);
+                        "vaddl.s16  q10, d16, d18   \n"
+                        "vaddl.s16  q11, d18, d22   \n"
+                        "vaddw.s16  q10, q10, d17   \n"
+                        "vaddw.s16  q11, q11, d19   \n"
 
-                    r0 += 8;
-                    r1 += 8;
-                    r2 += 8;
-                    outptr0 += 8;
-                }
+                        "vld1.s32   {d6[1]}, [%1]   \n"
 
-                for (; remain>0; remain--)
-                {
+                        "vpadd.s32  d20, d20, d21   \n"
+                        "vpadd.s32  d22, d22, d23   \n"
+                        "vpadd.s32  d20, d20, d22   \n"
+                        "vadd.s32   d6, d6, d20     \n"
+
+                        "vst1.s32   {d6[0]}, [%0]!  \n"
+                        "vst1.s32   {d6[1]}, [%1]!  \n"
+
+                        : "=r"(outptr0),    // %0
+                          "=r"(outptr0n),   // %1
+                          "=r"(r0),         // %2
+                          "=r"(r1),         // %3
+                          "=r"(r2),         // %4
+                          "=r"(r3),         // %5
+                          "=r"(ktmp)        // %6
+                        : "0"(outptr0),
+                          "1"(outptr0n),
+                          "2"(r0),
+                          "3"(r1),
+                          "4"(r2),
+                          "5"(r3),
+                          "6"(ktmp)
+                        : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+                    );
+#else
                     int sum0 = 0;
+                    int sum0n = 0;
+
+                    sum0 += r0[0] * ktmp[0];
+                    sum0 += r0[1] * ktmp[1];
+                    sum0 += r0[2] * ktmp[2];
+                    sum0 += r1[0] * ktmp[3];
+                    sum0 += r1[1] * ktmp[4];
+                    sum0 += r1[2] * ktmp[5];
+                    sum0 += r2[0] * ktmp[6];
+                    sum0 += r2[1] * ktmp[7];
+                    sum0 += r2[2] * ktmp[8];
 
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
+                    sum0n += r1[0] * ktmp[0];
+                    sum0n += r1[1] * ktmp[1];
+                    sum0n += r1[2] * ktmp[2];
+                    sum0n += r2[0] * ktmp[3];
+                    sum0n += r2[1] * ktmp[4];
+                    sum0n += r2[2] * ktmp[5];
+                    sum0n += r3[0] * ktmp[6];
+                    sum0n += r3[1] * ktmp[7];
+                    sum0n += r3[2] * ktmp[8];
 
                     *outptr0 += sum0;
+                    *outptr0n += sum0n;
 
+                    outptr0++;
+                    outptr0n++;
+#endif
                     r0++;
                     r1++;
                     r2++;
-                    outptr0++;
-                }   
-
-                r0 += 2;
-                r1 += 2;
-                r2 += 2;
-            }           
-            kernel0 += 9;
-        }       
-    }
-}
-
-static void conv3x3s2_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
-{
-    int w = bottom_blob.w;
-    int inch = bottom_blob.c;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
-    int outch = top_blob.c;
-
-    const int tailstep = w - 2 * outw + w;
-
-    const signed char* kernel = _kernel;
-    
-    int nn_outch = outch >> 2;
-    int remain_outch_start = nn_outch << 2; 
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp < nn_outch; pp++)
-    {
-        int p = pp * 4;
-
-        Mat out0 = top_blob.channel(p);
-        Mat out1 = top_blob.channel(p + 1);
-        Mat out2 = top_blob.channel(p + 2);
-        Mat out3 = top_blob.channel(p + 3);       
-        
-        out0.fill(0.f);
-        out1.fill(0.f);
-        out2.fill(0.f);
-        out3.fill(0.f);
-
-        const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
-        const signed char* kernel1 = (const signed char*)kernel + (p + 1) * inch * 9;
-        const signed char* kernel2 = (const signed char*)kernel + (p + 2) * inch * 9;
-        const signed char* kernel3 = (const signed char*)kernel + (p + 3) * inch * 9;
-
-        for (int q=0; q<inch; q++)
-        {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
-            int* outptr2 = out2;
-            int* outptr3 = out3; 
-
-            const signed char* img0 = bottom_blob.channel(q);
-
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
+                    r3++;
+                }
 
-            int i = 0;
+                r0 += 2 + w;
+                r1 += 2 + w;
+                r2 += 2 + w;
+                r3 += 2 + w;
 
-            int8x16_t _k0 = vld1q_s8(kernel0);
-            int8x16_t _k1 = vld1q_s8(kernel1);
-            int8x16_t _k2 = vld1q_s8(kernel2);
-            int8x16_t _k3 = vld1q_s8(kernel3);
+                outptr0 += outw;
+                outptr0n += outw;
+            }
 
             for (; i < outh; i++)
             {
+#if 0 //__ARM_NEON
                 int nn = outw >> 3;
                 int remain = outw & 7;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
 
-                if (nn > 0)
-                {
-                asm volatile(
-                    "0:                                \n"
-                    // r0
-                    "prfm   pldl1keep, [%5, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%5], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%5]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b,  %16.b[0]           \n"
-                    "dup    v10.8b, %17.b[0]           \n"
-                    "dup    v11.8b, %18.b[0]           \n"
-                    "dup    v12.8b, %19.b[0]           \n"
-
-                    "smull  v13.8h, v4.8b, v9.8b       \n"
-                    "smull  v14.8h, v4.8b, v10.8b      \n"
-                    "smull  v15.8h, v4.8b, v11.8b      \n"
-                    "smull  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[1]            \n"
-                    "dup    v10.8b, %17.b[1]           \n"
-                    "dup    v11.8b, %18.b[1]           \n"
-                    "dup    v12.8b, %19.b[1]           \n"
-
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[2]            \n"
-                    "dup    v10.8b, %17.b[2]           \n"
-                    "dup    v11.8b, %18.b[2]           \n"
-                    "dup    v12.8b, %19.b[2]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // r1
-                    "prfm   pldl1keep, [%6, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%6], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%6]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b, %16.b[3]            \n"
-                    "dup    v10.8b, %17.b[3]           \n"
-                    "dup    v11.8b, %18.b[3]           \n"
-                    "dup    v12.8b, %19.b[3]           \n"
-
-                    "smlal  v13.8h, v4.8b, v9.8b       \n"
-                    "smlal  v14.8h, v4.8b, v10.8b      \n"
-                    "smlal  v15.8h, v4.8b, v11.8b      \n"
-                    "smlal  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[4]            \n"
-                    "dup    v10.8b, %17.b[4]           \n"
-                    "dup    v11.8b, %18.b[4]           \n"
-                    "dup    v12.8b, %19.b[4]           \n"
-
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[5]            \n"
-                    "dup    v10.8b, %17.b[5]           \n"
-                    "dup    v11.8b, %18.b[5]           \n"
-                    "dup    v12.8b, %19.b[5]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // r2
-                    "prfm   pldl1keep, [%7, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%7], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%7]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b, %16.b[6]            \n"
-                    "dup    v10.8b, %17.b[6]           \n"
-                    "dup    v11.8b, %18.b[6]           \n"
-                    "dup    v12.8b, %19.b[6]           \n"
-
-                    "smlal  v13.8h, v4.8b, v9.8b       \n"
-                    "smlal  v14.8h, v4.8b, v10.8b      \n"
-                    "smlal  v15.8h, v4.8b, v11.8b      \n"
-                    "smlal  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[7]            \n"
-                    "dup    v10.8b, %17.b[7]           \n"
-                    "dup    v11.8b, %18.b[7]           \n"
-                    "dup    v12.8b, %19.b[7]           \n"
-
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[8]            \n"
-                    "dup    v10.8b, %17.b[8]           \n"
-                    "dup    v11.8b, %18.b[8]           \n"
-                    "dup    v12.8b, %19.b[8]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // sum0 - sum3
-                    "prfm   pldl1keep, [%1, #128]      \n"
-                    "prfm   pldl1keep, [%2, #128]      \n"
-                    "prfm   pldl1keep, [%3, #128]      \n"
-                    "prfm   pldl1keep, [%4, #128]      \n"
-                    "ld1    {v17.4s, v18.4s}, [%1]     \n"
-                    "ld1    {v19.4s, v20.4s}, [%2]     \n"
-                    "ld1    {v21.4s, v22.4s}, [%3]     \n"
-                    "ld1    {v23.4s, v24.4s}, [%4]     \n"
-
-                    "saddw  v17.4s, v17.4s, v13.4h     \n"
-                    "saddw2  v18.4s, v18.4s, v13.8h    \n"
-                    "saddw  v19.4s, v19.4s, v14.4h     \n"
-                    "saddw2  v20.4s, v20.4s, v14.8h    \n"
-                    "saddw  v21.4s, v21.4s, v15.4h     \n"
-                    "saddw2  v22.4s, v22.4s, v15.8h    \n"
-                    "saddw  v23.4s, v23.4s, v16.4h     \n"
-                    "saddw2  v24.4s, v24.4s, v16.8h    \n"
-                    "st1    {v17.4s, v18.4s}, [%1], #32\n"
-                    "st1    {v19.4s, v20.4s}, [%2], #32\n"
-                    "st1    {v21.4s, v22.4s}, [%3], #32\n"
-                    "st1    {v23.4s, v24.4s}, [%4], #32\n"
-                    "subs   %w0, %w0, #1               \n"
-                    "bne    0b                         \n"
-                    : "=r"(nn),         //%0
-                      "=r"(outptr0),    //%1
-                      "=r"(outptr1),    //%2
-                      "=r"(outptr2),    //%3
-                      "=r"(outptr3),    //%4
-                      "=r"(r0),         //%5
-                      "=r"(r1),         //%6
-                      "=r"(r2)          //%7
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(outptr1),
-                      "3"(outptr2),
-                      "4"(outptr3),
-                      "5"(r0),
-                      "6"(r1),
-                      "7"(r2),
-                      "w"(_k0),         //%16
-                      "w"(_k1),         //%17
-                      "w"(_k2),         //%18
-                      "w"(_k3)          //%19
-                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
-                );
-                }
-
-                if (remain >= 4)
+#if 0 //__ARM_NEON
+                for (; nn >0; nn--)
                 {
-                    remain -= 4;
-
-                asm volatile(
                     // r0
-                    "prfm   pldl1keep, [%5, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%5], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%5]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b,  %16.b[0]           \n"
-                    "dup    v10.8b, %17.b[0]           \n"
-                    "dup    v11.8b, %18.b[0]           \n"
-                    "dup    v12.8b, %19.b[0]           \n"
-
-                    "smull  v13.8h, v4.8b, v9.8b       \n"
-                    "smull  v14.8h, v4.8b, v10.8b      \n"
-                    "smull  v15.8h, v4.8b, v11.8b      \n"
-                    "smull  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[1]            \n"
-                    "dup    v10.8b, %17.b[1]           \n"
-                    "dup    v11.8b, %18.b[1]           \n"
-                    "dup    v12.8b, %19.b[1]           \n"
-
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[2]            \n"
-                    "dup    v10.8b, %17.b[2]           \n"
-                    "dup    v11.8b, %18.b[2]           \n"
-                    "dup    v12.8b, %19.b[2]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // r1
-                    "prfm   pldl1keep, [%6, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%6], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%6]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b, %16.b[3]            \n"
-                    "dup    v10.8b, %17.b[3]           \n"
-                    "dup    v11.8b, %18.b[3]           \n"
-                    "dup    v12.8b, %19.b[3]           \n"
-
-                    "smlal  v13.8h, v4.8b, v9.8b       \n"
-                    "smlal  v14.8h, v4.8b, v10.8b      \n"
-                    "smlal  v15.8h, v4.8b, v11.8b      \n"
-                    "smlal  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[4]            \n"
-                    "dup    v10.8b, %17.b[4]           \n"
-                    "dup    v11.8b, %18.b[4]           \n"
-                    "dup    v12.8b, %19.b[4]           \n"
-
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[5]            \n"
-                    "dup    v10.8b, %17.b[5]           \n"
-                    "dup    v11.8b, %18.b[5]           \n"
-                    "dup    v12.8b, %19.b[5]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // r2
-                    "prfm   pldl1keep, [%7, #128]      \n"
-                    "ld2    {v4.8b, v5.8b}, [%7], #16  \n"
-                    "ld2    {v6.8b, v7.8b}, [%7]       \n"
-                    "ext    v8.8b, v4.8b, v6.8b, #1    \n"
-
-                    "dup    v9.8b, %16.b[6]            \n"
-                    "dup    v10.8b, %17.b[6]           \n"
-                    "dup    v11.8b, %18.b[6]           \n"
-                    "dup    v12.8b, %19.b[6]           \n"
-
-                    "smlal  v13.8h, v4.8b, v9.8b       \n"
-                    "smlal  v14.8h, v4.8b, v10.8b      \n"
-                    "smlal  v15.8h, v4.8b, v11.8b      \n"
-                    "smlal  v16.8h, v4.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[7]            \n"
-                    "dup    v10.8b, %17.b[7]           \n"
-                    "dup    v11.8b, %18.b[7]           \n"
-                    "dup    v12.8b, %19.b[7]           \n"
- 
-                    "smlal  v13.8h, v5.8b, v9.8b       \n"
-                    "smlal  v14.8h, v5.8b, v10.8b      \n"
-                    "smlal  v15.8h, v5.8b, v11.8b      \n"
-                    "smlal  v16.8h, v5.8b, v12.8b      \n"
-
-                    "dup    v9.8b, %16.b[8]            \n"
-                    "dup    v10.8b, %17.b[8]           \n"
-                    "dup    v11.8b, %18.b[8]           \n"
-                    "dup    v12.8b, %19.b[8]           \n"
-
-                    "smlal  v13.8h, v8.8b, v9.8b       \n"
-                    "smlal  v14.8h, v8.8b, v10.8b      \n"
-                    "smlal  v15.8h, v8.8b, v11.8b      \n"
-                    "smlal  v16.8h, v8.8b, v12.8b      \n"
-                    // sum0 - sum3
-                    "prfm   pldl1keep, [%1, #128]      \n"
-                    "prfm   pldl1keep, [%2, #128]      \n"
-                    "prfm   pldl1keep, [%3, #128]      \n"
-                    "prfm   pldl1keep, [%4, #128]      \n"
-                    "ld1    {v17.4s}, [%1]             \n"
-                    "ld1    {v19.4s}, [%2]             \n"
-                    "ld1    {v21.4s}, [%3]             \n"
-                    "ld1    {v23.4s}, [%4]             \n"
-
-                    "saddw  v17.4s, v17.4s, v13.4h     \n"
-                    "saddw  v19.4s, v19.4s, v14.4h     \n"
-                    "saddw  v21.4s, v21.4s, v15.4h     \n"
-                    "saddw  v23.4s, v23.4s, v16.4h     \n"
-
-                    "st1    {v17.4s}, [%1], #16        \n"
-                    "st1    {v19.4s}, [%2], #16        \n"
-                    "st1    {v21.4s}, [%3], #16        \n"
-                    "st1    {v23.4s}, [%4], #16        \n"
-                    "sub    %5, %5, #8                 \n"
-                    "sub    %6, %6, #8                 \n"
-                    "sub    %7, %7, #8                 \n"
-                    : "=r"(nn),         //%0
-                      "=r"(outptr0),    //%1
-                      "=r"(outptr1),    //%2
-                      "=r"(outptr2),    //%3
-                      "=r"(outptr3),    //%4
-                      "=r"(r0),         //%5
-                      "=r"(r1),         //%6
-                      "=r"(r2)          //%7
-                    : "0"(nn),
-                      "1"(outptr0),
-                      "2"(outptr1),
-                      "3"(outptr2),
-                      "4"(outptr3),
-                      "5"(r0),
-                      "6"(r1),
-                      "7"(r2),
-                      "w"(_k0),         //%16
-                      "w"(_k1),         //%17
-                      "w"(_k2),         //%18
-                      "w"(_k3)          //%19
-                    : "cc", "memory", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24"
-                );
-                }
-
-                for (; remain>0; remain--)
-                {
-                    int sum0 = 0;
-                    int sum1 = 0;
-                    int sum2 = 0;
-                    int sum3 = 0;
-
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
-
-                    sum2 += (int)r0[0] * kernel2[0];
-                    sum2 += (int)r0[1] * kernel2[1];
-                    sum2 += (int)r0[2] * kernel2[2];
-                    sum2 += (int)r1[0] * kernel2[3];
-                    sum2 += (int)r1[1] * kernel2[4];
-                    sum2 += (int)r1[2] * kernel2[5];
-                    sum2 += (int)r2[0] * kernel2[6];
-                    sum2 += (int)r2[1] * kernel2[7];
-                    sum2 += (int)r2[2] * kernel2[8];
-
-                    sum3 += (int)r0[0] * kernel3[0];
-                    sum3 += (int)r0[1] * kernel3[1];
-                    sum3 += (int)r0[2] * kernel3[2];
-                    sum3 += (int)r1[0] * kernel3[3];
-                    sum3 += (int)r1[1] * kernel3[4];
-                    sum3 += (int)r1[2] * kernel3[5];
-                    sum3 += (int)r2[0] * kernel3[6];
-                    sum3 += (int)r2[1] * kernel3[7];
-                    sum3 += (int)r2[2] * kernel3[8];
-
-                    *outptr0 += sum0;
-                    *outptr1 += sum1;
-                    *outptr2 += sum2;
-                    *outptr3 += sum3;
+                    int8x8_t _r0 = vld1_s8(r0);
+                    int8x8_t _r0n = vld1_s8(r0+8);
+                    int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
+                    int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
+                    int16x8_t _r0_s16 = vmovl_s8(_r0);   // r00 - r07
+                    int16x8_t _r01_s16 = vmovl_s8(_r01); // r01 - r08 
+                    int16x8_t _r02_s16 = vmovl_s8(_r02); // r02 - r09
 
-                    r0 += 2;
-                    r1 += 2;
-                    r2 += 2;
-                    outptr0++;
-                    outptr1++;
-                    outptr2++;
-                    outptr3++;
-                }       
+                    int32x4_t _sum0 = vmull_lane_s16(vget_low_s16(_r0_s16), _k0123, 0); // (r00 - r07) * k00
+                    int32x4_t _sum0n = vmull_lane_s16(vget_high_s16(_r0_s16), _k0123, 0);
 
-                r0 += tailstep;
-                r1 += tailstep;
-                r2 += tailstep;
-            }
+                    int32x4_t _sum1 = vmull_lane_s16(vget_low_s16(_r01_s16), _k0123, 1); // (r01 - r08) * k01
+                    int32x4_t _sum1n = vmull_lane_s16(vget_high_s16(_r01_s16), _k0123, 1);
 
-            kernel0 += 9;
-            kernel1 += 9;
-            kernel2 += 9;
-            kernel3 += 9;
-        }
-    }
+                    int32x4_t _sum2 = vmull_lane_s16(vget_low_s16(_r02_s16), _k0123, 2); // (r02 - r09) * k02
+                    int32x4_t _sum2n = vmull_lane_s16(vget_high_s16(_r02_s16), _k0123, 2);
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p=remain_outch_start; p<outch; p++)
-    {
-        Mat out0 = top_blob.channel(p);
+                    // r1
+                    int8x8_t _r1 = vld1_s8(r1);
+                    int8x8_t _r1n = vld1_s8(r1+8);
+                    int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
+                    int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
+                    int16x8_t _r1_s16 = vmovl_s8(_r1);   // r10 - r17
+                    int16x8_t _r11_s16 = vmovl_s8(_r11); // r11 - r18
+                    int16x8_t _r12_s16 = vmovl_s8(_r12); // r12 - r19
 
-        out0.fill(0.f);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r1_s16), _k0123, 3); // (r10 - r17) * k03
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r1_s16), _k0123, 3);
 
-        const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r11_s16), _k4567, 0); // (r11 - r18) * k04
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r11_s16), _k4567, 0);
 
-        for (int q=0; q<inch; q++)
-        {
-            int* outptr0 = out0;
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r12_s16), _k4567, 1); // (r12 - r19) * k05
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r12_s16), _k4567, 1);
 
-            const signed char* img0 = bottom_blob.channel(q);
+                    // r2
+                    int8x8_t _r2 = vld1_s8(r2);
+                    int8x8_t _r2n = vld1_s8(r2+8);
+                    int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
+                    int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
+                    int16x8_t _r2_s16 = vmovl_s8(_r2);   // r20 - r27
+                    int16x8_t _r21_s16 = vmovl_s8(_r21); // r21 - r28
+                    int16x8_t _r22_s16 = vmovl_s8(_r22); // r22 - r29
 
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_r2_s16), _k4567, 2); // (r20 - r27) * k06
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_r2_s16), _k4567, 2);
 
-            int i = 0;
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_r21_s16), _k4567, 3); // (r21 - r28) * k07
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_r21_s16), _k4567, 3);
 
-            int8x8_t _k0 = vdup_n_s8(kernel0[0]);
-            int8x8_t _k1 = vdup_n_s8(kernel0[1]);
-            int8x8_t _k2 = vdup_n_s8(kernel0[2]);
-            int8x8_t _k3 = vdup_n_s8(kernel0[3]);
-            int8x8_t _k4 = vdup_n_s8(kernel0[4]);
-            int8x8_t _k5 = vdup_n_s8(kernel0[5]);
-            int8x8_t _k6 = vdup_n_s8(kernel0[6]);
-            int8x8_t _k7 = vdup_n_s8(kernel0[7]);
-            int8x8_t _k8 = vdup_n_s8(kernel0[8]);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_r22_s16), _k8xxx, 0); // (r22 - r29) * k08
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_r22_s16), _k8xxx, 0); 
 
-            for (; i < outh; i++)
-            {  
-#if __ARM_NEON
-                int nn = outw >> 3;
-                int remain = outw & 7;
-#else
-                int remain = outw;
-#endif // __ARM_NEON
+                    // load output sum0 sum0n
+                    int32x4_t _out00 = vld1q_s32(outptr0);
+                    int32x4_t _out01 = vld1q_s32(outptr0+4);
 
-#if __ARM_NEON
-                for (; nn >0; nn--)
-                {
-                    int8x8x2_t _r0 = vld2_s8(r0);
-                    int8x8x2_t _r0n = vld2_s8(r0+16);
-                    int8x8_t _r00 = _r0.val[0];
-                    int8x8_t _r01 = _r0.val[1];
-                    int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1);
-
-                    int16x8_t _sum = vmull_s8(_r00, _k0);
-                    _sum = vmlal_s8(_sum, _r01, _k1);
-                    _sum = vmlal_s8(_sum, _r02, _k2);
-
-                    int8x8x2_t _r1 = vld2_s8(r1);
-                    int8x8x2_t _r1n = vld2_s8(r1+16);
-                    int8x8_t _r10 = _r1.val[0];
-                    int8x8_t _r11 = _r1.val[1];
-                    int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1);
-                    _sum = vmlal_s8(_sum, _r10, _k3);
-                    _sum = vmlal_s8(_sum, _r11, _k4);
-                    _sum = vmlal_s8(_sum, _r12, _k5);
-
-                    int8x8x2_t _r2 = vld2_s8(r2);
-                    int8x8x2_t _r2n = vld2_s8(r2+16);
-                    int8x8_t _r20 = _r2.val[0];
-                    int8x8_t _r21 = _r2.val[1];
-                    int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1);
-                    _sum = vmlal_s8(_sum, _r20, _k6);
-                    _sum = vmlal_s8(_sum, _r21, _k7);
-                    _sum = vmlal_s8(_sum, _r22, _k8);
-
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    int32x4_t sum0n_s32 = vld1q_s32(outptr0+4);
-
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum));
-                    sum0n_s32 = vaddw_s16(sum0n_s32, vget_high_s16(_sum));
-
-                    vst1q_s32(outptr0, sum0_s32);
-                    vst1q_s32(outptr0+4, sum0n_s32);
+                    _sum0 = vaddq_s32(_sum0, _sum1);
+                    _sum0n = vaddq_s32(_sum0n, _sum1n);
+                    _sum2 = vaddq_s32(_sum2, _sum0);
+                    _sum2n = vaddq_s32(_sum2n, _sum0n);
 
-                    r0 += 16;
-                    r1 += 16;
-                    r2 += 16;
-                    outptr0 += 8;
-                }
-#endif
-#if __ARM_NEON
-                if (remain >= 4)
-                {
-                    remain -= 4;
+                    _out00 = vaddq_s32(_out00, _sum2);
+                    _out01 = vaddq_s32(_out01, _sum2n);
 
-                    int8x8x2_t _r0 = vld2_s8(r0);
-                    int8x8x2_t _r0n = vld2_s8(r0+16);
-                    int8x8_t _r00 = _r0.val[0];
-                    int8x8_t _r01 = _r0.val[1];
-                    int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1);
-
-                    int16x8_t _sum = vmull_s8(_r00, _k0);
-                    _sum = vmlal_s8(_sum, _r01, _k1);
-                    _sum = vmlal_s8(_sum, _r02, _k2);
-
-                    int8x8x2_t _r1 = vld2_s8(r1);
-                    int8x8x2_t _r1n = vld2_s8(r1+16);
-                    int8x8_t _r10 = _r1.val[0];
-                    int8x8_t _r11 = _r1.val[1];
-                    int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1);
-                    _sum = vmlal_s8(_sum, _r10, _k3);
-                    _sum = vmlal_s8(_sum, _r11, _k4);
-                    _sum = vmlal_s8(_sum, _r12, _k5);
-
-                    int8x8x2_t _r2 = vld2_s8(r2);
-                    int8x8x2_t _r2n = vld2_s8(r2+16);
-                    int8x8_t _r20 = _r2.val[0];
-                    int8x8_t _r21 = _r2.val[1];
-                    int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1);
-                    _sum = vmlal_s8(_sum, _r20, _k6);
-                    _sum = vmlal_s8(_sum, _r21, _k7);
-                    _sum = vmlal_s8(_sum, _r22, _k8);
-
-                    int32x4_t sum0_s32 = vld1q_s32(outptr0);
-                    sum0_s32 = vaddw_s16(sum0_s32, vget_low_s16(_sum));
-                    vst1q_s32(outptr0, sum0_s32);
+                    vst1q_s32(outptr0, _out00);
+                    vst1q_s32(outptr0+4, _out01);
 
                     r0 += 8;
                     r1 += 8;
                     r2 += 8;
-                    outptr0 += 4;
-                }                
+                    r3 += 8;
+                    outptr0 += 8;
+                    outptr0n += 8;
+                }
 #endif
                 for (; remain>0; remain--)
                 {
                     int sum0 = 0;
-                    
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-                    
+
+                    sum0 += r0[0] * ktmp[0];
+                    sum0 += r0[1] * ktmp[1];
+                    sum0 += r0[2] * ktmp[2];
+                    sum0 += r1[0] * ktmp[3];
+                    sum0 += r1[1] * ktmp[4];
+                    sum0 += r1[2] * ktmp[5];
+                    sum0 += r2[0] * ktmp[6];
+                    sum0 += r2[1] * ktmp[7];
+                    sum0 += r2[2] * ktmp[8];
+
                     *outptr0 += sum0;
 
-                    r0 += 2;
-                    r1 += 2;
-                    r2 += 2;
+                    r0++;
+                    r1++;
+                    r2++;
                     outptr0++;
                 }
 
-                r0 += tailstep;
-                r1 += tailstep;
-                r2 += tailstep;
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
             }
 
-            kernel0 += 9;
-        }       
-    }   
+            ktmp += 9;
+        }
+    }
 }
-#else // __aarch64__
-static void conv3x3s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
+
+static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
 {
     int w = bottom_blob.w;
     int inch = bottom_blob.c;
@@ -1356,1207 +1748,1225 @@ static void conv3x3s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat
     int outh = top_blob.h;
     int outch = top_blob.c;
 
-    const signed char* kernel = _kernel;
+    const int tailstep = w - 2*outw + w;
 
-    int nn_outch = outch >> 1;
-    int remain_outch_start = nn_outch << 1; 
+    int nn_outch = outch >> 3;
+    int remain_outch_start = nn_outch << 3;
 
     #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp < nn_outch; pp++)
+    for (int pp=0; pp<nn_outch; pp++)
     {
-        int p = pp * 2;
+        int p = pp * 8;
 
-        Mat out0 = top_blob.channel(p);
+        Mat out0 = top_blob.channel(p+0);
         Mat out1 = top_blob.channel(p+1);
+        Mat out2 = top_blob.channel(p+2);
+        Mat out3 = top_blob.channel(p+3);
+        Mat out4 = top_blob.channel(p+4);
+        Mat out5 = top_blob.channel(p+5);
+        Mat out6 = top_blob.channel(p+6);
+        Mat out7 = top_blob.channel(p+7);
 
         out0.fill(0);
         out1.fill(0);
+        out2.fill(0);
+        out3.fill(0);
+        out4.fill(0);
+        out5.fill(0);
+        out6.fill(0);
+        out7.fill(0);
+
+        const signed char* ktmp = _kernel.channel(p/8);
 
-        const signed char* kernel0 = (const signed char *)kernel + p * inch * 9;
-        const signed char* kernel1 = (const signed char *)kernel + (p + 1) * inch * 9;
-        
         for (int q=0; q<inch; q++)
         {
             int* outptr0 = out0;
             int* outptr1 = out1;
-            int* outptr0n = outptr0 + outw;
-            int* outptr1n = outptr1 + outw;
-        
+            int* outptr2 = out2;
+            int* outptr3 = out3;
+            int* outptr4 = out4;
+            int* outptr5 = out5;
+            int* outptr6 = out6;
+            int* outptr7 = out7;
+
             const signed char* img0 = bottom_blob.channel(q);
 
             const signed char* r0 = img0;
             const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
-            const signed char* r3 = img0 + w * 3;
+            const signed char* r2 = img0 + w*2;
 
             int i = 0;
 
-            for (; i+1 < outh; i+=2)
+            for (; i < outh; i++)
             {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+#if __ARM_NEON
+                int nn = outw >> 2;
+                int remain = outw & 3;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
 
-                if (nn > 0)
-                {
-                    asm volatile(
-                        "vld1.8    {d26-d27}, [%0]    \n"
-                        "vld1.8    {d28-d29}, [%1]    \n"
-                        : "=r"(kernel0), // %0
-                          "=r"(kernel1)  // %1
-                        : "0"(kernel0),
-                          "1"(kernel1)
-                        : "cc", "memory"
-                    );
+#if __ARM_NEON
+                for (; nn>0; nn--)
+                {                  
+                    // load output ch 0-7
+                    int32x4_t _sum0 = vld1q_s32(outptr0);// out0
+                    int32x4_t _sum1 = vld1q_s32(outptr1);// out1
+                    int32x4_t _sum2 = vld1q_s32(outptr2);// out2
+                    int32x4_t _sum3 = vld1q_s32(outptr3);// out3
+                    int32x4_t _sum4 = vld1q_s32(outptr4);// out4
+                    int32x4_t _sum5 = vld1q_s32(outptr5);// out5
+                    int32x4_t _sum6 = vld1q_s32(outptr6);// out6
+                    int32x4_t _sum7 = vld1q_s32(outptr7);// out7
 
-                    asm volatile(
-                        "0:                             \n"
-                        "pld        [%5, #128]          \n"
-                        "vld1.32    {d0-d1}, [%5]       \n"// r0
-                        "add        %5, #8              \n"
-                        "vext.8     d2, d0, d1, #1      \n"
-                        "vext.8     d3, d0, d1, #2      \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"
-                        "vmull.s8   q2, d0, d1          \n"// k0
-                        "vmlal.s8   q2, d2, d30         \n"// k1
-                        "vmlal.s8   q2, d3, d31         \n"// k2
-                        
-                        "pld        [%6, #128]          \n"
-                        "vld1.32    {d6-d7}, [%6]       \n"// r1
-                        "add        %6, #8              \n"
-                        "vext.8     d8, d6, d7, #1      \n"
-                        "vext.8     d9, d6, d7, #2      \n"
-                        
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d6, d1          \n"// k3
-                        "vmlal.s8   q2, d8, d30         \n"// k4
-                        "vmlal.s8   q2, d9, d31         \n"// k5
-
-                        "pld        [%7, #128]          \n"
-                        "vld1.32    {d10-d11}, [%7]     \n"// r2
-                        "add        %7, #8              \n"
-                        "vext.8     d12, d10, d11, #1   \n"
-                        "vext.8     d13, d10, d11, #2   \n"
-                        
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"     
-                        "vmlal.s8   q2, d10, d1         \n"// k6
-                        "vmlal.s8   q2, d12, d30        \n"// k7
-                        "vmlal.s8   q2, d13, d31        \n"// k8
-                        
-                        "pld        [%8, #128]          \n"
-                        "vld1.32    {d14-d15}, [%8]     \n"// r3
-                        "add        %8, #8              \n"
-                        "vext.8     d16, d14, d15, #1   \n"
-                        "vext.8     d17, d14, d15, #2   \n"     
-                        
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d18-d21}, [%1]     \n"// sum0
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%1]!    \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"     
-                        "vmull.s8   q2, d6, d1          \n"// k0
-                        "vmlal.s8   q2, d8, d30         \n"// k1
-                        "vmlal.s8   q2, d9, d31         \n"// k2
-
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k3
-                        "vmlal.s8   q2, d12, d30        \n"// k4
-                        "vmlal.s8   q2, d13, d31        \n"// k5
-
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"
-                        "vmlal.s8   q2, d14, d1         \n"// k6
-                        "vmlal.s8   q2, d16, d30        \n"// k7
-                        "vmlal.s8   q2, d17, d31        \n"// k8
-
-                        "pld        [%2, #128]          \n"
-                        "vld1.32    {d18-d21}, [%2]     \n"// sum0n
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%2]!    \n"
-                        
-                        "vdup.s8     d1, d28[0]         \n"
-                        "vdup.s8    d30, d28[1]         \n"
-                        "vdup.s8    d31, d28[2]         \n"
-                        "vmull.s8   q2, d0, d1          \n"// k0n
-                        "vmlal.s8   q2, d2, d30         \n"// k1n
-                        "vmlal.s8   q2, d3, d31         \n"// k2n
-
-                        "vdup.s8     d1, d28[3]         \n"
-                        "vdup.s8    d30, d28[4]         \n"
-                        "vdup.s8    d31, d28[5]         \n"     
-                        "vmlal.s8   q2, d6, d1          \n"// k3n
-                        "vmlal.s8   q2, d8, d30         \n"// k4n
-                        "vmlal.s8   q2, d9, d31         \n"// k5n
-
-                        "vdup.s8     d1, d28[6]         \n"
-                        "vdup.s8    d30, d28[7]         \n"
-                        "vdup.s8    d31, d29[0]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k6n
-                        "vmlal.s8   q2, d12, d30        \n"// k7n
-                        "vmlal.s8   q2, d13, d31        \n"// k8n
-
-                        "pld        [%3, #128]          \n"
-                        "vld1.32    {d18-d21}, [%3]     \n"// sum1
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%3]!    \n"
-                        
-                        "vdup.s8     d1, d28[0]         \n"
-                        "vdup.s8    d30, d28[1]         \n"
-                        "vdup.s8    d31, d28[2]         \n"
-                        "vmull.s8   q2, d6, d1          \n"// k0n
-                        "vmlal.s8   q2, d8, d30         \n"// k1n
-                        "vmlal.s8   q2, d9, d31         \n"// k2n
-
-                        "vdup.s8     d1, d28[3]         \n"
-                        "vdup.s8    d30, d28[4]         \n"
-                        "vdup.s8    d31, d28[5]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k3n
-                        "vmlal.s8   q2, d12, d30        \n"// k4n
-                        "vmlal.s8   q2, d13, d31        \n"// k5n
-
-                        "vdup.s8     d1, d28[6]         \n"
-                        "vdup.s8    d30, d28[7]         \n"
-                        "vdup.s8    d31, d29[0]         \n"
-                        "vmlal.s8   q2, d14, d1         \n"// k6n
-                        "vmlal.s8   q2, d16, d30        \n"// k7n
-                        "vmlal.s8   q2, d17, d31        \n"// k8n
-
-                        "pld        [%4, #128]          \n"
-                        "vld1.32    {d18-d21}, [%4]     \n"// sum1n
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%4]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(outptr0n),       // %2
-                          "=r"(outptr1),        // %3
-                          "=r"(outptr1n),       // %4
-                          "=r"(r0),             // %5
-                          "=r"(r1),             // %6
-                          "=r"(r2),             // %7
-                          "=r"(r3)              // %8
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(outptr0n),
-                          "3"(outptr1),
-                          "4"(outptr1n),
-                          "5"(r0),
-                          "6"(r1),
-                          "7"(r2),
-                          "8"(r3)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q15"
-                    );
-                }
+                    // r0
+                    int8x8x2_t _r0_s8 = vld2_s8(r0);
+                    int8x8_t _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1);
+                    // k0 - k2
+                    int8x8_t _k0_8 = vld1_s8(ktmp);    //(k00-k70)
+                    int8x8_t _k1_8 = vld1_s8(ktmp+8);  //(k01-k71)
+                    int8x8_t _k2_8 = vld1_s8(ktmp+16); //(k02-k72)
+
+                    int16x8_t _r0 = vmovl_s8(_r0_s8.val[0]);
+                    int16x8_t _r1 = vmovl_s8(_r0_s8.val[1]);
+                    int16x8_t _r2 = vmovl_s8(_r2_s8);
+
+                    int16x8_t _k0 = vmovl_s8(_k0_8);
+                    int16x8_t _k1 = vmovl_s8(_k1_8);
+                    int16x8_t _k2 = vmovl_s8(_k2_8);
+                    // dot row 1 k0
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7);
+                    // dot row 1 k1
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7);
+                    // dot row 1 k2
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7);
+
+                    // r1
+                    _r0_s8 = vld2_s8(r1);
+                    _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1);
+                    // k3 - k5
+                    _k0_8 = vld1_s8(ktmp+24);    //(k03-k73)
+                    _k1_8 = vld1_s8(ktmp+32);  //(k04-k74)
+                    _k2_8 = vld1_s8(ktmp+40); //(k05-k75)
+
+                    _r0 = vmovl_s8(_r0_s8.val[0]);
+                    _r1 = vmovl_s8(_r0_s8.val[1]);
+                    _r2 = vmovl_s8(_r2_s8);
+
+                    _k0 = vmovl_s8(_k0_8);
+                    _k1 = vmovl_s8(_k1_8);
+                    _k2 = vmovl_s8(_k2_8);
+                    // dot row 2 k3
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7);
+                    // dot row 2 k4
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7);
+                    // dot row 2 k5
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7);
+
+                    // r2 
+                    _r0_s8 = vld2_s8(r2);
+                    _r2_s8 = vext_s8(_r0_s8.val[0], _r0_s8.val[0], 1);
+                    // k6 - k8
+                    _k0_8 = vld1_s8(ktmp+48); //(k06-k76)
+                    _k1_8 = vld1_s8(ktmp+56); //(k07-k77)
+                    _k2_8 = vld1_s8(ktmp+64); //(k08-k78)
+
+                    _r0 = vmovl_s8(_r0_s8.val[0]);
+                    _r1 = vmovl_s8(_r0_s8.val[1]);
+                    _r2 = vmovl_s8(_r2_s8);
+
+                    _k0 = vmovl_s8(_k0_8);
+                    _k1 = vmovl_s8(_k1_8);
+                    _k2 = vmovl_s8(_k2_8);
+                    // dot row 3 k6
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r0), _k0, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r0), _k0, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r0), _k0, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r0), _k0, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r0), _k0, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r0), _k0, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r0), _k0, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r0), _k0, 7);
+                    // dot row 3 k7
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r1), _k1, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r1), _k1, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r1), _k1, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r1), _k1, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r1), _k1, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r1), _k1, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r1), _k1, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r1), _k1, 7);
+                    // dot row 3 k8
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r2), _k2, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_r2), _k2, 1);
+                    _sum2 = vmlal_laneq_s16(_sum2, vget_low_s16(_r2), _k2, 2);
+                    _sum3 = vmlal_laneq_s16(_sum3, vget_low_s16(_r2), _k2, 3);
+                    _sum4 = vmlal_laneq_s16(_sum4, vget_low_s16(_r2), _k2, 4);
+                    _sum5 = vmlal_laneq_s16(_sum5, vget_low_s16(_r2), _k2, 5);
+                    _sum6 = vmlal_laneq_s16(_sum6, vget_low_s16(_r2), _k2, 6);
+                    _sum7 = vmlal_laneq_s16(_sum7, vget_low_s16(_r2), _k2, 7);
+
+                    // save s32 to memory
+                    vst1q_s32(outptr0, _sum0);
+                    vst1q_s32(outptr1, _sum1);
+                    vst1q_s32(outptr2, _sum2);
+                    vst1q_s32(outptr3, _sum3);
+                    vst1q_s32(outptr4, _sum4);
+                    vst1q_s32(outptr5, _sum5);
+                    vst1q_s32(outptr6, _sum6);
+                    vst1q_s32(outptr7, _sum7);
+               
+                    r0 += 8;
+                    r1 += 8;
+                    r2 += 8;
 
+                    outptr0 += 4;
+                    outptr1 += 4;
+                    outptr2 += 4;
+                    outptr3 += 4;
+                    outptr4 += 4;
+                    outptr5 += 4;
+                    outptr6 += 4;
+                    outptr7 += 4;
+                }
+                
+#endif // __ARM_NEON
                 for (; remain>0; remain--)
                 {
-                    int sum0 = 0;
-                    int sum0n = 0;
-                    int sum1 = 0;
-                    int sum1n = 0;
-
-                    //ToDo Neon
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
-
-                    sum0n += (int)r1[0] * kernel0[0];
-                    sum0n += (int)r1[1] * kernel0[1];
-                    sum0n += (int)r1[2] * kernel0[2];
-                    sum0n += (int)r2[0] * kernel0[3];
-                    sum0n += (int)r2[1] * kernel0[4];
-                    sum0n += (int)r2[2] * kernel0[5];
-                    sum0n += (int)r3[0] * kernel0[6];
-                    sum0n += (int)r3[1] * kernel0[7];
-                    sum0n += (int)r3[2] * kernel0[8];
-
-                    sum1n += (int)r1[0] * kernel1[0];
-                    sum1n += (int)r1[1] * kernel1[1];
-                    sum1n += (int)r1[2] * kernel1[2];
-                    sum1n += (int)r2[0] * kernel1[3];
-                    sum1n += (int)r2[1] * kernel1[4];
-                    sum1n += (int)r2[2] * kernel1[5];
-                    sum1n += (int)r3[0] * kernel1[6];
-                    sum1n += (int)r3[1] * kernel1[7];
-                    sum1n += (int)r3[2] * kernel1[8];
+#if __ARM_NEON
+                    int8x8_t _r0_s8 = vld1_s8(r0);// (a00 a01 a02 ....)
+                    int8x8_t _r1_s8 = vld1_s8(r1);// (a10 a11 a12 ....)
+                    int8x8_t _r2_s8 = vld1_s8(r2);// (a20 a21 a22 ....)
+
+                    int16x8_t _r0 = vmovl_s8(_r0_s8);
+                    int16x8_t _r1 = vmovl_s8(_r1_s8);
+                    int16x8_t _r2 = vmovl_s8(_r2_s8);
+
+                    int32x4_t _sum03, _sum47;
+                    _sum03 = vld1q_lane_s32(outptr0, _sum03, 0);// out0
+                    _sum03 = vld1q_lane_s32(outptr1, _sum03, 1);// out1
+                    _sum03 = vld1q_lane_s32(outptr2, _sum03, 2);// out2
+                    _sum03 = vld1q_lane_s32(outptr3, _sum03, 3);// out3
+                    _sum47 = vld1q_lane_s32(outptr4, _sum47, 0);// out4
+                    _sum47 = vld1q_lane_s32(outptr5, _sum47, 1);// out5
+                    _sum47 = vld1q_lane_s32(outptr6, _sum47, 2);// out6
+                    _sum47 = vld1q_lane_s32(outptr7, _sum47, 3);// out7
+
+                    // k0 - k2
+                    int8x8_t _k0_8 = vld1_s8(ktmp);    //(k00-k70)
+                    int8x8_t _k1_8 = vld1_s8(ktmp+8);  //(k01-k71)
+                    int8x8_t _k2_8 = vld1_s8(ktmp+16); //(k02-k72)
+
+                    int16x8_t _k0 = vmovl_s8(_k0_8);
+                    int16x8_t _k1 = vmovl_s8(_k1_8);
+                    int16x8_t _k2 = vmovl_s8(_k2_8);
+
+                    int32x4_t _sum0 = vmull_laneq_s16(vget_low_s16(_k0), _r0, 0);
+                    int32x4_t _sum0n = vmull_laneq_s16(vget_high_s16(_k0), _r0, 0);
+                    int32x4_t _sum1 = vmull_laneq_s16(vget_low_s16(_k1), _r0, 1);
+                    int32x4_t _sum1n = vmull_laneq_s16(vget_high_s16(_k1), _r0, 1);
+                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r0, 2);
+                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r0, 2);
+
+                    // k3 - k5
+                    _k0_8 = vld1_s8(ktmp+24); //(k03-k73)
+                    _k1_8 = vld1_s8(ktmp+32); //(k04-k74)
+                    _k2_8 = vld1_s8(ktmp+40); //(k05-k75)
+
+                    _k0 = vmovl_s8(_k0_8);
+                    _k1 = vmovl_s8(_k1_8);
+                    _k2 = vmovl_s8(_k2_8);
+
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r1, 0);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r1, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r1, 1);
+                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r1, 1);
+                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r1, 2);
+                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r1, 2);
+
+                    // k6 - k8
+                    _k0_8 = vld1_s8(ktmp+48); //(k06-k76)
+                    _k1_8 = vld1_s8(ktmp+56); //(k07-k77)
+                    _k2_8 = vld1_s8(ktmp+64); //(k08-k78)
+
+                    _k0 = vmovl_s8(_k0_8);
+                    _k1 = vmovl_s8(_k1_8);
+                    _k2 = vmovl_s8(_k2_8);
+
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_k0), _r2, 0);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_k0), _r2, 0);
+                    _sum1 = vmlal_laneq_s16(_sum1, vget_low_s16(_k1), _r2, 1);
+                    _sum1n = vmlal_laneq_s16(_sum1n, vget_high_s16(_k1), _r2, 1);
+                    _sum03 = vmlal_laneq_s16(_sum03, vget_low_s16(_k2), _r2, 2);
+                    _sum47 = vmlal_laneq_s16(_sum47, vget_high_s16(_k2), _r2, 2);
 
-                    *outptr0 += sum0;
-                    *outptr1 += sum1;
-                    *outptr0n += sum0n;
-                    *outptr1n += sum1n;
+                    _sum0 = vaddq_s32(_sum0, _sum1);
+                    _sum0n = vaddq_s32(_sum0n, _sum1n);
+                    _sum03 = vaddq_s32(_sum03, _sum0);
+                    _sum47 = vaddq_s32(_sum47, _sum0n);
+
+                    vst1q_lane_s32(outptr0, _sum03, 0);
+                    vst1q_lane_s32(outptr1, _sum03, 1);
+                    vst1q_lane_s32(outptr2, _sum03, 2);
+                    vst1q_lane_s32(outptr3, _sum03, 3);
+                    vst1q_lane_s32(outptr4, _sum47, 0);
+                    vst1q_lane_s32(outptr5, _sum47, 1);
+                    vst1q_lane_s32(outptr6, _sum47, 2);
+                    vst1q_lane_s32(outptr7, _sum47, 3);
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    r3++;
                     outptr0++;
                     outptr1++;
-                    outptr0n++;
-                    outptr1n++;
-                }
+                    outptr2++;
+                    outptr3++;
+                    outptr4++;
+                    outptr5++;
+                    outptr6++;
+                    outptr7++;
+#else // __ARM_NEON
+                    int sum0 = 0;
+                    int sum1 = 0;
+                    int sum2 = 0;
+                    int sum3 = 0;
+                    int sum4 = 0;
+                    int sum5 = 0;
+                    int sum6 = 0;
+                    int sum7 = 0;
 
-                r0 += 2 + w;
-                r1 += 2 + w;
-                r2 += 2 + w;
-                r3 += 2 + w;
+                    sum0 += (int)r0[0] * ktmp[0];
+                    sum1 += (int)r0[0] * ktmp[1];
+                    sum2 += (int)r0[0] * ktmp[2];
+                    sum3 += (int)r0[0] * ktmp[3];
+                    sum4 += (int)r0[0] * ktmp[4];
+                    sum5 += (int)r0[0] * ktmp[5];
+                    sum6 += (int)r0[0] * ktmp[6];
+                    sum7 += (int)r0[0] * ktmp[7];
+                    ktmp += 8;
 
-                outptr0 += outw;
-                outptr1 += outw;
-                outptr0n += outw;
-                outptr1n += outw;
-            }
+                    sum0 += (int)r0[1] * ktmp[0];
+                    sum1 += (int)r0[1] * ktmp[1];
+                    sum2 += (int)r0[1] * ktmp[2];
+                    sum3 += (int)r0[1] * ktmp[3];
+                    sum4 += (int)r0[1] * ktmp[4];
+                    sum5 += (int)r0[1] * ktmp[5];
+                    sum6 += (int)r0[1] * ktmp[6];
+                    sum7 += (int)r0[1] * ktmp[7];
+                    ktmp += 8;
 
-            for (; i < outh; i++)
-            {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+                    sum0 += (int)r0[2] * ktmp[0];
+                    sum1 += (int)r0[2] * ktmp[1];
+                    sum2 += (int)r0[2] * ktmp[2];
+                    sum3 += (int)r0[2] * ktmp[3];
+                    sum4 += (int)r0[2] * ktmp[4];
+                    sum5 += (int)r0[2] * ktmp[5];
+                    sum6 += (int)r0[2] * ktmp[6];
+                    sum7 += (int)r0[2] * ktmp[7];
+                    ktmp += 8;
 
-                if (nn > 0)
-                {
-                    asm volatile(
-                        "vld1.8    {d26-d27}, [%0]    \n"
-                        "vld1.8    {d28-d29}, [%1]    \n"
-                        : "=r"(kernel0), // %0
-                          "=r"(kernel1)  // %1
-                        : "0"(kernel0),
-                          "1"(kernel1)
-                        : "cc", "memory"
-                    );
+                    sum0 += (int)r1[0] * ktmp[0];
+                    sum1 += (int)r1[0] * ktmp[1];
+                    sum2 += (int)r1[0] * ktmp[2];
+                    sum3 += (int)r1[0] * ktmp[3];
+                    sum4 += (int)r1[0] * ktmp[4];
+                    sum5 += (int)r1[0] * ktmp[5];
+                    sum6 += (int)r1[0] * ktmp[6];
+                    sum7 += (int)r1[0] * ktmp[7];
+                    ktmp += 8;
 
-                    asm volatile(
-                        "0:                             \n"
-                        "pld        [%3, #128]          \n"
-                        "vld1.32    {d0-d1}, [%3]       \n"// r0
-                        "add        %3, #8              \n"
-                        "vext.8     d2, d0, d1, #1      \n"
-                        "vext.8     d3, d0, d1, #2      \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"
-                        "vmull.s8   q2, d0, d1          \n"// k0
-                        "vmlal.s8   q2, d2, d30         \n"// k1
-                        "vmlal.s8   q2, d3, d31         \n"// k2
-                        
-                        "pld        [%4, #128]          \n"
-                        "vld1.32    {d6-d7}, [%4]       \n"// r1
-                        "add        %4, #8              \n"
-                        "vext.8     d8, d6, d7, #1      \n"
-                        "vext.8     d9, d6, d7, #2      \n"
-                        
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d6, d1          \n"// k3
-                        "vmlal.s8   q2, d8, d30         \n"// k4
-                        "vmlal.s8   q2, d9, d31         \n"// k5
-
-                        "pld        [%5, #128]          \n"
-                        "vld1.32    {d10-d11}, [%5]     \n"// r2
-                        "add        %5, #8              \n"
-                        "vext.8     d12, d10, d11, #1   \n"
-                        "vext.8     d13, d10, d11, #2   \n"
-                        
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k6
-                        "vmlal.s8   q2, d12, d30        \n"// k7
-                        "vmlal.s8   q2, d13, d31        \n"// k8
-                        
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d18-d21}, [%1]     \n"// sum0
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%1]!    \n"
-                        
-                        "vdup.s8     d1, d28[0]         \n"
-                        "vdup.s8     d7, d28[1]         \n"
-                        "vdup.s8    d11, d28[2]         \n" 
-                        "vmull.s8   q2, d0, d1          \n"// k0n
-                        "vmlal.s8   q2, d2, d7          \n"// k1n
-                        "vmlal.s8   q2, d3, d11         \n"// k2n
-
-                        "vdup.s8     d1, d28[3]         \n"
-                        "vdup.s8     d7, d28[4]         \n"
-                        "vdup.s8    d11, d28[5]         \n"
-                        "vmlal.s8   q2, d6, d1          \n"// k3n
-                        "vmlal.s8   q2, d8, d7          \n"// k4n
-                        "vmlal.s8   q2, d9, d11         \n"// k5n
-
-                        "vdup.s8     d1, d28[6]         \n"
-                        "vdup.s8     d7, d28[7]         \n"
-                        "vdup.s8    d11, d29[0]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k6n
-                        "vmlal.s8   q2, d12, d7         \n"// k7n
-                        "vmlal.s8   q2, d13, d11        \n"// k8n
-
-                        "pld        [%2, #128]          \n"
-                        "vld1.32    {d18-d21}, [%2]     \n"// sum1
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%2]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(outptr1),        // %2
-                          "=r"(r0),             // %3
-                          "=r"(r1),             // %4
-                          "=r"(r2)              // %5
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(outptr1),
-                          "3"(r0),
-                          "4"(r1),
-                          "5"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-                    );
-                }
+                    sum0 += (int)r1[1] * ktmp[0];
+                    sum1 += (int)r1[1] * ktmp[1];
+                    sum2 += (int)r1[1] * ktmp[2];
+                    sum3 += (int)r1[1] * ktmp[3];
+                    sum4 += (int)r1[1] * ktmp[4];
+                    sum5 += (int)r1[1] * ktmp[5];
+                    sum6 += (int)r1[1] * ktmp[6];
+                    sum7 += (int)r1[1] * ktmp[7];
+                    ktmp += 8;
 
-                for (; remain>0; remain--)
-                {
-                    int sum0 = 0;
-                    int sum1 = 0;
+                    sum0 += (int)r1[2] * ktmp[0];
+                    sum1 += (int)r1[2] * ktmp[1];
+                    sum2 += (int)r1[2] * ktmp[2];
+                    sum3 += (int)r1[2] * ktmp[3];
+                    sum4 += (int)r1[2] * ktmp[4];
+                    sum5 += (int)r1[2] * ktmp[5];
+                    sum6 += (int)r1[2] * ktmp[6];
+                    sum7 += (int)r1[2] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += (int)r2[0] * ktmp[0];
+                    sum1 += (int)r2[0] * ktmp[1];
+                    sum2 += (int)r2[0] * ktmp[2];
+                    sum3 += (int)r2[0] * ktmp[3];
+                    sum4 += (int)r2[0] * ktmp[4];
+                    sum5 += (int)r2[0] * ktmp[5];
+                    sum6 += (int)r2[0] * ktmp[6];
+                    sum7 += (int)r2[0] * ktmp[7];
+                    ktmp += 8;
 
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
+                    sum0 += (int)r2[1] * ktmp[0];
+                    sum1 += (int)r2[1] * ktmp[1];
+                    sum2 += (int)r2[1] * ktmp[2];
+                    sum3 += (int)r2[1] * ktmp[3];
+                    sum4 += (int)r2[1] * ktmp[4];
+                    sum5 += (int)r2[1] * ktmp[5];
+                    sum6 += (int)r2[1] * ktmp[6];
+                    sum7 += (int)r2[1] * ktmp[7];
+                    ktmp += 8;
+
+                    sum0 += (int)r2[2] * ktmp[0];
+                    sum1 += (int)r2[2] * ktmp[1];
+                    sum2 += (int)r2[2] * ktmp[2];
+                    sum3 += (int)r2[2] * ktmp[3];
+                    sum4 += (int)r2[2] * ktmp[4];
+                    sum5 += (int)r2[2] * ktmp[5];
+                    sum6 += (int)r2[2] * ktmp[6];
+                    sum7 += (int)r2[2] * ktmp[7];
+                    ktmp += 8;
 
                     *outptr0 += sum0;
                     *outptr1 += sum1;
+                    *outptr2 += sum2;
+                    *outptr3 += sum3;
+                    *outptr4 += sum4;
+                    *outptr5 += sum5;
+                    *outptr6 += sum6;
+                    *outptr7 += sum7;
+
+                    ktmp -= 8*9;
 
-                    r0++;
-                    r1++;
-                    r2++;
                     outptr0++;
                     outptr1++;
+                    outptr2++;
+                    outptr3++;
+                    outptr4++;
+                    outptr5++;
+                    outptr6++;
+                    outptr7++;
+#endif // __ARM_NEON
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
                 }
 
-                r0 += 2;
-                r1 += 2;
-                r2 += 2;
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
             }
 
-            kernel0 += 9;
-            kernel1 += 9;
+            ktmp += 8*9;
         }
     }
 
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int p=remain_outch_start; p<outch; p++)
     {
-        Mat out0 = top_blob.channel(p);
+        Mat out = top_blob.channel(p);
 
-        out0.fill(0);
+        out.fill(0);
 
-        const signed char* kernel0 = (const signed char *)kernel + p * inch * 9;
+        const signed char* ktmp = _kernel.channel(p/8 + p%8);
 
         for (int q=0; q<inch; q++)
-        {                   
-            int* outptr0 = out0;
-            int* outptr0n = outptr0 + outw;
-        
+        {
+            int* outptr = out;
+
             const signed char* img0 = bottom_blob.channel(q);
-            
+
             const signed char* r0 = img0;
             const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
-            const signed char* r3 = img0 + w * 3;
+            const signed char* r2 = img0 + w*2;
 
             int i = 0;
 
-            for (; i+1 < outh; i+=2)
+            for (; i < outh; i++)
             {
+#if __ARM_NEON
                 int nn = outw >> 3;
                 int remain = outw & 7;
+#else
+                int remain = outw;
+#endif // __ARM_NEON
 
-                if (nn > 0)
-                {
-                    asm volatile(
-                        "vld1.8    {d26-d27}, [%0]    \n"
-                        : "=r"(kernel0) // %0
-                        : "0"(kernel0)
-                        : "cc", "memory"
-                    );
-
-                    asm volatile(
-                        "0:                             \n"
-                        "pld        [%3, #128]          \n"
-                        "vld1.32    {d0-d1}, [%3]       \n"// r0
-                        "add        %3, #8              \n"
-                        "vext.8     d2, d0, d1, #1      \n"
-                        "vext.8     d3, d0, d1, #2      \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"
-                        "vmull.s8   q2, d0, d1          \n"// k0
-                        "vmlal.s8   q2, d2, d30         \n"// k1
-                        "vmlal.s8   q2, d3, d31         \n"// k2
-                        
-                        "pld        [%4, #128]          \n"
-                        "vld1.32    {d6-d7}, [%4]       \n"// r1
-                        "add        %4, #8              \n"
-                        "vext.8     d8, d6, d7, #1      \n"
-                        "vext.8     d9, d6, d7, #2      \n"
-                        
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d6, d1          \n"// k3
-                        "vmlal.s8   q2, d8, d30         \n"// k4
-                        "vmlal.s8   q2, d9, d31         \n"// k5
-
-                        "pld        [%5, #128]          \n"
-                        "vld1.32    {d10-d11}, [%5]     \n"// r2
-                        "add        %5, #8              \n"
-                        "vext.8     d12, d10, d11, #1   \n"
-                        "vext.8     d13, d10, d11, #2   \n"
-                        
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k6
-                        "vmlal.s8   q2, d12, d30        \n"// k7
-                        "vmlal.s8   q2, d13, d31        \n"// k8
-                        
-                        "pld        [%6, #128]          \n"
-                        "vld1.32    {d14-d15}, [%6]     \n"// r3
-                        "add        %6, #8              \n"
-                        "vext.8     d16, d14, d15, #1   \n"
-                        "vext.8     d17, d14, d15, #2   \n"
-                        
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d18-d21}, [%1]     \n"// sum0
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%1]!    \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"
-                        "vmull.s8   q2, d6, d1          \n"// k0
-                        "vmlal.s8   q2, d8, d30         \n"// k1
-                        "vmlal.s8   q2, d9, d31         \n"// k2
-
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k3
-                        "vmlal.s8   q2, d12, d30        \n"// k4
-                        "vmlal.s8   q2, d13, d31        \n"// k5
-
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"
-                        "vmlal.s8   q2, d14, d1         \n"// k6
-                        "vmlal.s8   q2, d16, d30        \n"// k7
-                        "vmlal.s8   q2, d17, d31        \n"// k8
-
-                        "pld        [%2, #128]          \n"
-                        "vld1.32    {d18-d21}, [%2]     \n"// sum0n
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%2]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(outptr0n),       // %2
-                          "=r"(r0),             // %3
-                          "=r"(r1),             // %4
-                          "=r"(r2),             // %5
-                          "=r"(r3)              // %6
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(outptr0n),
-                          "3"(r0),
-                          "4"(r1),
-                          "5"(r2),
-                          "6"(r3)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-                    );
-                }
-
-                for (; remain>0; remain--)
-                {
-                    //Todo Neon
-
-                    int sum0 = 0;
-                    int sum0n = 0;
-
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum0n += (int)r1[0] * kernel0[0];
-                    sum0n += (int)r1[1] * kernel0[1];
-                    sum0n += (int)r1[2] * kernel0[2];
-                    sum0n += (int)r2[0] * kernel0[3];
-                    sum0n += (int)r2[1] * kernel0[4];
-                    sum0n += (int)r2[2] * kernel0[5];
-                    sum0n += (int)r3[0] * kernel0[6];
-                    sum0n += (int)r3[1] * kernel0[7];
-                    sum0n += (int)r3[2] * kernel0[8];
+#if __ARM_NEON
+                for (; nn>0; nn--)
+                {                  
+                    // load output ch 0
+                    int32x4_t _sum0 = vld1q_s32(outptr);// out0
+                    int32x4_t _sum0n = vld1q_s32(outptr+4);// out0n
+
+                    int8x8x2_t _r0_s8 = vld2_s8(r0); 
+                    int8x8x2_t _r0n_s8 = vld2_s8(r0+16);
+
+                    int8x8x2_t _r1_s8 = vld2_s8(r1);
+                    int8x8x2_t _r1n_s8 = vld2_s8(r1+16);
+
+                    int8x8x2_t _r2_s8 = vld2_s8(r2);
+                    int8x8x2_t _r2n_s8 = vld2_s8(r2+16);
+
+                    int8x8_t _r02_s8 = vext_s8(_r0_s8.val[0], _r0n_s8.val[0], 1);
+                    int8x8_t _r12_s8 = vext_s8(_r1_s8.val[0], _r1n_s8.val[0], 1);
+                    int8x8_t _r22_s8 = vext_s8(_r2_s8.val[0], _r2n_s8.val[0], 1);
+
+                    int16x8_t _r00 = vmovl_s8(_r0_s8.val[0]); // r00
+                    int16x8_t _r01 = vmovl_s8(_r0_s8.val[1]); // r01
+                    int16x8_t _r02 = vmovl_s8(_r02_s8);       // r02
+
+                    int16x8_t _r10 = vmovl_s8(_r1_s8.val[0]); // r10
+                    int16x8_t _r11 = vmovl_s8(_r1_s8.val[1]); // r11
+                    int16x8_t _r12 = vmovl_s8(_r12_s8);       // r12
+
+                    int16x8_t _r20 = vmovl_s8(_r2_s8.val[0]); // r20
+                    int16x8_t _r21 = vmovl_s8(_r2_s8.val[1]); // r21
+                    int16x8_t _r22 = vmovl_s8(_r22_s8);       // r22
+
+                    int8x16_t _k_s8 = vld1q_s8(ktmp);
+                    int16x8_t _k_s16 = vmovl_s8(vget_low_s8(_k_s8)); // k0...k8
+                    int16x8_t _kn_s16 = vmovl_s8(vget_high_s8(_k_s8));// k9... 
+
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r00), _k_s16, 0);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r00), _k_s16, 0);
+                    int32x4_t _sum01 = vmull_laneq_s16(vget_low_s16(_r01), _k_s16, 1);
+                    int32x4_t _sum01n = vmull_laneq_s16(vget_high_s16(_r01), _k_s16, 1);
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r02), _k_s16, 2);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r02), _k_s16, 2);
+
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r10), _k_s16, 3);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r10), _k_s16, 3);
+                    _sum01 = vmlal_laneq_s16(_sum01, vget_low_s16(_r11), _k_s16, 4);
+                    _sum01n = vmlal_laneq_s16(_sum01n, vget_high_s16(_r11), _k_s16, 4);
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r12), _k_s16, 5);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r12), _k_s16, 5);
+
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r20), _k_s16, 6);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r20), _k_s16, 6);
+                    _sum01 = vmlal_laneq_s16(_sum01, vget_low_s16(_r21), _k_s16, 7);
+                    _sum01n = vmlal_laneq_s16(_sum01n, vget_high_s16(_r21), _k_s16, 7);
+                    _sum0 = vmlal_laneq_s16(_sum0, vget_low_s16(_r22), _kn_s16, 0);
+                    _sum0n = vmlal_laneq_s16(_sum0n, vget_high_s16(_r22), _kn_s16, 0);
+
+                    _sum0 = vaddq_s32(_sum0, _sum01);
+                    _sum0n = vaddq_s32(_sum0n, _sum01n);
 
-                    *outptr0 += sum0;
-                    *outptr0n += sum0n;
+                    // save s32 to memory
+                    vst1q_s32(outptr, _sum0);
+                    vst1q_s32(outptr+4, _sum0n);
+               
+                    r0 += 16;
+                    r1 += 16;
+                    r2 += 16;
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    r3++;
-                    outptr0++;
-                    outptr0n++;
+                    outptr += 8;
                 }
+#endif // __ARM_NEON
 
-                r0 += 2 + w;
-                r1 += 2 + w;
-                r2 += 2 + w;
-                r3 += 2 + w;
-
-                outptr0 += outw;
-                outptr0n += outw;
-            }
+                if (remain > 0)
+                {
+#if __ARM_NEON
+                    int8x8_t _k01234567s8 = vld1_s8(ktmp);
+                    int8x8_t _k8xxxxxxxs8 = vld1_s8(ktmp+8);
+                    int8x8_t _k34567xxxs8 = vext_s8(_k01234567s8, _k01234567s8, 3);
+                    int8x8_t _k678xxxxxs8 = vext_s8(_k01234567s8, _k8xxxxxxxs8, 6);
+                    int16x8_t _k0123_s16 = vmovl_s8(_k01234567s8);
+                    int16x8_t _k3456_s16 = vmovl_s8(_k34567xxxs8);
+                    int16x8_t _k678x_s16 = vmovl_s8(_k678xxxxxs8);
+#endif
+                    for (; remain>0; remain--)
+                    {
+#if __ARM_NEON
+                        int8x8_t _r00s8 = vld1_s8(r0);
+                        int8x8_t _r10s8 = vld1_s8(r1);
+                        int8x8_t _r20s8 = vld1_s8(r2);
 
-            for (; i < outh; i++)
-            {
-                int nn = outw >> 3;
-                int remain = outw & 7;
+                        int16x8_t _r00s16 = vmovl_s8(_r00s8);
+                        int16x8_t _r10s16 = vmovl_s8(_r10s8);
+                        int16x8_t _r20s16 = vmovl_s8(_r20s8);
 
-                if (nn > 0)
-                {
-                    asm volatile(
-                        "vld1.8    {d26-d27}, [%0]    \n"
-                        : "=r"(kernel0) // %0
-                        : "0"(kernel0)
-                        : "cc", "memory"
-                    );
+                        int32x4_t _sum = vmull_s16(vget_low_s16(_r00s16), vget_low_s16(_k0123_s16));
+                        _sum = vmlal_s16(_sum, vget_low_s16(_r10s16), vget_low_s16(_k3456_s16));
+                        _sum = vmlal_s16(_sum, vget_low_s16(_r20s16), vget_low_s16(_k678x_s16));
 
-                    asm volatile(
-                        "0:                             \n"
-                        "pld        [%2, #128]          \n"
-                        "vld1.32    {d0-d1}, [%2]       \n"// r0
-                        "add        %2, #8              \n"
-                        "vext.8     d2, d0, d1, #1      \n"
-                        "vext.8     d3, d0, d1, #2      \n"
-                        
-                        "vdup.s8     d1, d26[0]         \n"
-                        "vdup.s8    d30, d26[1]         \n"
-                        "vdup.s8    d31, d26[2]         \n"
-                        "vmull.s8   q2, d0, d1          \n"// k0
-                        "vmlal.s8   q2, d2, d30         \n"// k1
-                        "vmlal.s8   q2, d3, d31         \n"// k2
-                        
-                        "pld        [%3, #128]          \n"
-                        "vld1.32    {d6-d7}, [%3]       \n"// r1
-                        "add        %3, #8              \n"
-                        "vext.8     d8, d6, d7, #1      \n"
-                        "vext.8     d9, d6, d7, #2      \n"
-                        
-                        "vdup.s8     d1, d26[3]         \n"
-                        "vdup.s8    d30, d26[4]         \n"
-                        "vdup.s8    d31, d26[5]         \n"
-                        "vmlal.s8   q2, d6, d1          \n"// k3
-                        "vmlal.s8   q2, d8, d30         \n"// k4
-                        "vmlal.s8   q2, d9, d31         \n"// k5
-
-                        "pld        [%4, #128]          \n"
-                        "vld1.32    {d10-d11}, [%4]     \n"// r2
-                        "add        %4, #8              \n"
-                        "vext.8     d12, d10, d11, #1   \n"
-                        "vext.8     d13, d10, d11, #2   \n"
-                        
-                        "vdup.s8     d1, d26[6]         \n"
-                        "vdup.s8    d30, d26[7]         \n"
-                        "vdup.s8    d31, d27[0]         \n"
-                        "vmlal.s8   q2, d10, d1         \n"// k6
-                        "vmlal.s8   q2, d12, d30        \n"// k7
-                        "vmlal.s8   q2, d13, d31        \n"// k8
-                        
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d18-d21}, [%1]     \n"// sum0
-                        "vaddw.s16   q9,  q9, d4        \n"
-                        "vaddw.s16  q10, q10, d5        \n"
-                        "vst1.32    {d18-d21}, [%1]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(r0),             // %2
-                          "=r"(r1),             // %3
-                          "=r"(r2)              // %4
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(r0),
-                          "3"(r1),
-                          "4"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
-                    );
-                }
+                        _sum = vsetq_lane_s32(*outptr, _sum, 3);
 
-                for (; remain>0; remain--)
-                {
-                    int sum0 = 0;
+                        *outptr = vaddvq_s32(_sum);
+#else
+                        int sum = 0;
 
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
+                        sum += (int)r0[0] * ktmp[0];
+                        sum += (int)r0[1] * ktmp[1];
+                        sum += (int)r0[2] * ktmp[2];
+                        sum += (int)r1[0] * ktmp[3];
+                        sum += (int)r1[1] * ktmp[4];
+                        sum += (int)r1[2] * ktmp[5];
+                        sum += (int)r2[0] * ktmp[6];
+                        sum += (int)r2[1] * ktmp[7];
+                        sum += (int)r2[2] * ktmp[8];
 
-                    *outptr0 += sum0;
+                        *outptr += sum;
+#endif // __ARM_NEON
+                        r0 += 2;
+                        r1 += 2;
+                        r2 += 2;
+                        outptr++;
+                    }
+                }
 
-                    r0++;
-                    r1++;
-                    r2++;
-                    outptr0++;
-                }   
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+            }
 
-                r0 += 2;
-                r1 += 2;
-                r2 += 2;
-            }           
-            kernel0 += 9;
-        }       
+            ktmp += 9;
+        }
     }
 }
-
-static void conv3x3s2_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
+#else // __aarch64__
+static void conv3x3s1_winograd23_int8_neon(const Mat& bottom_blob, Mat& top_blob, const std::vector<Mat> &kernel_tm_test, const Option& opt)
 {
     int w = bottom_blob.w;
+    int h = bottom_blob.h;
     int inch = bottom_blob.c;
 
     int outw = top_blob.w;
     int outh = top_blob.h;
     int outch = top_blob.c;
 
-    const int tailstep = w - 2 * outw + w;
+    // pad to 2n+2, winograd F(2,3)
+    Mat bottom_blob_bordered = bottom_blob;
 
-    const signed char* kernel = _kernel;
-    
-    int nn_outch = outch >> 1;
-    int remain_outch_start = nn_outch << 1; 
+    outw = (outw + 1) / 2 * 2;
+    outh = (outh + 1) / 2 * 2;
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int pp=0; pp < nn_outch; pp++)
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);  
+
+    // BEGIN transform input
+    Mat bottom_blob_tm;
     {
-        int p = pp * 2;
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-        Mat out0 = top_blob.channel(p);
-        Mat out1 = top_blob.channel(p + 1);
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-        out0.fill(0.f);
-        out1.fill(0.f);
+        const int tiles = nColBlocks * nRowBlocks;
 
-        const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
-        const signed char* kernel1 = (const signed char*)kernel + (p + 1) * inch * 9;
+        bottom_blob_tm.create(4, inch, tiles*4, 2u, opt.workspace_allocator);
 
+        // BT
+        // const float itm[4][4] = {
+        //     {1.0f,  0.0f, -1.0f,  0.0f},
+        //     {0.0f,  1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  0.00f, 1.0f}
+        // };        
+        
+        #pragma omp parallel for num_threads(opt.num_threads)
         for (int q=0; q<inch; q++)
         {
-            int* outptr0 = out0;
-            int* outptr1 = out1;
+            const signed char* img = bottom_blob_bordered.channel(q);
 
-            const signed char* img0 = bottom_blob.channel(q);
+            for (int j=0; j<nColBlocks; j++)
+            {
+                const signed char* r0 = img + w * j * 2;
+                const signed char* r1 = r0 + w;
+                const signed char* r2 = r1 + w;
+                const signed char* r3 = r2 + w;
 
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;
+                for (int i = 0; i<nRowBlocks; i++)
+                {
+                    short* out_tm0 = bottom_blob_tm.channel(tiles*0+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm1 = bottom_blob_tm.channel(tiles*1+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm2 = bottom_blob_tm.channel(tiles*2+j*nRowBlocks+i).row<short>(q);
+                    short* out_tm3 = bottom_blob_tm.channel(tiles*3+j*nRowBlocks+i).row<short>(q);
+#if __ARM_NEON
+                    asm volatile(
+                        // load
+                        "pld         [%0, #64]     \n"
+                        "vld1.s8     {d0}, [%0]    \n"
+                        "pld         [%1, #64]     \n"
+                        "vld1.s8     {d1}, [%1]    \n"
+                        "pld         [%2, #64]     \n"
+                        "vld1.s8     {d2}, [%2]    \n"
+                        "pld         [%3, #64]     \n"
+                        "vld1.s8     {d3}, [%3]    \n"
+                        // w = B_t * d, trans int8 to int16
+                        "vsubl.s8    q2, d0, d2    \n" // d4
+                        "vaddl.s8    q3, d1, d2    \n" // d6
+                        "vsubl.s8    q4, d2, d1    \n" // d8
+                        "vsubl.s8    q5, d3, d1    \n" // d10
+                        // transpose w to w_t
+                        "vtrn.s16    d4, d6        \n"
+                        "vtrn.s16    d8, d10       \n"
+                        "vtrn.s32    d4, d8        \n"
+                        "vtrn.s32    d6, d10       \n"
+                        // U = B_t * w_t
+                        "vsub.s16    d11, d4, d8   \n"
+                        "vadd.s16    d12, d6, d8   \n"
+                        "vsub.s16    d13, d8, d6   \n"
+                        "vsub.s16    d14, d10, d6  \n"
+                        // save
+                        "vst1.s32    {d11}, [%4]   \n"
+                        "vst1.s32    {d12}, [%5]   \n"
+                        "vst1.s32    {d13}, [%6]   \n"
+                        "vst1.s32    {d14}, [%7]   \n"
+                        : "=r"(r0),      // %0
+                          "=r"(r1),      // %1
+                          "=r"(r2),      // %2
+                          "=r"(r3),      // %3
+                          "=r"(out_tm0), // %4
+                          "=r"(out_tm1), // %5
+                          "=r"(out_tm2), // %6
+                          "=r"(out_tm3)  // %7
+                        : "0"(r0),
+                          "1"(r1),
+                          "2"(r2),
+                          "3"(r3),
+                          "4"(out_tm0),
+                          "5"(out_tm1),
+                          "6"(out_tm2),
+                          "7"(out_tm3)
+                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+                    );
+#else
+                    short d0[4],d1[4],d2[4],d3[4];
+                    short w0[4],w1[4],w2[4],w3[4];
+                    short t0[4],t1[4],t2[4],t3[4];
+                    // load 
+                    for (int n = 0; n < 4; n++)
+                    {
+                        d0[n] = r0[n];
+                        d1[n] = r1[n];
+                        d2[n] = r2[n];
+                        d3[n] = r3[n];
+                    }
+                    // w = B_t * d
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        w0[n] = d0[n] - d2[n];
+                        w1[n] = d1[n] + d2[n];
+                        w2[n] = d2[n] - d1[n];
+                        w3[n] = d3[n] - d1[n];
+                    } 
+                    // transpose w to w_t
+                    {
+                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
+                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
+                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
+                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
+                    }
+                    // U = B_t * w_t
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        d0[n] = t0[n] - t2[n];
+                        d1[n] = t1[n] + t2[n];
+                        d2[n] = t2[n] - t1[n];
+                        d3[n] = t3[n] - t1[n];
+                    }                
+                    // save to out_tm
+                    for (int n = 0; n < 4; n++)
+                    {
+                        out_tm0[n] = d0[n];
+                        out_tm1[n] = d1[n];
+                        out_tm2[n] = d2[n];
+                        out_tm3[n] = d3[n];
+                    }
+#endif                           
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;    
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat();
 
-            int i = 0;
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-            for (; i < outh; i++)
-            {                           
-                int nn = outw >> 3;
-                int remain = outw & 7;  
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-                asm volatile(
-                    "vld1.s8    {d22-d23}, [%0]    \n"
-                    "vld1.s8    {d24-d25}, [%1]    \n"
-                    : "=r"(kernel0), // %0
-                      "=r"(kernel1)  // %1
-                    : "0"(kernel0),
-                      "1"(kernel1)
-                    : "cc", "memory"
-                );
+        const int tiles = nColBlocks * nRowBlocks; 
 
-                if (nn > 0)
+        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int r=0; r<4; r++)
+        {
+            int nn_outch = 0;
+            int remain_outch_start = 0;
+
+            nn_outch = outch >> 3;
+            remain_outch_start = nn_outch << 3;
+
+            for (int pp=0; pp<nn_outch; pp++)
+            {
+                int p = pp * 8;
+
+                int* output0_tm = top_blob_tm.channel(p);
+                int* output1_tm = top_blob_tm.channel(p+1);
+                int* output2_tm = top_blob_tm.channel(p+2);
+                int* output3_tm = top_blob_tm.channel(p+3);
+                int* output4_tm = top_blob_tm.channel(p+4);
+                int* output5_tm = top_blob_tm.channel(p+5);
+                int* output6_tm = top_blob_tm.channel(p+6);
+                int* output7_tm = top_blob_tm.channel(p+7);
+
+                output0_tm = output0_tm + r*4;
+                output1_tm = output1_tm + r*4;
+                output2_tm = output2_tm + r*4;
+                output3_tm = output3_tm + r*4;
+                output4_tm = output4_tm + r*4;
+                output5_tm = output5_tm + r*4;
+                output6_tm = output6_tm + r*4;
+                output7_tm = output7_tm + r*4;
+
+                for (int i=0; i<tiles; i++)
                 {
+                    const short* kptr = kernel_tm_test[r].channel(p/8);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
+#if __ARM_NEON
                     asm volatile(
-                        "0:                             \n"
-                        "pld        [%3, #192]          \n"
-                        "vld2.s8    {d0-d1}, [%3]!      \n" // r0
-                        "vld2.s8    {d2-d3}, [%3]       \n"
-                        "vext.8     d3, d0, d2, #1      \n"
-            
-                        "vdup.s8    d26, d22[0]         \n"
-                        "vdup.s8    d27, d22[1]         \n"
-                        "vdup.s8    d28, d22[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "pld        [%4, #192]          \n"
-                        "vld2.s8    {d6-d7}, [%4]!      \n" // r1
-                        "vld2.s8    {d8-d9}, [%4]       \n"
-                        "vext.8     d9, d6, d8, #1      \n"
-                        
-                        "vdup.s8    d26, d22[3]         \n"
-                        "vdup.s8    d27, d22[4]         \n"
-                        "vdup.s8    d28, d22[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-
-                        "pld        [%5, #192]          \n" 
-                        "vld2.s8    {d10-d11}, [%5]!    \n" // r2
-                        "vld2.s8    {d12-d13}, [%5]     \n"
-                        "vext.8     d13, d10, d12, #1   \n"
+                        // inch loop
+                        "vmov.s32    q0, #0           \n"
+                        "vmov.s32    q1, #0           \n"
+                        "vmov.s32    q2, #0           \n"
+                        "vmov.s32    q3, #0           \n"
+                        "vmov.s32    q4, #0           \n"
+                        "vmov.s32    q5, #0           \n"
+                        "vmov.s32    q6, #0           \n"
+                        "vmov.s32    q7, #0           \n"
+                        "mov         r4, %20          \n"
                         
-                        "vdup.s8    d26, d22[6]         \n"
-                        "vdup.s8    d27, d22[7]         \n"
-                        "vdup.s8    d28, d23[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%1, #256]          \n"
-                        "vld1.32    {d14-d17}, [%1]     \n" //sum0
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vaddw.s16   q8, q8, d5         \n"
-                        "vst1.32    {d14-d17}, [%1]!    \n"
-                        
-                        "vdup.s8    d26, d24[0]         \n"
-                        "vdup.s8    d27, d24[1]         \n"
-                        "vdup.s8    d28, d24[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "vdup.s8    d26, d24[3]         \n"
-                        "vdup.s8    d27, d24[4]         \n"
-                        "vdup.s8    d28, d24[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-                        
-                        "vdup.s8    d26, d24[6]         \n"
-                        "vdup.s8    d27, d24[7]         \n"
-                        "vdup.s8    d28, d25[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%2, #256]          \n"
-                        "vld1.32    {d14-d17}, [%2]     \n" //sum1
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vaddw.s16   q8, q8, d5         \n"
-                        "vst1.32    {d14-d17}, [%2]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(outptr1),        // %2
-                          "=r"(r0),             // %3
-                          "=r"(r1),             // %4
-                          "=r"(r2)              // %5
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(outptr1),
-                          "3"(r0),
-                          "4"(r1),
-                          "5"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q13", "q14", "q15"
+                        "0:                           \n" // for (int q=0; q<inch; q++)
+                        "vld1.s16    {d16}, [%8]!     \n" // _r0 = vld1_s16(r0);  // input inch0
+                        "vld1.s16    {d18-d19}, [%9]  \n" // _k0 = vld1q_s16(kptr);
+                        "add         %9, #16          \n" 
+                        "vld1.s16    {d20-d21}, [%9]  \n" // _k0n = vld1q_s16(kptr+8);
+                        "add         %9, #16          \n"   
+                        "vld1.s16    {d22-d23}, [%9]  \n" // _k1 = vld1q_s16(kptr+16);
+                        "add         %9, #16          \n"  
+                        "vld1.s16    {d24-d25}, [%9]  \n" // _k1n = vld1q_s16(kptr+24);
+                        "add         %9, #16          \n"
+
+                        "vmlal.s16   q0, d16, d18     \n" // sum0 += (a00-a03) * (k00-k03)
+                        "vmlal.s16   q1, d16, d19     \n" // sum1 += (a00-a03) * (k10-k13)
+                        "vmlal.s16   q2, d16, d20     \n" // sum2 += (a00-a03) * (k20-k23)
+                        "vmlal.s16   q3, d16, d21     \n" // sum3 += (a00-a03) * (k30-k33)
+                        "vmlal.s16   q4, d16, d22     \n" // sum4 += (a00-a03) * (k40-k43)
+                        "vmlal.s16   q5, d16, d23     \n" // sum5 += (a00-a03) * (k50-k53)
+                        "vmlal.s16   q6, d16, d24     \n" // sum6 += (a00-a03) * (k60-k63)
+                        "vmlal.s16   q7, d16, d25     \n" // sum7 += (a00-a03) * (k70-k73)
+
+                        "subs        r4, r4, #1       \n"
+                        "bne         0b               \n" // end for
+
+                        "vst1.s32    {d0-d1}, [%0]    \n" // store the result to memory
+                        "vst1.s32    {d2-d3}, [%1]    \n"
+                        "vst1.s32    {d4-d5}, [%2]    \n"
+                        "vst1.s32    {d6-d7}, [%3]    \n"
+                        "vst1.s32    {d8-d9}, [%4]    \n"
+                        "vst1.s32    {d10-d11}, [%5]  \n"
+                        "vst1.s32    {d12-d13}, [%6]  \n"
+                        "vst1.s32    {d14-d15}, [%7]  \n"
+
+                        : "=r"(output0_tm), // %0
+                          "=r"(output1_tm), // %1
+                          "=r"(output2_tm), // %2
+                          "=r"(output3_tm), // %3
+                          "=r"(output4_tm), // %4
+                          "=r"(output5_tm), // %5
+                          "=r"(output6_tm), // %6
+                          "=r"(output7_tm), // %7
+                          "=r"(r0),         // %8
+                          "=r"(kptr)        // %9
+                        : "0"(output0_tm),
+                          "1"(output1_tm),
+                          "2"(output2_tm),
+                          "3"(output3_tm),
+                          "4"(output4_tm),
+                          "5"(output5_tm),
+                          "6"(output6_tm),
+                          "7"(output7_tm),
+                          "8"(r0),
+                          "9"(kptr),
+                          "r"(inch)         // %20
+                        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12"
                     );
-                }           
+#else
+                    int sum0[4] = {0};
+                    int sum1[4] = {0};
+                    int sum2[4] = {0};
+                    int sum3[4] = {0};
+                    int sum4[4] = {0};
+                    int sum5[4] = {0};
+                    int sum6[4] = {0};
+                    int sum7[4] = {0};
+
+                    for (int q=0; q<inch; q++)
+                    {
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                            sum1[n] += (int)r0[n] * kptr[n+4];
+                            sum2[n] += (int)r0[n] * kptr[n+8];
+                            sum3[n] += (int)r0[n] * kptr[n+12];
+                            sum4[n] += (int)r0[n] * kptr[n+16];
+                            sum5[n] += (int)r0[n] * kptr[n+20];
+                            sum6[n] += (int)r0[n] * kptr[n+24];
+                            sum7[n] += (int)r0[n] * kptr[n+28];
+                        }
+                        kptr += 32;
+                        r0 += 4;
+                    }
 
-                if (remain >= 4)
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                        output1_tm[n] = sum1[n];
+                        output2_tm[n] = sum2[n];
+                        output3_tm[n] = sum3[n];
+                        output4_tm[n] = sum4[n];
+                        output5_tm[n] = sum5[n];
+                        output6_tm[n] = sum6[n];
+                        output7_tm[n] = sum7[n];
+                    }
+#endif // __ARM_NEON
+                    output0_tm += 16;
+                    output1_tm += 16;
+                    output2_tm += 16;
+                    output3_tm += 16;
+                    output4_tm += 16;
+                    output5_tm += 16;
+                    output6_tm += 16;
+                    output7_tm += 16;
+                }
+            }
+
+            nn_outch = (outch - remain_outch_start) >> 2;
+
+            //#pragma omp parallel for num_threads(opt.num_threads)
+            for (int pp=0; pp<nn_outch; pp++)
+            {
+                int p = remain_outch_start + pp * 4;
+
+                int* output0_tm = top_blob_tm.channel(p);
+                int* output1_tm = top_blob_tm.channel(p+1);
+                int* output2_tm = top_blob_tm.channel(p+2);
+                int* output3_tm = top_blob_tm.channel(p+3);
+
+                output0_tm = output0_tm + r*4;
+                output1_tm = output1_tm + r*4;
+                output2_tm = output2_tm + r*4;
+                output3_tm = output3_tm + r*4;
+
+                for (int i=0; i<tiles; i++)
                 {
-                    remain -= 4;
+                    const short* kptr = kernel_tm_test[r].channel(p/8 + (p%8)/4);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
+#if __ARM_NEON
                     asm volatile(
-                        "pld        [%3, #192]          \n"
-                        "vld2.s8    {d0-d1}, [%3]!      \n" // r0
-                        "vld2.s8    {d2-d3}, [%3]       \n"
-                        "vext.8     d3, d0, d2, #1      \n"
-            
-                        "vdup.s8    d26, d22[0]         \n"
-                        "vdup.s8    d27, d22[1]         \n"
-                        "vdup.s8    d28, d22[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "pld        [%4, #192]          \n"
-                        "vld2.s8    {d6-d7}, [%4]!      \n" // r1
-                        "vld2.s8    {d8-d9}, [%4]       \n"
-                        "vext.8     d9, d6, d8, #1      \n"
-                        
-                        "vdup.s8    d26, d22[3]         \n"
-                        "vdup.s8    d27, d22[4]         \n"
-                        "vdup.s8    d28, d22[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-
-                        "pld        [%5, #192]          \n" 
-                        "vld2.s8    {d10-d11}, [%5]!    \n" // r2
-                        "vld2.s8    {d12-d13}, [%5]     \n"
-                        "vext.8     d13, d10, d12, #1   \n"
+                        // inch loop
+                        "vmov.s32    q0, #0           \n"
+                        "vmov.s32    q1, #0           \n"
+                        "vmov.s32    q2, #0           \n"
+                        "vmov.s32    q3, #0           \n"
+                        "mov         r4, %12          \n"
                         
-                        "sub        %3, #8              \n"
-                        "sub        %4, #8              \n"
-                        "sub        %5, #8              \n"
-                        
-                        "vdup.s8    d26, d22[6]         \n"
-                        "vdup.s8    d27, d22[7]         \n"
-                        "vdup.s8    d28, d23[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d14-d15}, [%1]     \n" //sum0
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vst1.32    {d14-d15}, [%1]!    \n"
-                        
-                        "vdup.s8    d26, d24[0]         \n"
-                        "vdup.s8    d27, d24[1]         \n"
-                        "vdup.s8    d28, d24[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "vdup.s8    d26, d24[3]         \n"
-                        "vdup.s8    d27, d24[4]         \n"
-                        "vdup.s8    d28, d24[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-                        
-                        "vdup.s8    d26, d24[6]         \n"
-                        "vdup.s8    d27, d24[7]         \n"
-                        "vdup.s8    d28, d25[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%2, #128]          \n"
-                        "vld1.32    {d14-d15}, [%2]     \n" //sum1
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vst1.32    {d14-d15}, [%2]!    \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),        // %1
-                          "=r"(outptr1),        // %2
-                          "=r"(r0),             // %3
-                          "=r"(r1),             // %4
-                          "=r"(r2)              // %5
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(outptr1),
-                          "3"(r0),
-                          "4"(r1),
-                          "5"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q13", "q14", "q15"
-                    );                    
+                        "0:                           \n" // for (int q=0; q<inch; q++)
+                        "vld1.s16    {d16}, [%4]!     \n" // _r0 = vld1_s16(r0);  // input inch0
+                        "vld1.s16    {d18-d19}, [%5]  \n" // _k0 = vld1q_s16(kptr);
+                        "add         %5, #16          \n" 
+                        "vld1.s16    {d20-d21}, [%5]  \n" // _k0n = vld1q_s16(kptr+8);
+                        "add         %5, #16          \n"
+
+                        "vmlal.s16   q0, d16, d18     \n" // sum0 += (a00-a03) * (k00-k03)
+                        "vmlal.s16   q1, d16, d19     \n" // sum1 += (a00-a03) * (k10-k13)
+                        "vmlal.s16   q2, d16, d20     \n" // sum2 += (a00-a03) * (k20-k23)
+                        "vmlal.s16   q3, d16, d21     \n" // sum3 += (a00-a03) * (k30-k33)
+
+                        "subs        r4, r4, #1       \n"
+                        "bne         0b               \n" // end for
+
+                        "vst1.s32    {d0-d1}, [%0]    \n" // store the result to memory
+                        "vst1.s32    {d2-d3}, [%1]    \n"
+                        "vst1.s32    {d4-d5}, [%2]    \n"
+                        "vst1.s32    {d6-d7}, [%3]    \n"
+
+                        : "=r"(output0_tm), // %0
+                          "=r"(output1_tm), // %1
+                          "=r"(output2_tm), // %2
+                          "=r"(output3_tm), // %3
+                          "=r"(r0),         // %4
+                          "=r"(kptr)        // %5
+                        : "0"(output0_tm),
+                          "1"(output1_tm),
+                          "2"(output2_tm),
+                          "3"(output3_tm),
+                          "4"(r0),
+                          "5"(kptr),
+                          "r"(inch)         // %12
+                        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+                    );                   
+#else
+                    int sum0[4] = {0};
+                    int sum1[4] = {0};
+                    int sum2[4] = {0};
+                    int sum3[4] = {0};
+
+                    for (int q=0; q<inch; q++)
+                    {   
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                            sum1[n] += (int)r0[n] * kptr[n+4];
+                            sum2[n] += (int)r0[n] * kptr[n+8];
+                            sum3[n] += (int)r0[n] * kptr[n+12];
+                        }
+                        kptr += 16;
+                        r0 += 4;
+                    }
+
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                        output1_tm[n] = sum1[n];
+                        output2_tm[n] = sum2[n];
+                        output3_tm[n] = sum3[n];
+                    }
+#endif // __ARM_NEON
+                    output0_tm += 16;
+                    output1_tm += 16;
+                    output2_tm += 16;
+                    output3_tm += 16;
                 }
+            }
 
-                for (; remain>0; remain--)
+            remain_outch_start += nn_outch << 2;
+            //#pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=remain_outch_start; p<outch; p++)
+            {
+                int* output0_tm = top_blob_tm.channel(p);
+
+                output0_tm = output0_tm + r*4;
+
+                for (int i=0; i<tiles; i++)
                 {
-                    int sum0 = 0;
-                    int sum1 = 0;
-                
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-
-                    sum1 += (int)r0[0] * kernel1[0];
-                    sum1 += (int)r0[1] * kernel1[1];
-                    sum1 += (int)r0[2] * kernel1[2];
-                    sum1 += (int)r1[0] * kernel1[3];
-                    sum1 += (int)r1[1] * kernel1[4];
-                    sum1 += (int)r1[2] * kernel1[5];
-                    sum1 += (int)r2[0] * kernel1[6];
-                    sum1 += (int)r2[1] * kernel1[7];
-                    sum1 += (int)r2[2] * kernel1[8];
-                
-                    *outptr0 += sum0;
-                    *outptr1 += sum1;
+                    const short* kptr = kernel_tm_test[r].channel(p/8 + (p%8)/4 + p%4);
+                    const short* r0 = bottom_blob_tm.channel(tiles*r+i);
+#if __ARM_NEON
+                    asm volatile(
+                        // inch loop
+                        "vmov.s32    q0, #0           \n"
+                        "mov         r4, %6           \n"
+                        
+                        "0:                           \n" // for (int q=0; q<inch; q++)
+                        "vld1.s16    {d16}, [%1]      \n" // _r0 = vld1_s16(r0);  // input inch0
+                        "add         %1, #8           \n"
+                        "vld1.s16    {d18}, [%2]      \n" // _k0 = vld1q_s16(kptr);
+                        "add         %2, #8           \n"
+                        "vmlal.s16   q0, d16, d18     \n" // sum0 += (a00-a03) * (k00-k03)
+
+                        "subs        r4, r4, #1       \n"
+                        "bne         0b               \n" // end for
+
+                        "vst1.s32    {d0-d1}, [%0]    \n" // store the result to memory
+
+                        : "=r"(output0_tm), // %0
+                          "=r"(r0),         // %1
+                          "=r"(kptr)        // %2
+                        : "0"(output0_tm),
+                          "1"(r0),
+                          "2"(kptr),
+                          "r"(inch)         // %6
+                        : "cc", "memory", "r4", "q0", "q8", "q9"
+                    );               
+#else
+                    int sum0[4] = {0};
 
-                    r0 += 2;
-                    r1 += 2;
-                    r2 += 2;
-                    outptr0++;
-                    outptr1++;
-                }       
+                    for (int q=0; q<inch; q++)
+                    {
+                        for (int n=0; n<4; n++)
+                        {
+                            sum0[n] += (int)r0[n] * kptr[n];
+                        }
+                        kptr += 4; 
+                        r0 += 4;
+                    }
 
-                r0 += tailstep;
-                r1 += tailstep;
-                r2 += tailstep;
+                    for (int n=0; n<4; n++)
+                    {
+                        output0_tm[n] = sum0[n];
+                    }           
+#endif                           
+                    output0_tm += 16;       
+                }
             }
-
-            kernel0 += 9;
-            kernel1 += 9;
-        }
+        }   
     }
+    bottom_blob_tm = Mat();
+    // END dot    
 
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p=remain_outch_start; p<outch; p++)
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
     {
-        Mat out0 = top_blob.channel(p);
-
-        out0.fill(0.f);
-
-        const signed char* kernel0 = (const signed char*)kernel + p * inch * 9;
-
-        for (int q=0; q<inch; q++)
-        {
-            int* outptr0 = out0;
+        // AT
+        // const float itm[2][4] = {
+        //     {1.0f,  1.0f,  1.0f,  0.0f},
+        //     {0.0f,  1.0f, -1.0f,  1.0f}
+        // }; 
 
-            const signed char* img0 = bottom_blob.channel(q);
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
 
-            const signed char* r0 = img0;
-            const signed char* r1 = img0 + w;
-            const signed char* r2 = img0 + w * 2;  
+        int nColBlocks = h_tm/4; // may be the block num in FeatherCNN
+        int nRowBlocks = w_tm/4;
 
-            int i = 0;
+        int32x2_t _shift = vdup_n_s32(-2);
 
-            for (; i < outh; i++)
-            {           
-                int nn = outw >> 3;
-                int remain = outw & 7;  
-                
-                asm volatile(
-                    "vld1.s8    {d22-d23}, [%0]    \n"
-                    : "=r"(kernel0) // %0
-                    : "0"(kernel0) 
-                    : "cc", "memory"
-                );
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<outch; p++)
+        {
+            int* out_tile = top_blob_tm.channel(p);
+            int* outRow0 = top_blob_bordered.channel(p);
+            int* outRow1 = outRow0 + outw;     
 
-                if (nn > 0)
+            for (int j=0; j<nColBlocks; j++)
+            {
+                for(int i=0; i<nRowBlocks; i++)
                 {
+#if __ARM_NEON
                     asm volatile(
-                        "0:                             \n"
-                        "pld        [%2, #192]          \n"
-                        "vld2.s8    {d0-d1}, [%2]!      \n" // r0
-                        "vld2.s8    {d2-d3}, [%2]       \n"
-                        "vext.8     d3, d0, d2, #1      \n"
-            
-                        "vdup.s8    d26, d22[0]         \n"
-                        "vdup.s8    d27, d22[1]         \n"
-                        "vdup.s8    d28, d22[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "pld        [%3, #192]          \n"
-                        "vld2.s8    {d6-d7}, [%3]!      \n" // r1
-                        "vld2.s8    {d8-d9}, [%3]       \n"
-                        "vext.8     d9, d6, d8, #1      \n"
-                        
-                        "vdup.s8    d26, d22[3]         \n"
-                        "vdup.s8    d27, d22[4]         \n"
-                        "vdup.s8    d28, d22[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-
-                        "pld        [%4, #192]          \n"
-                        "vld2.s8    {d10-d11}, [%4]!    \n" // r2
-                        "vld2.s8    {d12-d13}, [%4]     \n"
-                        "vext.8     d13, d10, d12, #1   \n"
+                        "pld        [%0, #512]      \n"
+                        "vldm        %0!, {d0-d7}   \n"
+
+                        "vaddq.s32    q0, q0, q1    \n" // s0 = s0 + s1 + s2;
+                        "vsubq.s32    q1, q1, q2    \n"
+                        "vaddq.s32    q0, q0, q2    \n" // s1 = s1 - s2 + s3;
+                        "vaddq.s32    q1, q1, q3    \n"
+
+                        "vtrn.s32    q0, q1         \n"
                         
-                        "vdup.s8    d26, d22[6]         \n"
-                        "vdup.s8    d27, d22[7]         \n"
-                        "vdup.s8    d28, d23[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%1, #256]          \n"
-                        "vld1.32    {d14-d17}, [%1]     \n" //sum0
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vaddw.s16   q8, q8, d5         \n"
-                        "vst1.32    {d14-d17}, [%1]!    \n"
-
-                        "subs       %0, #1              \n"
-                        "bne        0b                  \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),    // %1
-                          "=r"(r0),             // %2
-                          "=r"(r1),             // %3
-                          "=r"(r2)              // %4
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(r0),
-                          "3"(r1),
-                          "4"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q12", "q13", "q14"
+                        "vadd.s32    d8, d0, d2     \n" // o0 = d0 + d1 + d2;
+                        "vsub.s32    d9, d2, d1     \n"
+                        "vadd.s32    d8, d8, d1     \n" // o1 = d1 - d2 + d3;
+                        "vadd.s32    d9, d9, d3     \n"
+
+                        "vshl.s32    d8, d8, %P6    \n" // o0 = o0 >> 2
+                        "vshl.s32    d9, d9, %P6    \n" // o1 = o1 >> 2
+
+                        "vst1.s32    {d8}, [%1]!    \n"
+                        "vst1.s32    {d9}, [%2]!    \n"
+                        : "=r"(out_tile), // %0
+                          "=r"(outRow0),  // %1
+                          "=r"(outRow1)   // %2
+                        : "0"(out_tile),
+                          "1"(outRow0),
+                          "2"(outRow1),
+                          "w"(_shift)     // %6
+                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4"
                     );
-                }
-                
-                if (remain >= 4)
-                {
-                    remain -= 4;
-                    asm volatile(
-                        "pld        [%2, #192]          \n"
-                        "vld2.s8    {d0-d1}, [%2]!      \n" // r0
-                        "vld2.s8    {d2-d3}, [%2]       \n"
-                        "vext.8     d3, d0, d2, #1      \n"
-            
-                        "vdup.s8    d26, d22[0]         \n"
-                        "vdup.s8    d27, d22[1]         \n"
-                        "vdup.s8    d28, d22[2]         \n"
-                        "vmull.s8   q2, d0, d26         \n" // k00
-                        "vmlal.s8   q2, d1, d27         \n" // k01
-                        "vmlal.s8   q2, d3, d28         \n" // k02
-                        
-                        "pld        [%3, #192]          \n"
-                        "vld2.s8    {d6-d7}, [%3]!      \n" // r1
-                        "vld2.s8    {d8-d9}, [%3]       \n"
-                        "vext.8     d9, d6, d8, #1      \n"
-                        
-                        "vdup.s8    d26, d22[3]         \n"
-                        "vdup.s8    d27, d22[4]         \n"
-                        "vdup.s8    d28, d22[5]         \n"
-                        "vmlal.s8   q2, d6, d26         \n" // k03
-                        "vmlal.s8   q2, d7, d27         \n" // k04
-                        "vmlal.s8   q2, d9, d28         \n" // k05
-
-                        "pld        [%4, #192]          \n"
-                        "vld2.s8    {d10-d11}, [%4]!    \n" // r2
-                        "vld2.s8    {d12-d13}, [%4]     \n"
-                        "vext.8     d13, d10, d12, #1   \n"
-
-                        "sub        %2, #8              \n"
-                        "sub        %3, #8              \n"
-                        "sub        %4, #8              \n"                         
-                        
-                        "vdup.s8    d26, d22[6]         \n"
-                        "vdup.s8    d27, d22[7]         \n"
-                        "vdup.s8    d28, d23[0]         \n"
-                        "vmlal.s8   q2, d10, d26        \n" // k06
-                        "vmlal.s8   q2, d11, d27        \n" // k07
-                        "vmlal.s8   q2, d13, d28        \n" // k08
-
-                        "pld        [%1, #128]          \n"
-                        "vld1.32    {d14-d15}, [%1]     \n" //sum0
-                        "vaddw.s16   q7, q7, d4         \n"
-                        "vst1.32    {d14-d15}, [%1]!    \n"
-                        : "=r"(nn),             // %0
-                          "=r"(outptr0),    // %1
-                          "=r"(r0),             // %2
-                          "=r"(r1),             // %3
-                          "=r"(r2)              // %4
-                        : "0"(nn),
-                          "1"(outptr0),
-                          "2"(r0),
-                          "3"(r1),
-                          "4"(r2)
-                        : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q12", "q13", "q14"
-                    );                    
-                }
+#else
+                    int s0[4],s1[4],s2[4],s3[4];
+                    int w0[4],w1[4];
+                    int d0[2],d1[2],d2[2],d3[2];
+                    int o0[2],o1[2];
+                    // load
+                    for (int n = 0; n < 4; n++)
+                    {
+                        s0[n] = out_tile[n];
+                        s1[n] = out_tile[n+ 4];
+                        s2[n] = out_tile[n+ 8];
+                        s3[n] = out_tile[n+12];
+                    }
+                    // w = A_T * W
+                    for (int n = 0; n < 4; n++)
+                    {
+                        w0[n] = s0[n] + s1[n] + s2[n];
+                        w1[n] = s1[n] - s2[n] + s3[n];
+                    }
+                    // transpose w to w_t
+                    {
+                        d0[0] = w0[0]; d0[1] = w1[0];
+                        d1[0] = w0[1]; d1[1] = w1[1];
+                        d2[0] = w0[2]; d2[1] = w1[2];
+                        d3[0] = w0[3]; d3[1] = w1[3];
+                    }
+                    // Y = A_T * w_t
+                    for (int n = 0; n < 2; n++)
+                    {
+                        o0[n] = d0[n] + d1[n] + d2[n];
+                        o1[n] = d1[n] - d2[n] + d3[n];
+                    }
+                    // write the 2x2 output tile; >> 2 undoes the scaling from the kernel transform, which used G' = G*2
+                    outRow0[0] = o0[0] >> 2;
+                    outRow0[1] = o0[1] >> 2;
+                    outRow1[0] = o1[0] >> 2;
+                    outRow1[1] = o1[1] >> 2;
 
-                for (; remain>0; remain--)
-                {
-                    int sum0 = 0;
-                    
-                    sum0 += (int)r0[0] * kernel0[0];
-                    sum0 += (int)r0[1] * kernel0[1];
-                    sum0 += (int)r0[2] * kernel0[2];
-                    sum0 += (int)r1[0] * kernel0[3];
-                    sum0 += (int)r1[1] * kernel0[4];
-                    sum0 += (int)r1[2] * kernel0[5];
-                    sum0 += (int)r2[0] * kernel0[6];
-                    sum0 += (int)r2[1] * kernel0[7];
-                    sum0 += (int)r2[2] * kernel0[8];
-                    
-                    *outptr0 += sum0;
+                    out_tile += 16;
 
-                    r0 += 2;
-                    r1 += 2;
-                    r2 += 2;
-                    outptr0++;
+                    outRow0 += 2;
+                    outRow1 += 2;
+#endif // __ARM_NEON
                 }
 
-                r0 += tailstep;
-                r1 += tailstep;
-                r2 += tailstep;
+                outRow0 += outw;
+                outRow1 += outw;
             }
-
-            kernel0 += 9;
-        }       
-    }   
+        }        
+    }
+    // END transform output 
+    
+    // cut result pad
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);  
 }
 
 static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
@@ -3366,7 +3776,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co
                         "sub        %3, #2          \n"
 
                         "vld1.s8    {d0[6]}, [%4]!  \n"
-                        "vld1.s8    {d0[7]}, [%4]!  \n"// d0(r00 r01 r02 r10 r11 r12 r22 r21)
+                        "vld1.s8    {d0[7]}, [%4]!  \n"// d0(r00 r01 r02 r10 r11 r12 r20 r21)
 
                         "vld1.s8    {d4[]}, [%4]    \n"// d4(r22 r22 r22 r22 r22 r22 r22 r22) 
                         "sub        %4, #2          \n"
@@ -3381,7 +3791,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co
                         "vld1.s8    {d5[]}, [%5]    \n"// d5(r32 r32 r32 r32 r32 r32 r32 r32)
                         "sub        %5, #2          \n"
 
-                        "veor       d3, d3          \n"// d3(00 00 00 00 00 00 00 00)
+                        "veor       d3, d1, d1      \n"// d3(00 00 00 00 00 00 00 00)
 
                         "vmull.s8   q8, d0, d2      \n"// sum0 = (r00 - r21) * (k00 - k21)
                         "vmull.s8   q9, d1, d2      \n"// sum1 = (r10 - r31) * (k00 - k21)
@@ -3404,7 +3814,7 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co
                         "vpadd.s32  d20, d20, d21   \n"
                         "vpadd.s32  d22, d22, d23   \n"
                         "vpadd.s32  d20, d20, d22   \n"
-                        "vpadd.s32  d6, d6, d20     \n"
+                        "vadd.s32   d6, d6, d20     \n"
 
                         "vst1.s32   {d6[0]}, [%0]!  \n"
                         "vst1.s32   {d6[1]}, [%1]!  \n"
@@ -3437,7 +3847,6 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co
                     sum0 += r1[2] * ktmp[5];
                     sum0 += r2[0] * ktmp[6];
                     sum0 += r2[1] * ktmp[7];
-
                     sum0 += r2[2] * ktmp[8];
 
                     sum0n += r1[0] * ktmp[0];
@@ -3448,7 +3857,6 @@ static void conv3x3s1_packed_int8_neon(const Mat &bottom_blob, Mat &top_blob, co
                     sum0n += r2[2] * ktmp[5];
                     sum0n += r3[0] * ktmp[6];
                     sum0n += r3[1] * ktmp[7];
-
                     sum0n += r3[2] * ktmp[8];
 
                     *outptr0 += sum0;
@@ -3705,7 +4113,7 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co
                     "vmovl.s8   q6, d12             \n"// q6(a02 a04 a06 a08 a010 a012 a014 a016) d13
 
                     "pld        [%8, #128]          \n"
-                    "vld1.s32   {d30-d31}, [%8]     \n"// out7  
+                    "vld1.s32   {d30-d31}, [%8]     \n"// out7
 
                     "vmlal.s16  q8, d8, d0[0]       \n"// sum0 += (a00 a02 a04 a06) * k00
                     "vmlal.s16  q9, d8, d0[1]       \n"// sum1 += (a00 a02 a04 a06) * k10
@@ -3723,7 +4131,7 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co
                     "vmlal.s16  q12, d10, d3[0]     \n"// sum4 += (a01-a07) * k41
                     "vmlal.s16  q13, d10, d3[1]     \n"// sum5 += (a01-a07) * k51
                     "vmlal.s16  q14, d10, d3[2]     \n"// sum6 += (a01-a07) * k61
-                    "vmlal.s16  q15, d10, d3[3]     \n"// sum7 += (a01-a07) * k71   
+                    "vmlal.s16  q15, d10, d3[3]     \n"// sum7 += (a01-a07) * k71
 
                     "pld        [%10, #64]         \n"
                     "vld2.s8    {d8-d9}, [%10]      \n"// d8(a10 a12 a14 a16 a18 a110 a112 a114), d9(a11 a13 a15 a17 a19 a111 a113 a115)
@@ -4293,3 +4701,25 @@ static void conv3x3s2_packed_int8_neon(const Mat& bottom_blob, Mat& top_blob, co
     }
 }
 #endif
+
+// int8 convolution, 3x3 kernel, stride 1: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 3; // passed as both kernel_w and kernel_h
+    const int step = 1;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
+
+// int8 convolution, 3x3 kernel, stride 2: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 3; // passed as both kernel_w and kernel_h
+    const int step = 2;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
diff --git a/src/layer/arm/convolution_5x5_int8.h b/src/layer/arm/convolution_5x5_int8.h
new file mode 100644
index 000000000..99abb8705
--- /dev/null
+++ b/src/layer/arm/convolution_5x5_int8.h
@@ -0,0 +1,35 @@
+// SenseNets is pleased to support the open source community by supporting ncnn.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// int8 convolution, 5x5 kernel, stride 1: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv5x5s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 5; // passed as both kernel_w and kernel_h
+    const int step = 1;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
+
+// int8 convolution, 5x5 kernel, stride 2: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv5x5s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 5; // passed as both kernel_w and kernel_h
+    const int step = 2;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
diff --git a/src/layer/arm/convolution_7x7_int8.h b/src/layer/arm/convolution_7x7_int8.h
new file mode 100644
index 000000000..d34f7a323
--- /dev/null
+++ b/src/layer/arm/convolution_7x7_int8.h
@@ -0,0 +1,35 @@
+// SenseNets is pleased to support the open source community by supporting ncnn.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// int8 convolution, 7x7 kernel, stride 1: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv7x7s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 7; // passed as both kernel_w and kernel_h
+    const int step = 1;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
+
+// int8 convolution, 7x7 kernel, stride 2: forwards to
+// conv_im2col_sgemm_int8_neon with the kernel extent and stride fixed.
+static void conv7x7s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    const int ksize = 7; // passed as both kernel_w and kernel_h
+    const int step = 2;  // passed as both stride_w and stride_h
+
+    conv_im2col_sgemm_int8_neon(bottom_blob, top_blob, _kernel,
+                                ksize, ksize, step, step, opt);
+}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index 928e302ad..14be7ae45 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -14,6 +14,8 @@
 
 #include "convolution_arm.h"
 
+#include "benchmark.h"
+
 namespace ncnn {
 
 #include "convolution_1x1.h"
@@ -24,8 +26,11 @@ namespace ncnn {
 #include "convolution_7x7.h"
 
 #if __ARM_NEON
+#include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
+#include "convolution_5x5_int8.h"
+#include "convolution_7x7_int8.h"
 #endif // __ARM_NEON
 
 DEFINE_LAYER_CREATOR(Convolution_arm)
@@ -66,9 +71,12 @@ int Convolution_arm::load_model(const ModelBin& mb)
 
     if (use_int8_inference)
     {
-#if __ARM_NEON
-#if !__aarch64__
-        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        if (use_winograd3x3)
+        {
+            int num_input = weight_data_size / 9 / num_output;
+            conv3x3s1_winograd23_transform_kernel_int8_neon(weight_data, weight_3x3_winograd23_int8_data, num_input, num_output);
+        }
+        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
             int num_input = weight_data_size / 9 / num_output;
             conv3x3s1_transform_kernel_int8_neon(weight_data, weight_3x3s1_int8_data, num_input, num_output);
@@ -78,16 +86,15 @@ int Convolution_arm::load_model(const ModelBin& mb)
         {
             int num_input = weight_data_size / 9 / num_output;
             conv3x3s2_transform_kernel_int8_neon(weight_data, weight_3x3s2_int8_data, num_input, num_output);
-        }   
+        }
 
         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
         {
             int num_input = weight_data_size / num_output;
             conv1x1s1_sgemm_transform_kernel_int8_neon(weight_data, weight_1x1s1_sgemm_int8_data, num_input, num_output);
             use_sgemm1x1 = true;
-        }        
-#endif // !__aarch64__
-#endif // __ARM_NEON
+        }
+        
         return 0;
     }
 
@@ -233,7 +240,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     }
 
     const int kernel_size = kernel_w;
-    const int stride = stride_w;
+    //const int stride = stride_w;
+    int stride = stride_w;
 
     if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h)
     {
@@ -293,43 +301,50 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 
 #if __ARM_NEON
     // kernel_size x stride
-    conv_int8_func conv_int8_func_table[5][5] =
+    conv_int8_func conv_int8_func_table[7][4] =
     {
         {
             conv1x1s1_int8_neon,
             conv1x1s2_int8_neon,
             0,
-            0,
             0
         }, // kernel_size = 1
         {
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 2
         {
             conv3x3s1_int8_neon,
             conv3x3s2_int8_neon,
             0,
-            0,
             0
         }, // kernel_size = 3
         {
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 4
         {
+            conv5x5s1_int8_neon,
+            conv5x5s2_int8_neon,
             0,
+            0
+        }, // kernel_size = 5
+        {
             0,
             0,
             0,
             0
-        }  // kernel_size = 5
+        }, // kernel_size = 6
+        {            
+            conv7x7s1_int8_neon,           
+            conv7x7s2_int8_neon,
+            0,
+            0
+        }  // kernel_size = 7                
     };
 #endif // __ARM_NEON
 
@@ -384,9 +399,9 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
             opt_g.blob_allocator = bottom_blob_int8.allocator;
 
             quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
-        }
+        }       
 
-        bottom_blob_unbordered = bottom_blob_int8;
+        bottom_blob_unbordered = bottom_blob_int8;       
     }
 
     Mat bottom_blob_bordered = bottom_blob_unbordered;
@@ -423,34 +438,90 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 
     if (use_int8_inference)
     {
-#if __ARM_NEON
-#if !__aarch64__
-        if (use_sgemm1x1)
-        {
-            conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
-        }
-        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+        if (use_int8_requantize == true)
         {
-            conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
+            Mat top_blob_tm;
+            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
+            if (top_blob_tm.empty())
+                return -100;
+            
+            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100; 
+
+            if (use_sgemm1x1)
+            {
+                conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob_tm, weight_1x1s1_sgemm_int8_data, opt);
+            }
+            else if (use_winograd3x3)
+            {
+                conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_int8_data, opt);
+            }
+            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+            {
+                conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s1_int8_data, opt);
+            }
+            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+            {
+                conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob_tm, weight_3x3s2_int8_data, opt);
+            }        
+            else
+            {
+                conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);
+            }
+
+            // requantize each output channel from top_blob_tm (4-byte elements) into top_blob (1-byte elements)
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
+            {
+                ncnn::Option opt_g = opt;
+                opt_g.num_threads = 1;
+                opt_g.blob_allocator = top_blob.allocator;
+
+                Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
+                Mat top_blob_g = top_blob.channel_range(p, 1);
+                requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
+            }          
         }
-        else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
-        {
-            conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
-        }        
         else
-#endif // !__aarch64__
-#endif // __ARM_NEON
         {
-            conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
-        }
+            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100; 
 
-        // dequantize, reverse scale inplace
-        {
-            ncnn::Option opt_g = opt;
-            opt_g.blob_allocator = top_blob.allocator;
+            if (use_sgemm1x1)
+            {
+                conv1x1s1_sgemm_int8_neon(bottom_blob_bordered, top_blob, weight_1x1s1_sgemm_int8_data, opt);
+            }
+            else if (use_winograd3x3)
+            {
+                conv3x3s1_winograd23_int8_neon(bottom_blob_bordered, top_blob, weight_3x3_winograd23_int8_data, opt);
+            }
+            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
+            {
+                conv3x3s1_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s1_int8_data, opt);
+            }
+            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
+            {
+                conv3x3s2_packed_int8_neon(bottom_blob_bordered, top_blob, weight_3x3s2_int8_data, opt);
+            }        
+            else
+            {
+                conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
+            }          
 
-            dequantize->forward_inplace(top_blob, opt_g);
-        }
+            // dequantize, reverse scale inplace
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
+            {
+                ncnn::Option opt_g = opt;
+                opt_g.num_threads = 1;
+                opt_g.blob_allocator = top_blob.allocator;
+
+                Mat top_blob_g = top_blob.channel_range(p, 1);
+                dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
+            }           
+        } 
 
         return 0;
     }
diff --git a/src/layer/arm/convolution_arm.h b/src/layer/arm/convolution_arm.h
index 2a3c55433..7de63bb2b 100644
--- a/src/layer/arm/convolution_arm.h
+++ b/src/layer/arm/convolution_arm.h
@@ -40,6 +40,8 @@ public:
     Mat weight_3x3s1_int8_data;
     Mat weight_3x3s2_int8_data;
     Mat weight_1x1s1_sgemm_int8_data;
+    Mat weight_3x3_winograd23_data;
+    std::vector<Mat> weight_3x3_winograd23_int8_data;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/convolution_sgemm_int8.h b/src/layer/arm/convolution_sgemm_int8.h
new file mode 100644
index 000000000..4ef2903f1
--- /dev/null
+++ b/src/layer/arm/convolution_sgemm_int8.h
@@ -0,0 +1,1598 @@
+// SenseNets is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv_im2col_sgemm_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \
+            const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+
+    // im2col
+    Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator);
+    {
+        const int stride = kernel_h*kernel_w*outw*outh;
+        signed char* ret = (signed char*)bottom_im2col;
+    
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<inch; p++)
+        {
+            const signed char* input = bottom_blob.channel(p);
+            int retID = stride * p;
+            for (int u=0; u<kernel_h; u++)
+            {
+                for (int v=0; v<kernel_w; v++)
+                {
+                    for (int i=0; i<outh; i++)
+                    {
+                        for (int j=0; j<outw; j++)
+                        {
+                            int row = u + i * stride_h;
+                            int col = v + j * stride_w;
+                            int index = row * w + col;
+                            ret[retID] = input[index];
+                            retID++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    int kernel_size = kernel_w * kernel_h;
+    int out_size = outw * outh;
+
+    // bottom_im2col memory packed 8 x 8
+    Mat bottom_tm(8*kernel_size, inch, out_size/8 + out_size%8, (size_t)1u, opt.workspace_allocator);
+    {
+        int nn_size = out_size >> 3;
+        int remain_size_start = nn_size << 3;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii=0; ii<nn_size; ii++)
+        {
+            int i = ii * 8;
+
+            const signed char* img0 = bottom_im2col.channel(0);
+            img0 += i;
+
+            signed char* tmpptr = bottom_tm.channel(i/8);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+#if __ARM_NEON
+#if __aarch64__
+                asm volatile(
+                    "prfm    pldl1keep, [%0, #64]    \n"
+                    "ld1     {v0.8b}, [%0]           \n"
+                    "st1     {v0.8b}, [%1]           \n"
+                    : "=r"(img0),   // %0
+                      "=r"(tmpptr)  // %1
+                    : "0"(img0),
+                      "1"(tmpptr)
+                    : "cc", "memory", "v0"
+                );                
+#else
+                asm volatile(
+                    "pld        [%0, #64]     \n"
+                    "vld1.s8   {d0}, [%0]     \n"
+                    "vst1.s8   {d0}, [%1]     \n"
+                    : "=r"(img0),   // %0
+                      "=r"(tmpptr)  // %1
+                    : "0"(img0),
+                      "1"(tmpptr)
+                    : "cc", "memory", "d0"
+                );
+#endif // __aarch64__                
+#else                
+                tmpptr[0] = img0[0];
+                tmpptr[1] = img0[1];
+                tmpptr[2] = img0[2];
+                tmpptr[3] = img0[3];
+                tmpptr[4] = img0[4];
+                tmpptr[5] = img0[5];
+                tmpptr[6] = img0[6];
+                tmpptr[7] = img0[7];
+#endif // __ARM_NEON              
+                tmpptr += 8;
+                img0 += out_size;
+            }
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i=remain_size_start; i<out_size; i++)
+        {
+            const signed char* img0 = bottom_im2col.channel(0);
+            img0 += i;
+
+            signed char* tmpptr = bottom_tm.channel(i/8 + i%8);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                tmpptr[0] = img0[0];
+
+                tmpptr += 1;
+                img0 += out_size;
+            }
+        }       
+    }
+    
+#if __aarch64__
+    // kernel memory packed 8 x 8
+    Mat kernel_tm(8*kernel_size, inch, outch/8 + (outch%8)/4 + outch%4, (size_t)1u, opt.workspace_allocator);
+    {
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+        nn_outch = outch >> 3;
+        remain_outch_start = nn_outch << 3;      
+        
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = pp * 8;
+
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+            const signed char* k1 = kernel + (p+1)*inch*kernel_size;
+            const signed char* k2 = kernel + (p+2)*inch*kernel_size;
+            const signed char* k3 = kernel + (p+3)*inch*kernel_size;
+            const signed char* k4 = kernel + (p+4)*inch*kernel_size;
+            const signed char* k5 = kernel + (p+5)*inch*kernel_size;
+            const signed char* k6 = kernel + (p+6)*inch*kernel_size;
+            const signed char* k7 = kernel + (p+7)*inch*kernel_size;                        
+
+            signed char* ktmp = kernel_tm.channel(p/8);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp[1] = k1[0];
+                ktmp[2] = k2[0];
+                ktmp[3] = k3[0];
+                ktmp[4] = k4[0];
+                ktmp[5] = k5[0];
+                ktmp[6] = k6[0];
+                ktmp[7] = k7[0];                
+                ktmp += 8;
+
+                k0 += 1;
+                k1 += 1;
+                k2 += 1;
+                k3 += 1;
+                k4 += 1;
+                k5 += 1;
+                k6 += 1;
+                k7 += 1;                
+            }            
+        }
+
+        nn_outch = (outch - remain_outch_start) >> 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = remain_outch_start + pp * 4;
+
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+            const signed char* k1 = kernel + (p+1)*inch*kernel_size;
+            const signed char* k2 = kernel + (p+2)*inch*kernel_size;
+            const signed char* k3 = kernel + (p+3)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/8 + (p%8)/4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp[1] = k1[0];
+                ktmp[2] = k2[0];
+                ktmp[3] = k3[0];
+                ktmp += 4;
+
+                k0 += 1;
+                k1 += 1;
+                k2 += 1;
+                k3 += 1;
+            }
+        }
+
+        remain_outch_start += nn_outch << 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=remain_outch_start; p<outch; p++)
+        {
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/8 + (p%8)/4 + p%4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp++;
+                k0++;
+            }
+        }
+    }
+#else
+    // kernel memory packed 4 x 8
+    Mat kernel_tm(4*kernel_size, inch, outch/4 + outch%4, (size_t)1u, opt.workspace_allocator);
+    {
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+        nn_outch = outch >> 2;
+        remain_outch_start = nn_outch << 2;      
+        
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = pp * 4;
+
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+            const signed char* k1 = kernel + (p+1)*inch*kernel_size;
+            const signed char* k2 = kernel + (p+2)*inch*kernel_size;
+            const signed char* k3 = kernel + (p+3)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp[1] = k1[0];
+                ktmp[2] = k2[0];
+                ktmp[3] = k3[0];
+                ktmp += 4;
+
+                k0 += 1;
+                k1 += 1;
+                k2 += 1;
+                k3 += 1;
+            }
+        }
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=remain_outch_start; p<outch; p++)
+        {
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/4 + p%4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp++;
+                k0++;
+            }
+        }
+    }
+#endif
+
+    // sgemm(int M, int N, int L, signed char* A, signed char* B, int* C)
+    {
+        //int M = outch;  // outch
+        int N = outw * outh; // outsize or out stride
+        int L = kernel_w * kernel_h * inch; // ksize * inch
+
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+#if __aarch64__
+        nn_outch = outch >> 3;
+        remain_outch_start = nn_outch << 3;
+#endif  
+
+#if __aarch64__
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int i = pp * 8;
+
+            int* output0 = top_blob.channel(i);
+            int* output1 = top_blob.channel(i+1);
+            int* output2 = top_blob.channel(i+2);
+            int* output3 = top_blob.channel(i+3);
+            int* output4 = top_blob.channel(i+4);
+            int* output5 = top_blob.channel(i+5);
+            int* output6 = top_blob.channel(i+6);
+            int* output7 = top_blob.channel(i+7);
+
+            int j=0;
+            for (; j+7<N; j=j+8)
+            {
+                signed char* vb = bottom_tm.channel(j/8);
+                signed char* va = kernel_tm.channel(i/8);
+#if __aarch64__
+                asm volatile(
+                    "eor    v16.16b, v16.16b, v16.16b    \n" // sum0
+                    "eor    v17.16b, v17.16b, v17.16b    \n" // sum0n
+                    "eor    v18.16b, v18.16b, v18.16b    \n" // sum1
+                    "eor    v19.16b, v19.16b, v19.16b    \n" // sum1n
+                    "eor    v20.16b, v20.16b, v20.16b    \n" // sum2
+                    "eor    v21.16b, v21.16b, v21.16b    \n" // sum2n
+                    "eor    v22.16b, v22.16b, v22.16b    \n" // sum3
+                    "eor    v23.16b, v23.16b, v23.16b    \n" // sum3n
+                    "eor    v24.16b, v24.16b, v24.16b    \n" // sum4
+                    "eor    v25.16b, v25.16b, v25.16b    \n" // sum4n
+                    "eor    v26.16b, v26.16b, v26.16b    \n" // sum5
+                    "eor    v27.16b, v27.16b, v27.16b    \n" // sum5n
+                    "eor    v28.16b, v28.16b, v28.16b    \n" // sum6
+                    "eor    v29.16b, v29.16b, v29.16b    \n" // sum6n
+                    "eor    v30.16b, v30.16b, v30.16b    \n" // sum7
+                    "eor    v31.16b, v31.16b, v31.16b    \n" // sum7n
+
+                    "lsr         w4, %w20, #3            \n"// r4 = nn = L >> 3
+                    "cmp         w4, #0                  \n"
+                    "beq         1f                      \n"
+
+                    "0:                                  \n"// for (; k+7<L; k=k+8)
+
+                    "prfm   pldl1keep, [%9, #128]                       \n"
+                    "ld1    {v0.8b, v1.8b, v2.8b, v3.8b}, [%9], #32     \n"
+                    "ld1    {v4.8b, v5.8b, v6.8b, v7.8b}, [%9], #32     \n"
+
+                    "prfm   pldl1keep, [%8, #128]                       \n"
+                    "ld1    {v8.8b, v9.8b, v10.8b, v11.8b}, [%8], #32   \n"
+                    "ld1    {v12.8b, v13.8b, v14.8b, v15.8b}, [%8], #32 \n"
+
+                    "sshll    v0.8h, v0.8b, #0           \n" // k00 - k70
+                    "sshll    v1.8h, v1.8b, #0           \n" // k01 - k71
+                    "sshll    v2.8h, v2.8b, #0           \n" // k02 - k72
+                    "sshll    v3.8h, v3.8b, #0           \n" // k03 - k73
+                    "sshll    v4.8h, v4.8b, #0           \n" // k04 - k74
+                    "sshll    v5.8h, v5.8b, #0           \n" // k05 - k75
+                    "sshll    v6.8h, v6.8b, #0           \n" // k06 - k76
+                    "sshll    v7.8h, v7.8b, #0           \n" // k07 - k77
+
+                    "sshll    v8.8h, v8.8b, #0           \n" // a00 - a70
+                    "sshll    v9.8h, v9.8b, #0           \n" // a01 - a71
+                    "sshll    v10.8h, v10.8b, #0         \n" // a02 - a72
+                    "sshll    v11.8h, v11.8b, #0         \n" // a03 - a73
+                    "sshll    v12.8h, v12.8b, #0         \n" // a04 - a74
+                    "sshll    v13.8h, v13.8b, #0         \n" // a05 - a75
+                    "sshll    v14.8h, v14.8b, #0         \n" // a06 - a76
+                    "sshll    v15.8h, v15.8b, #0         \n" // a07 - a77
+                    // k0
+                    "smlal    v16.4s, v8.4h, v0.h[0]     \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v8.8h, v0.h[0]     \n"//
+                    "smlal    v18.4s, v8.4h, v0.h[1]     \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v8.8h, v0.h[1]     \n"//
+                    "smlal    v20.4s, v8.4h, v0.h[2]     \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v8.8h, v0.h[2]     \n"//
+                    "smlal    v22.4s, v8.4h, v0.h[3]     \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v8.8h, v0.h[3]     \n"//
+                    "smlal    v24.4s, v8.4h, v0.h[4]     \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v8.8h, v0.h[4]     \n"//
+                    "smlal    v26.4s, v8.4h, v0.h[5]     \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v8.8h, v0.h[5]     \n"//
+                    "smlal    v28.4s, v8.4h, v0.h[6]     \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v8.8h, v0.h[6]     \n"//
+                    "smlal    v30.4s, v8.4h, v0.h[7]     \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v8.8h, v0.h[7]     \n"//
+                    // k1
+                    "smlal    v16.4s, v9.4h, v1.h[0]     \n"// sum0 += (a01-a71) * k01
+                    "smlal2   v17.4s, v9.8h, v1.h[0]     \n"//
+                    "smlal    v18.4s, v9.4h, v1.h[1]     \n"// sum1 += (a01-a71) * k11
+                    "smlal2   v19.4s, v9.8h, v1.h[1]     \n"//
+                    "smlal    v20.4s, v9.4h, v1.h[2]     \n"// sum2 += (a01-a71) * k21
+                    "smlal2   v21.4s, v9.8h, v1.h[2]     \n"//
+                    "smlal    v22.4s, v9.4h, v1.h[3]     \n"// sum3 += (a01-a71) * k31
+                    "smlal2   v23.4s, v9.8h, v1.h[3]     \n"//
+                    "smlal    v24.4s, v9.4h, v1.h[4]     \n"// sum4 += (a01-a71) * k41
+                    "smlal2   v25.4s, v9.8h, v1.h[4]     \n"//
+                    "smlal    v26.4s, v9.4h, v1.h[5]     \n"// sum5 += (a01-a71) * k51
+                    "smlal2   v27.4s, v9.8h, v1.h[5]     \n"//
+                    "smlal    v28.4s, v9.4h, v1.h[6]     \n"// sum6 += (a01-a71) * k61
+                    "smlal2   v29.4s, v9.8h, v1.h[6]     \n"//
+                    "smlal    v30.4s, v9.4h, v1.h[7]     \n"// sum7 += (a01-a71) * k71
+                    "smlal2   v31.4s, v9.8h, v1.h[7]     \n"//
+                    // k2
+                    "smlal    v16.4s, v10.4h, v2.h[0]    \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v10.8h, v2.h[0]    \n"//
+                    "smlal    v18.4s, v10.4h, v2.h[1]    \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v10.8h, v2.h[1]    \n"//
+                    "smlal    v20.4s, v10.4h, v2.h[2]    \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v10.8h, v2.h[2]    \n"//
+                    "smlal    v22.4s, v10.4h, v2.h[3]    \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v10.8h, v2.h[3]    \n"//
+                    "smlal    v24.4s, v10.4h, v2.h[4]    \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v10.8h, v2.h[4]    \n"//
+                    "smlal    v26.4s, v10.4h, v2.h[5]    \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v10.8h, v2.h[5]    \n"//
+                    "smlal    v28.4s, v10.4h, v2.h[6]    \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v10.8h, v2.h[6]    \n"//
+                    "smlal    v30.4s, v10.4h, v2.h[7]    \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v10.8h, v2.h[7]    \n"//
+                    // k3
+                    "smlal    v16.4s, v11.4h, v3.h[0]    \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v11.8h, v3.h[0]    \n"//
+                    "smlal    v18.4s, v11.4h, v3.h[1]    \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v11.8h, v3.h[1]    \n"//
+                    "smlal    v20.4s, v11.4h, v3.h[2]    \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v11.8h, v3.h[2]    \n"//
+                    "smlal    v22.4s, v11.4h, v3.h[3]    \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v11.8h, v3.h[3]    \n"//
+                    "smlal    v24.4s, v11.4h, v3.h[4]    \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v11.8h, v3.h[4]    \n"//
+                    "smlal    v26.4s, v11.4h, v3.h[5]    \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v11.8h, v3.h[5]    \n"//
+                    "smlal    v28.4s, v11.4h, v3.h[6]    \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v11.8h, v3.h[6]    \n"//
+                    "smlal    v30.4s, v11.4h, v3.h[7]    \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v11.8h, v3.h[7]    \n"//
+                    // k4
+                    "smlal    v16.4s, v12.4h, v4.h[0]    \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v12.8h, v4.h[0]    \n"//
+                    "smlal    v18.4s, v12.4h, v4.h[1]    \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v12.8h, v4.h[1]    \n"//
+                    "smlal    v20.4s, v12.4h, v4.h[2]    \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v12.8h, v4.h[2]    \n"//
+                    "smlal    v22.4s, v12.4h, v4.h[3]    \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v12.8h, v4.h[3]    \n"//
+                    "smlal    v24.4s, v12.4h, v4.h[4]    \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v12.8h, v4.h[4]    \n"//
+                    "smlal    v26.4s, v12.4h, v4.h[5]    \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v12.8h, v4.h[5]    \n"//
+                    "smlal    v28.4s, v12.4h, v4.h[6]    \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v12.8h, v4.h[6]    \n"//
+                    "smlal    v30.4s, v12.4h, v4.h[7]    \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v12.8h, v4.h[7]    \n"//
+                    // k5
+                    "smlal    v16.4s, v13.4h, v5.h[0]    \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v13.8h, v5.h[0]    \n"//
+                    "smlal    v18.4s, v13.4h, v5.h[1]    \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v13.8h, v5.h[1]    \n"//
+                    "smlal    v20.4s, v13.4h, v5.h[2]    \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v13.8h, v5.h[2]    \n"//
+                    "smlal    v22.4s, v13.4h, v5.h[3]    \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v13.8h, v5.h[3]    \n"//
+                    "smlal    v24.4s, v13.4h, v5.h[4]    \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v13.8h, v5.h[4]    \n"//
+                    "smlal    v26.4s, v13.4h, v5.h[5]    \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v13.8h, v5.h[5]    \n"//
+                    "smlal    v28.4s, v13.4h, v5.h[6]    \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v13.8h, v5.h[6]    \n"//
+                    "smlal    v30.4s, v13.4h, v5.h[7]    \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v13.8h, v5.h[7]    \n"//
+                    // k6
+                    "smlal    v16.4s, v14.4h, v6.h[0]    \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v14.8h, v6.h[0]    \n"//
+                    "smlal    v18.4s, v14.4h, v6.h[1]    \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v14.8h, v6.h[1]    \n"//
+                    "smlal    v20.4s, v14.4h, v6.h[2]    \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v14.8h, v6.h[2]    \n"//
+                    "smlal    v22.4s, v14.4h, v6.h[3]    \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v14.8h, v6.h[3]    \n"//
+                    "smlal    v24.4s, v14.4h, v6.h[4]    \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v14.8h, v6.h[4]    \n"//
+                    "smlal    v26.4s, v14.4h, v6.h[5]    \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v14.8h, v6.h[5]    \n"//
+                    "smlal    v28.4s, v14.4h, v6.h[6]    \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v14.8h, v6.h[6]    \n"//
+                    "smlal    v30.4s, v14.4h, v6.h[7]    \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v14.8h, v6.h[7]    \n"//
+                    // k7
+                    "smlal    v16.4s, v15.4h, v7.h[0]    \n"// sum0 += (a07-a77) * k07
+                    "smlal2   v17.4s, v15.8h, v7.h[0]    \n"//
+                    "smlal    v18.4s, v15.4h, v7.h[1]    \n"// sum1 += (a07-a77) * k17
+                    "smlal2   v19.4s, v15.8h, v7.h[1]    \n"//
+                    "smlal    v20.4s, v15.4h, v7.h[2]    \n"// sum2 += (a07-a77) * k27
+                    "smlal2   v21.4s, v15.8h, v7.h[2]    \n"//
+                    "smlal    v22.4s, v15.4h, v7.h[3]    \n"// sum3 += (a07-a77) * k37
+                    "smlal2   v23.4s, v15.8h, v7.h[3]    \n"//
+                    "smlal    v24.4s, v15.4h, v7.h[4]    \n"// sum4 += (a07-a77) * k47
+                    "smlal2   v25.4s, v15.8h, v7.h[4]    \n"//
+                    "smlal    v26.4s, v15.4h, v7.h[5]    \n"// sum5 += (a07-a77) * k57
+                    "smlal2   v27.4s, v15.8h, v7.h[5]    \n"//
+                    "smlal    v28.4s, v15.4h, v7.h[6]    \n"// sum6 += (a07-a77) * k67
+                    "smlal2   v29.4s, v15.8h, v7.h[6]    \n"//
+                    "smlal    v30.4s, v15.4h, v7.h[7]    \n"// sum7 += (a07-a77) * k77
+                    "smlal2   v31.4s, v15.8h, v7.h[7]    \n"//
+
+                    "subs   w4, w4, #1                   \n"
+                    "bne    0b                           \n"
+
+                    "1:                                  \n"
+
+                    // remain loop
+                    "and    w4, %w20, #7                 \n"// w4 = remain = L & 7;
+                    "cmp    w4, #0                       \n"
+                    "beq    3f                           \n"
+
+                    "2:                                  \n"
+
+                    "prfm   pldl1keep, [%9, #128]        \n"
+                    "ld1    {v0.8b}, [%9], #8            \n"
+
+                    "prfm   pldl1keep, [%8, #128]        \n"
+                    "ld1    {v8.8b}, [%8], #8            \n"
+
+                    "sshll    v0.8h, v0.8b, #0           \n" // k00 - k70
+                    "sshll    v8.8h, v8.8b, #0           \n" // a00 - a70
+
+                    // k0
+                    "smlal    v16.4s, v8.4h, v0.h[0]     \n"// sum0 += (a00-a70) * k00
+                    "smlal2   v17.4s, v8.8h, v0.h[0]     \n"//
+                    "smlal    v18.4s, v8.4h, v0.h[1]     \n"// sum1 += (a00-a70) * k10
+                    "smlal2   v19.4s, v8.8h, v0.h[1]     \n"//
+                    "smlal    v20.4s, v8.4h, v0.h[2]     \n"// sum2 += (a00-a70) * k20
+                    "smlal2   v21.4s, v8.8h, v0.h[2]     \n"//
+                    "smlal    v22.4s, v8.4h, v0.h[3]     \n"// sum3 += (a00-a70) * k30
+                    "smlal2   v23.4s, v8.8h, v0.h[3]     \n"//
+                    "smlal    v24.4s, v8.4h, v0.h[4]     \n"// sum4 += (a00-a70) * k40
+                    "smlal2   v25.4s, v8.8h, v0.h[4]     \n"//
+                    "smlal    v26.4s, v8.4h, v0.h[5]     \n"// sum5 += (a00-a70) * k50
+                    "smlal2   v27.4s, v8.8h, v0.h[5]     \n"//
+                    "smlal    v28.4s, v8.4h, v0.h[6]     \n"// sum6 += (a00-a70) * k60
+                    "smlal2   v29.4s, v8.8h, v0.h[6]     \n"//
+                    "smlal    v30.4s, v8.4h, v0.h[7]     \n"// sum7 += (a00-a70) * k70
+                    "smlal2   v31.4s, v8.8h, v0.h[7]     \n"//
+
+                    "subs   w4, w4, #1                   \n"
+
+                    "bne    2b                           \n"
+
+                    "3:                                  \n"
+
+                    "st1    {v16.4s, v17.4s}, [%0]       \n"
+                    "st1    {v18.4s, v19.4s}, [%1]       \n"
+                    "st1    {v20.4s, v21.4s}, [%2]       \n"
+                    "st1    {v22.4s, v23.4s}, [%3]       \n"
+                    "st1    {v24.4s, v25.4s}, [%4]       \n"
+                    "st1    {v26.4s, v27.4s}, [%5]       \n"
+                    "st1    {v28.4s, v29.4s}, [%6]       \n"
+                    "st1    {v30.4s, v31.4s}, [%7]       \n"
+                    
+                    : "=r"(output0), // %0
+                      "=r"(output1), // %1
+                      "=r"(output2), // %2
+                      "=r"(output3), // %3
+                      "=r"(output4), // %4
+                      "=r"(output5), // %5
+                      "=r"(output6), // %6
+                      "=r"(output7), // %7
+                      "=r"(vb),      // %8
+                      "=r"(va)       // %9
+                    : "0"(output0),
+                      "1"(output1),
+                      "2"(output2),
+                      "3"(output3),
+                      "4"(output4),
+                      "5"(output5),
+                      "6"(output6),
+                      "7"(output7),
+                      "8"(vb),
+                      "9"(va),
+                      "r"(L)         // %20 
+                    : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31"
+                );                     
+#else                
+                int sum0[8] = {0};
+                int sum1[8] = {0};
+                int sum2[8] = {0};
+                int sum3[8] = {0};
+                int sum4[8] = {0};
+                int sum5[8] = {0};
+                int sum6[8] = {0};
+                int sum7[8] = {0};
+
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                        sum4[n] += (int)va[4] * vb[n];
+                        sum5[n] += (int)va[5] * vb[n];
+                        sum6[n] += (int)va[6] * vb[n];
+                        sum7[n] += (int)va[7] * vb[n];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+8];
+                        sum1[n] += (int)va[1] * vb[n+8];
+                        sum2[n] += (int)va[2] * vb[n+8];
+                        sum3[n] += (int)va[3] * vb[n+8];
+                        sum4[n] += (int)va[4] * vb[n+8];
+                        sum5[n] += (int)va[5] * vb[n+8];
+                        sum6[n] += (int)va[6] * vb[n+8];
+                        sum7[n] += (int)va[7] * vb[n+8];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+16];
+                        sum1[n] += (int)va[1] * vb[n+16];
+                        sum2[n] += (int)va[2] * vb[n+16];
+                        sum3[n] += (int)va[3] * vb[n+16];
+                        sum4[n] += (int)va[4] * vb[n+16];
+                        sum5[n] += (int)va[5] * vb[n+16];
+                        sum6[n] += (int)va[6] * vb[n+16];
+                        sum7[n] += (int)va[7] * vb[n+16];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+24];
+                        sum1[n] += (int)va[1] * vb[n+24];
+                        sum2[n] += (int)va[2] * vb[n+24];
+                        sum3[n] += (int)va[3] * vb[n+24];
+                        sum4[n] += (int)va[4] * vb[n+24];
+                        sum5[n] += (int)va[5] * vb[n+24];
+                        sum6[n] += (int)va[6] * vb[n+24];
+                        sum7[n] += (int)va[7] * vb[n+24];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+32];
+                        sum1[n] += (int)va[1] * vb[n+32];
+                        sum2[n] += (int)va[2] * vb[n+32];
+                        sum3[n] += (int)va[3] * vb[n+32];
+                        sum4[n] += (int)va[4] * vb[n+32];
+                        sum5[n] += (int)va[5] * vb[n+32];
+                        sum6[n] += (int)va[6] * vb[n+32];
+                        sum7[n] += (int)va[7] * vb[n+32];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+40];
+                        sum1[n] += (int)va[1] * vb[n+40];
+                        sum2[n] += (int)va[2] * vb[n+40];
+                        sum3[n] += (int)va[3] * vb[n+40];
+                        sum4[n] += (int)va[4] * vb[n+40];
+                        sum5[n] += (int)va[5] * vb[n+40];
+                        sum6[n] += (int)va[6] * vb[n+40];
+                        sum7[n] += (int)va[7] * vb[n+40];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+48];
+                        sum1[n] += (int)va[1] * vb[n+48];
+                        sum2[n] += (int)va[2] * vb[n+48];
+                        sum3[n] += (int)va[3] * vb[n+48];
+                        sum4[n] += (int)va[4] * vb[n+48];
+                        sum5[n] += (int)va[5] * vb[n+48];
+                        sum6[n] += (int)va[6] * vb[n+48];
+                        sum7[n] += (int)va[7] * vb[n+48];
+                        va += 8;
+
+                        sum0[n] += (int)va[0] * vb[n+56];
+                        sum1[n] += (int)va[1] * vb[n+56];
+                        sum2[n] += (int)va[2] * vb[n+56];
+                        sum3[n] += (int)va[3] * vb[n+56];
+                        sum4[n] += (int)va[4] * vb[n+56];
+                        sum5[n] += (int)va[5] * vb[n+56];
+                        sum6[n] += (int)va[6] * vb[n+56];
+                        sum7[n] += (int)va[7] * vb[n+56];                        
+                        va -= 56;
+                    }
+
+                    va += 64;
+                    vb += 64;
+                }
+
+                for (; k<L; k++)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                        sum4[n] += (int)va[4] * vb[n];
+                        sum5[n] += (int)va[5] * vb[n];
+                        sum6[n] += (int)va[6] * vb[n];
+                        sum7[n] += (int)va[7] * vb[n];
+                    }
+                    
+                    va += 8;
+                    vb += 8;
+                }
+
+                for (int n=0; n<8; n++)
+                {
+                    output0[n] = sum0[n];
+                    output1[n] = sum1[n];
+                    output2[n] = sum2[n];
+                    output3[n] = sum3[n];
+                    output4[n] = sum4[n];
+                    output5[n] = sum5[n];
+                    output6[n] = sum6[n];
+                    output7[n] = sum7[n];
+                }
+#endif
+                output0 += 8;
+                output1 += 8;
+                output2 += 8;
+                output3 += 8;
+                output4 += 8;
+                output5 += 8;
+                output6 += 8;
+                output7 += 8;
+            }
+
+            for (; j<N; j++)
+            {
+                signed char* vb = bottom_tm.channel(j/8 + j%8);
+                signed char* va = kernel_tm.channel(i/8);
+
+                int sum0 = 0;
+                int sum1 = 0;
+                int sum2 = 0;
+                int sum3 = 0;
+                int sum4 = 0;
+                int sum5 = 0;
+                int sum6 = 0;
+                int sum7 = 0;
+
+                for (int k=0; k<L; k++)
+                {
+                    sum0 += (int)va[0] * vb[0];
+                    sum1 += (int)va[1] * vb[0];
+                    sum2 += (int)va[2] * vb[0];
+                    sum3 += (int)va[3] * vb[0];
+                    sum4 += (int)va[4] * vb[0];
+                    sum5 += (int)va[5] * vb[0];
+                    sum6 += (int)va[6] * vb[0];
+                    sum7 += (int)va[7] * vb[0];
+
+                    va += 8;
+                    vb += 1;
+                }
+                
+                output0[0] = sum0;
+                output1[0] = sum1;
+                output2[0] = sum2;
+                output3[0] = sum3;
+                output4[0] = sum4;
+                output5[0] = sum5;
+                output6[0] = sum6;
+                output7[0] = sum7;               
+
+                output0++;
+                output1++;
+                output2++;
+                output3++;
+                output4++;
+                output5++;
+                output6++;
+                output7++;
+            }
+        }
+#endif // __aarch64__
+
+        nn_outch = (outch - remain_outch_start) >> 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int i = remain_outch_start + pp * 4;
+
+            int* output0 = top_blob.channel(i);
+            int* output1 = top_blob.channel(i+1);
+            int* output2 = top_blob.channel(i+2);
+            int* output3 = top_blob.channel(i+3);
+
+            int j=0;
+            for (; j+7<N; j=j+8)
+            {
+                signed char* vb = bottom_tm.channel(j/8);
+#if __aarch64__
+                signed char* va = kernel_tm.channel(i/8 + (i%8)/4);
+#else                
+                signed char* va = kernel_tm.channel(i/4);
+#endif
+
+#if __ARM_NEON
+#if __aarch64__
+                int32x4_t _sum0 = vdupq_n_s32(0);
+                int32x4_t _sum0n = vdupq_n_s32(0);
+                int32x4_t _sum1 = vdupq_n_s32(0);
+                int32x4_t _sum1n = vdupq_n_s32(0);
+                int32x4_t _sum2 = vdupq_n_s32(0);
+                int32x4_t _sum2n = vdupq_n_s32(0);
+                int32x4_t _sum3 = vdupq_n_s32(0);
+                int32x4_t _sum3n = vdupq_n_s32(0);
+
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    int8x8_t _vacc0_s8 = vld1_s8(va);
+                    int8x8_t _vacc1_s8 = vld1_s8(va+8);
+                    int8x8_t _vacc2_s8 = vld1_s8(va+16);
+                    int8x8_t _vacc3_s8 = vld1_s8(va+24);
+                    int16x8_t _vacc0 = vmovl_s8(_vacc0_s8);
+                    int16x8_t _vacc1 = vmovl_s8(_vacc1_s8);
+                    int16x8_t _vacc2 = vmovl_s8(_vacc2_s8);
+                    int16x8_t _vacc3 = vmovl_s8(_vacc3_s8);
+
+                    // k=0
+                    int8x8_t _vb_s8 = vld1_s8(vb);
+                    int16x8_t _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_low_s16(_vacc0), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_low_s16(_vacc0), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_low_s16(_vacc0), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_low_s16(_vacc0), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_low_s16(_vacc0), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_low_s16(_vacc0), 3);
+
+                    // k=1
+                    _vb_s8 = vld1_s8(vb+8);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc0), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_high_s16(_vacc0), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_high_s16(_vacc0), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_high_s16(_vacc0), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_high_s16(_vacc0), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_high_s16(_vacc0), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_high_s16(_vacc0), 3);
+
+                    // k=2
+                    _vb_s8 = vld1_s8(vb+16);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc1), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc1), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_low_s16(_vacc1), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_low_s16(_vacc1), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_low_s16(_vacc1), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_low_s16(_vacc1), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_low_s16(_vacc1), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_low_s16(_vacc1), 3);
+
+                    // k=3
+                    _vb_s8 = vld1_s8(vb+24);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc1), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc1), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_high_s16(_vacc1), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_high_s16(_vacc1), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_high_s16(_vacc1), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_high_s16(_vacc1), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_high_s16(_vacc1), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_high_s16(_vacc1), 3);
+
+                    // k=4
+                    _vb_s8 = vld1_s8(vb+32);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc2), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc2), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_low_s16(_vacc2), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_low_s16(_vacc2), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_low_s16(_vacc2), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_low_s16(_vacc2), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_low_s16(_vacc2), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_low_s16(_vacc2), 3);
+
+                    // k=5
+                    _vb_s8 = vld1_s8(vb+40);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc2), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc2), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_high_s16(_vacc2), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_high_s16(_vacc2), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_high_s16(_vacc2), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_high_s16(_vacc2), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_high_s16(_vacc2), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_high_s16(_vacc2), 3);
+
+                    // k=6
+                    _vb_s8 = vld1_s8(vb+48);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc3), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc3), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_low_s16(_vacc3), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_low_s16(_vacc3), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_low_s16(_vacc3), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_low_s16(_vacc3), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_low_s16(_vacc3), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_low_s16(_vacc3), 3);
+
+                    // k=7
+                    _vb_s8 = vld1_s8(vb+56);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc3), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc3), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_high_s16(_vacc3), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_high_s16(_vacc3), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_high_s16(_vacc3), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_high_s16(_vacc3), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_high_s16(_vacc3), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_high_s16(_vacc3), 3);
+
+                    va += 32;
+                    vb += 64;
+                }
+
+                for (; k<L; k++)
+                {
+                    int8x8_t _vacc0_s8 = vld1_s8(va);
+                    int16x8_t _vacc0 = vmovl_s8(_vacc0_s8);
+
+                    // k=0
+                    int8x8_t _vb_s8 = vld1_s8(vb);
+                    int16x8_t _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum1 = vmlal_lane_s16(_sum1, vget_low_s16(_vb), vget_low_s16(_vacc0), 1);
+                    _sum1n = vmlal_lane_s16(_sum1n, vget_high_s16(_vb), vget_low_s16(_vacc0), 1);
+                    _sum2 = vmlal_lane_s16(_sum2, vget_low_s16(_vb), vget_low_s16(_vacc0), 2);
+                    _sum2n = vmlal_lane_s16(_sum2n, vget_high_s16(_vb), vget_low_s16(_vacc0), 2);
+                    _sum3 = vmlal_lane_s16(_sum3, vget_low_s16(_vb), vget_low_s16(_vacc0), 3);
+                    _sum3n = vmlal_lane_s16(_sum3n, vget_high_s16(_vb), vget_low_s16(_vacc0), 3);
+
+                    va += 4;
+                    vb += 8;
+                }
+
+                vst1q_s32(output0, _sum0);
+                vst1q_s32(output0+4, _sum0n);
+                vst1q_s32(output1, _sum1);
+                vst1q_s32(output1+4, _sum1n);
+                vst1q_s32(output2, _sum2);
+                vst1q_s32(output2+4, _sum2n);
+                vst1q_s32(output3, _sum3);
+                vst1q_s32(output3+4, _sum3n); 
+#else
+                asm volatile(
+                    // K loop
+                    "vmov.s32    q8, #0             \n"
+                    "vmov.s32    q9, #0             \n"
+                    "vmov.s32    q10, #0            \n"
+                    "vmov.s32    q11, #0            \n"
+                    "vmov.s32    q12, #0            \n"
+                    "vmov.s32    q13, #0            \n"
+                    "vmov.s32    q14, #0            \n"
+                    "vmov.s32    q15, #0            \n"
+
+                    "lsr         r4, %12, #3        \n"// r4 = nn = L >> 3
+                    "cmp         r4, #0             \n"
+                    "beq         1f                 \n"
+                    
+                    "0:                             \n"// for(; nn != 0; nn--)
+                    "pld         [%4, #128]         \n"
+                    "vld1.s8     {d8-d11}, [%4]!    \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37    a(inch)(data)
+                    "vmovl.s8    q7, d11            \n"// a30-a37
+                    "vmovl.s8    q6, d10            \n"// a20-a27                    
+                    "vmovl.s8    q5, d9             \n"// a10-a17
+                    "vmovl.s8    q4, d8             \n"// a00-a07
+
+                    "pld         [%5, #128]         \n"
+                    "vld1.s8     {d0-d3}, [%5]!     \n"// kptr k00-k30,k01-k31, k02-k32,k03-k33, k04-k34,k05-k35, k06-k36,k07-k37    k(outch)(inch)
+                    "vmovl.s8    q3, d3             \n"// k06-k36,k07-k37
+                    "vmovl.s8    q2, d2             \n"// k04-k34,k05-k35
+                    "vmovl.s8    q1, d1             \n"// k02-k32,k03-k33
+                    "vmovl.s8    q0, d0             \n"// k00-k30,k01-k31
+
+                    "vmlal.s16   q8, d8, d0[0]      \n"// sum0 = (a00-a07) * k00
+                    "vmlal.s16   q9, d9, d0[0]      \n"
+                    "vmlal.s16   q10, d8, d0[1]     \n"// sum1 = (a00-a07) * k10
+                    "vmlal.s16   q11, d9, d0[1]     \n"
+                    "vmlal.s16   q12, d8, d0[2]     \n"// sum2 = (a00-a07) * k20
+                    "vmlal.s16   q13, d9, d0[2]     \n"
+                    "vmlal.s16   q14, d8, d0[3]     \n"// sum3 = (a00-a07) * k30
+                    "vmlal.s16   q15, d9, d0[3]     \n"                  
+
+                    "vmlal.s16   q8, d10, d1[0]     \n"// sum0 += (a10-a17) * k01
+                    "vmlal.s16   q9, d11, d1[0]     \n"
+                    "vmlal.s16   q10, d10, d1[1]    \n"// sum1 += (a10-a17) * k11
+                    "vmlal.s16   q11, d11, d1[1]    \n"
+                    "vmlal.s16   q12, d10, d1[2]    \n"// sum2 += (a10-a17) * k21
+                    "vmlal.s16   q13, d11, d1[2]    \n"
+                    "vmlal.s16   q14, d10, d1[3]    \n"// sum3 += (a10-a17) * k31
+                    "vmlal.s16   q15, d11, d1[3]    \n"
+
+                    "pld         [%4, #128]         \n"
+                    "vld1.s8     {d8-d9}, [%4]!     \n"// tmpr a40-a47,a50-a57    a(inch)(data)
+                    "vmovl.s8    q5, d9             \n"// a50-a57
+                    "vmovl.s8    q4, d8             \n"// a40-a47
+
+                    "vmlal.s16   q8, d12, d2[0]     \n"// sum0 += (a20-a27) * k02
+                    "vmlal.s16   q9, d13, d2[0]     \n"
+                    "vmlal.s16   q10, d12, d2[1]    \n"// sum1 += (a20-a27) * k12
+                    "vmlal.s16   q11, d13, d2[1]    \n"
+                    "vmlal.s16   q12, d12, d2[2]    \n"// sum2 += (a20-a27) * k22
+                    "vmlal.s16   q13, d13, d2[2]    \n"
+                    "vmlal.s16   q14, d12, d2[3]    \n"// sum3 += (a20-a27) * k32
+                    "vmlal.s16   q15, d13, d2[3]    \n"                      
+
+                    "vmlal.s16   q8, d14, d3[0]     \n"// sum0 += (a30-a37) * k03
+                    "vmlal.s16   q9, d15, d3[0]     \n"
+                    "vmlal.s16   q10, d14, d3[1]    \n"// sum1 += (a30-a37) * k13
+                    "vmlal.s16   q11, d15, d3[1]    \n"
+                    "vmlal.s16   q12, d14, d3[2]    \n"// sum2 += (a30-a37) * k23
+                    "vmlal.s16   q13, d15, d3[2]    \n"
+                    "vmlal.s16   q14, d14, d3[3]    \n"// sum3 += (a30-a37) * k33
+                    "vmlal.s16   q15, d15, d3[3]    \n"
+
+                    "pld         [%4, #128]         \n"
+                    "vld1.s8     {d0-d1}, [%4]!     \n"// tmpr a60-a67,a70-a77    a(inch)(data)
+                    "vmovl.s8    q1, d1             \n"// a70-a77
+                    "vmovl.s8    q0, d0             \n"// a60-a67
+
+                    "vmlal.s16   q8, d8, d4[0]      \n"// sum0 += (a40-a47) * k04
+                    "vmlal.s16   q9, d9, d4[0]      \n"
+                    "vmlal.s16   q10, d8, d4[1]     \n"// sum1 += (a40-a47) * k14
+                    "vmlal.s16   q11, d9, d4[1]     \n"
+                    "vmlal.s16   q12, d8, d4[2]     \n"// sum2 += (a40-a47) * k24
+                    "vmlal.s16   q13, d9, d4[2]     \n"
+                    "vmlal.s16   q14, d8, d4[3]     \n"// sum3 += (a40-a47) * k34
+                    "vmlal.s16   q15, d9, d4[3]     \n"                     
+
+                    "vmlal.s16   q8, d10, d5[0]     \n"// sum0 += (a50-a57) * k05
+                    "vmlal.s16   q9, d11, d5[0]     \n"
+                    "vmlal.s16   q10, d10, d5[1]    \n"// sum1 += (a50-a57) * k15
+                    "vmlal.s16   q11, d11, d5[1]    \n"
+                    "vmlal.s16   q12, d10, d5[2]    \n"// sum2 += (a50-a57) * k25
+                    "vmlal.s16   q13, d11, d5[2]    \n"
+                    "vmlal.s16   q14, d10, d5[3]    \n"// sum3 += (a50-a57) * k35
+                    "vmlal.s16   q15, d11, d5[3]    \n"                  
+
+                    "vmlal.s16   q8, d0, d6[0]      \n"// sum0 += (a60-a67) * k06
+                    "vmlal.s16   q9, d1, d6[0]      \n"
+                    "vmlal.s16   q10, d0, d6[1]     \n"// sum1 += (a60-a67) * k16
+                    "vmlal.s16   q11, d1, d6[1]     \n"
+                    "vmlal.s16   q12, d0, d6[2]     \n"// sum2 += (a60-a67) * k26
+                    "vmlal.s16   q13, d1, d6[2]     \n"
+                    "vmlal.s16   q14, d0, d6[3]     \n"// sum3 += (a60-a67) * k36
+                    "vmlal.s16   q15, d1, d6[3]     \n"                      
+
+                    "vmlal.s16   q8, d2, d7[0]      \n"// sum0 += (a70-a77) * k07
+                    "vmlal.s16   q9, d3, d7[0]      \n"
+                    "vmlal.s16   q10, d2, d7[1]     \n"// sum1 += (a70-a77) * k17
+                    "vmlal.s16   q11, d3, d7[1]     \n"
+                    "vmlal.s16   q12, d2, d7[2]     \n"// sum2 += (a70-a77) * k27
+                    "vmlal.s16   q13, d3, d7[2]     \n"
+                    "vmlal.s16   q14, d2, d7[3]     \n"// sum3 += (a70-a77) * k37
+                    "vmlal.s16   q15, d3, d7[3]     \n"                                        
+
+                    "subs        r4, r4, #1         \n"
+                    "bne         0b                 \n"// end for
+
+                    "1:                             \n"
+                    // remain loop
+                    "and         r4, %12, #7        \n"// r4 = remain = L & 7
+                    "cmp         r4, #0             \n"
+                    "beq         3f                 \n"
+
+                    "2:                             \n"// for(; remain != 0; remain--)
+                    "vld1.s8     {d2}, [%4]!        \n"// tmpr a00-a70    a(inch)(data)
+                    "vld1.s8     {d0}, [%5]         \n"// kptr k00-k30    k(outch)(inch)
+                    "vmovl.s8    q1, d2             \n"
+                    "vmovl.s8    q0, d0             \n"
+                    "add         %5, #4             \n"
+
+                    "vmlal.s16   q8, d2, d0[0]      \n"// sum0 += (a00-a70) * k00
+                    "vmlal.s16   q9, d3, d0[0]      \n"
+                    "vmlal.s16   q10, d2, d0[1]     \n"// sum1 += (a00-a70) * k10
+                    "vmlal.s16   q11, d3, d0[1]     \n"
+                    "vmlal.s16   q12, d2, d0[2]     \n"// sum2 += (a00-a70) * k20
+                    "vmlal.s16   q13, d3, d0[2]     \n"
+                    "vmlal.s16   q14, d2, d0[3]     \n"// sum3 += (a00-a70) * k30
+                    "vmlal.s16   q15, d3, d0[3]     \n"    
+
+                    "subs        r4, r4, #1         \n"
+                    "bne         2b                 \n"
+
+                    "3:                             \n"// store the result to memory
+                    "vst1.s32    {d16-d19}, [%0]    \n"
+                    "vst1.s32    {d20-d23}, [%1]    \n"
+                    "vst1.s32    {d24-d27}, [%2]    \n"
+                    "vst1.s32    {d28-d31}, [%3]    \n"
+
+                    : "=r"(output0), // %0
+                      "=r"(output1), // %1
+                      "=r"(output2), // %2
+                      "=r"(output3), // %3
+                      "=r"(vb),      // %4
+                      "=r"(va)       // %5
+                    : "0"(output0),
+                      "1"(output1),
+                      "2"(output2),
+                      "3"(output3),
+                      "4"(vb),
+                      "5"(va),
+                      "r"(L)         // %12  
+                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"                    
+                );
+#endif // __aarch64__                                
+#else
+                int sum0[8] = {0};
+                int sum1[8] = {0};
+                int sum2[8] = {0};
+                int sum3[8] = {0};
+               
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+8];
+                        sum1[n] += (int)va[1] * vb[n+8];
+                        sum2[n] += (int)va[2] * vb[n+8];
+                        sum3[n] += (int)va[3] * vb[n+8];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+16];
+                        sum1[n] += (int)va[1] * vb[n+16];
+                        sum2[n] += (int)va[2] * vb[n+16];
+                        sum3[n] += (int)va[3] * vb[n+16];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+24];
+                        sum1[n] += (int)va[1] * vb[n+24];
+                        sum2[n] += (int)va[2] * vb[n+24];
+                        sum3[n] += (int)va[3] * vb[n+24];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+32];
+                        sum1[n] += (int)va[1] * vb[n+32];
+                        sum2[n] += (int)va[2] * vb[n+32];
+                        sum3[n] += (int)va[3] * vb[n+32];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+40];
+                        sum1[n] += (int)va[1] * vb[n+40];
+                        sum2[n] += (int)va[2] * vb[n+40];
+                        sum3[n] += (int)va[3] * vb[n+40];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+48];
+                        sum1[n] += (int)va[1] * vb[n+48];
+                        sum2[n] += (int)va[2] * vb[n+48];
+                        sum3[n] += (int)va[3] * vb[n+48];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+56];
+                        sum1[n] += (int)va[1] * vb[n+56];
+                        sum2[n] += (int)va[2] * vb[n+56];
+                        sum3[n] += (int)va[3] * vb[n+56];
+                        va -= 28;
+                    }
+
+                    va += 32;
+                    vb += 64;
+                }
+
+                for (; k<L; k++)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                    }
+                    
+                    va += 4;
+                    vb += 8;
+                }
+
+                for (int n=0; n<8; n++)
+                {
+                    output0[n] = sum0[n];
+                    output1[n] = sum1[n];
+                    output2[n] = sum2[n];
+                    output3[n] = sum3[n];
+                }
+#endif // __ARM_NEON
+                output0 += 8;
+                output1 += 8;
+                output2 += 8;
+                output3 += 8;
+            }
+
+            for (; j<N; j++)
+            {                
+                signed char* vb = bottom_tm.channel(j/8 + j%8);
+#if __aarch64__
+                signed char* va = kernel_tm.channel(i/8 + (i%8)/4);
+#else                
+                signed char* va = kernel_tm.channel(i/4);
+#endif
+
+#if __ARM_NEON
+#if __aarch64__
+                int sum0 = 0;
+                int sum1 = 0;
+                int sum2 = 0;
+                int sum3 = 0;
+
+                for (int k=0; k<L; k++)
+                {
+                    sum0 += (int)va[0] * vb[0];
+                    sum1 += (int)va[1] * vb[0];
+                    sum2 += (int)va[2] * vb[0];
+                    sum3 += (int)va[3] * vb[0];
+
+                    va += 4;
+                    vb += 1;
+                }
+                
+                output0[0] = sum0;
+                output1[0] = sum1;
+                output2[0] = sum2;
+                output3[0] = sum3;
+#else
+                asm volatile(
+                    // inch loop
+                    "veor        q6, q6, q6        \n"
+                    "veor        q7, q7, q7        \n"
+                    "veor        q8, q8, q8        \n"
+                    "veor        q9, q9, q9        \n"
+                    "veor        q10, q10, q10     \n"
+                    "veor        q11, q11, q11     \n"
+                    "veor        q12, q12, q12     \n"
+                    "veor        q13, q13, q13     \n"                    
+                    "vmov.s32    q14, #0           \n"
+
+                    "lsr         r4, %12, #3       \n"// r4 = nn = L >> 3
+                    "cmp         r4, #0            \n"
+                    "beq         1f                \n"
+                    
+                    "0:                            \n"// for(; nn != 0; nn--)
+                    "pld         [%4, #128]        \n"
+                    "vld1.s8     {d0}, [%4]!       \n"// tmpr a00,a10,a20,a30    a(inch)(data)
+                    "vmovl.s8    q0, d0            \n"// a00-a07
+
+                    "pld         [%5, #128]        \n"
+                    "vld1.s8     {d2-d5}, [%5]!    \n"// kptr k00-k30,k01-k31, k02-k32,k03-k33, k04-k34,k05-k35, k06-k36,k07-k37    k(outch)(inch)
+                    "vmovl.s8    q4, d5            \n"// k06-k36,k07-k37
+                    "vmovl.s8    q3, d4            \n"// k04-k34,k05-k35
+                    "vmovl.s8    q2, d3            \n"// k02-k32,k03-k33
+                    "vmovl.s8    q1, d2            \n"// k00-k30,k01-k31
+
+                    "vmlal.s16   q6, d2, d0[0]     \n"// (k00-k30) * a00
+                    "vmlal.s16   q7, d3, d0[1]     \n"// (k01-k31) * a01
+                    "vmlal.s16   q8, d4, d0[2]     \n"// (k02-k32) * a02
+                    "vmlal.s16   q9, d5, d0[3]     \n"// (k03-k33) * a03
+                    "vmlal.s16   q10, d6, d1[0]    \n"// (k04-k34) * a04
+                    "vmlal.s16   q11, d7, d1[1]    \n"// (k05-k35) * a05
+                    "vmlal.s16   q12, d8, d1[2]    \n"// (k06-k36) * a06
+                    "vmlal.s16   q13, d9, d1[3]    \n"// (k07-k37) * a07                    
+
+                    "subs        r4, r4, #1        \n"
+                    "bne         0b                \n"// end for
+
+                    "vadd.s32    q6, q6, q7        \n"
+                    "vadd.s32    q9, q9, q8        \n"
+                    "vadd.s32    q11, q11, q10     \n"
+                    "vadd.s32    q13, q13, q12     \n"
+
+                    "vadd.s32    q9, q9, q6        \n"
+                    "vadd.s32    q13, q13, q11     \n"
+                    "vadd.s32    q14, q13, q9      \n"
+    
+                    "1:                            \n"
+                    // remain loop
+                    "and         r4, %12, #7       \n"// r4 = remain = L & 7
+                    "cmp         r4, #0            \n"
+                    "beq         3f                \n"
+
+                    "2:                            \n"// for(; remain != 0; remain--)
+                    "vld1.s8     {d2}, [%4]        \n"// tmpr a00        a(inch)(data)
+                    "vld1.s8     {d0}, [%5]        \n"// kptr k00-k30    k(outch)(inch)
+                    "vmovl.s8    q1, d2            \n"
+                    "vmovl.s8    q0, d0            \n"
+                    "add         %4, #1            \n"
+                    "add         %5, #4            \n"
+
+                    "vmlal.s16   q14, d0, d2[0]    \n"
+
+                    "subs        r4, r4, #1        \n"
+                    "bne         2b                \n"
+
+                    "3:                            \n"// store the result to memory
+                    "vst1.s32    {d28[0]}, [%0]    \n"
+                    "vst1.s32    {d28[1]}, [%1]    \n"
+                    "vst1.s32    {d29[0]}, [%2]    \n"
+                    "vst1.s32    {d29[1]}, [%3]    \n"
+
+                    : "=r"(output0), // %0
+                      "=r"(output1), // %1
+                      "=r"(output2), // %2
+                      "=r"(output3), // %3
+                      "=r"(vb),      // %4
+                      "=r"(va)       // %5
+                    : "0"(output0),
+                      "1"(output1),
+                      "2"(output2),
+                      "3"(output3),
+                      "4"(vb),
+                      "5"(va),
+                      "r"(L)         // %12  
+                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14"
+                );
+#endif // __aarch64__                            
+#else
+                int sum0 = 0;
+                int sum1 = 0;
+                int sum2 = 0;
+                int sum3 = 0;
+
+                for (int k=0; k<L; k++)
+                {
+                    sum0 += (int)va[0] * vb[0];
+                    sum1 += (int)va[1] * vb[0];
+                    sum2 += (int)va[2] * vb[0];
+                    sum3 += (int)va[3] * vb[0];
+
+                    va += 4;
+                    vb += 1;
+                }
+                
+                output0[0] = sum0;
+                output1[0] = sum1;
+                output2[0] = sum2;
+                output3[0] = sum3;
+#endif // __ARM_NEON
+                output0++;
+                output1++;
+                output2++;
+                output3++;
+            }
+        }
+
+        remain_outch_start += nn_outch << 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i=remain_outch_start; i<outch; i++)
+        {
+            int* output = top_blob.channel(i);
+
+            int j=0;
+            for (; j+7<N; j=j+8)
+            {
+                signed char* vb = bottom_tm.channel(j/8);
+#if __aarch64__
+                signed char* va = kernel_tm.channel(i/8 + (i%8)/4 + i%4);
+#else                
+                signed char* va = kernel_tm.channel(i/4 + i%4);
+#endif
+
+#if __ARM_NEON
+#if __aarch64__
+                int32x4_t _sum0 = vdupq_n_s32(0);
+                int32x4_t _sum0n = vdupq_n_s32(0);
+
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    int8x8_t _vacc0_s8 = vld1_s8(va);
+                    int16x8_t _vacc0 = vmovl_s8(_vacc0_s8);
+
+                    // k=0
+                    int8x8_t _vb_s8 = vld1_s8(vb);
+                    int16x8_t _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 0);
+
+                    // k=1
+                    _vb_s8 = vld1_s8(vb+8);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 1);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 1);
+
+                    // k=2
+                    _vb_s8 = vld1_s8(vb+16);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 2);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 2);
+
+                    // k=3
+                    _vb_s8 = vld1_s8(vb+24);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 3);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 3);
+
+                    // k=4
+                    _vb_s8 = vld1_s8(vb+32);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc0), 0);
+
+                    // k=5
+                    _vb_s8 = vld1_s8(vb+40);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc0), 1);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc0), 1);
+
+                    // k=6
+                    _vb_s8 = vld1_s8(vb+48);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc0), 2);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc0), 2);
+
+                    // k=7
+                    _vb_s8 = vld1_s8(vb+56);
+                    _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_high_s16(_vacc0), 3);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_high_s16(_vacc0), 3);
+
+                    va += 8;
+                    vb += 64;
+                }
+
+                for (; k<L; k++)
+                {
+                    int8x8_t _vacc0_s8 = vld1_s8(va);
+                    int16x8_t _vacc0 = vmovl_s8(_vacc0_s8);
+
+                    // k=0
+                    int8x8_t _vb_s8 = vld1_s8(vb);
+                    int16x8_t _vb = vmovl_s8(_vb_s8);
+                    _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_vb), vget_low_s16(_vacc0), 0);
+                    _sum0n = vmlal_lane_s16(_sum0n, vget_high_s16(_vb), vget_low_s16(_vacc0), 0);
+
+                    va += 1;
+                    vb += 8;
+                }
+
+                vst1q_s32(output, _sum0);
+                vst1q_s32(output+4, _sum0n);  
+#else
+                asm volatile(
+                    // inch loop
+                    "vmov.s32    q6, #0            \n"
+                    "vmov.s32    q7, #0            \n"
+
+                    "lsr         r4, %6, #3        \n"// r4 = nn = inch >> 3
+                    "cmp         r4, #0            \n"
+                    "beq         1f                \n"
+                    
+                    "0:                            \n"// for(; nn != 0; nn--)
+                    "pld         [%1, #128]        \n"
+                    "vld1.s8     {d4-d7}, [%1]!    \n"// tmpr a00-a07,a10-a17,a20-a27,a30-a37    a(inch)(data)
+                    "vmovl.s8    q5, d7            \n"// a30-a37
+                    "vmovl.s8    q4, d6            \n"// a20-a27
+                    "vmovl.s8    q3, d5            \n"// a10-a17
+                    "vmovl.s8    q2, d4            \n"// a00-a07
+
+                    "pld         [%2, #128]        \n"
+                    "vld1.s8     {d0}, [%2]!       \n"// kptr k00-k07    k(outch)(inch)
+                    "vmovl.s8    q1, d1            \n"// NOTE(review): d1 is not loaded by the vld1 above and q1 is not read by the multiply-accumulates in this loop; looks redundant
+                    "vmovl.s8    q0, d0            \n"// widen k00-k07 to s16: d0 = k00-k03, d1 = k04-k07
+
+                    "vmlal.s16   q6, d4, d0[0]     \n"// (a00-a07) * k00
+                    "vmlal.s16   q7, d5, d0[0]     \n"
+                    "vmlal.s16   q6, d6, d0[1]     \n"// (a10-a17) * k01
+                    "vmlal.s16   q7, d7, d0[1]     \n"
+                    "vmlal.s16   q6, d8, d0[2]     \n"// (a20-a27) * k02
+                    "vmlal.s16   q7, d9, d0[2]     \n"
+                    "vmlal.s16   q6, d10, d0[3]    \n"// (a30-a37) * k03
+                    "vmlal.s16   q7, d11, d0[3]    \n"
+
+                    "pld         [%1, #128]        \n"
+                    "vld1.s8     {d4-d7}, [%1]!    \n"// tmpr a40-a47,a50-a57,a60-a67,a70-a77    a(inch)(data)
+                    "vmovl.s8    q5, d7            \n"// a70-a77
+                    "vmovl.s8    q4, d6            \n"// a60-a67
+                    "vmovl.s8    q3, d5            \n"// a50-a57
+                    "vmovl.s8    q2, d4            \n"// a40-a47
+
+                    "vmlal.s16   q6, d4, d1[0]     \n"// (a40-a47) * k04
+                    "vmlal.s16   q7, d5, d1[0]     \n"
+                    "vmlal.s16   q6, d6, d1[1]     \n"// (a50-a57) * k05
+                    "vmlal.s16   q7, d7, d1[1]     \n"
+                    "vmlal.s16   q6, d8, d1[2]     \n"// (a60-a67) * k06
+                    "vmlal.s16   q7, d9, d1[2]     \n"
+                    "vmlal.s16   q6, d10, d1[3]    \n"// (a70-a77) * k07
+                    "vmlal.s16   q7, d11, d1[3]    \n"                    
+
+                    "subs        r4, r4, #1        \n"
+                    "bne         0b                \n"// end for
+    
+                    "1:                            \n"
+                    // remain loop
+                    "and         r4, %6, #7        \n"// r4 = remain = inch & 7
+                    "cmp         r4, #0            \n"
+                    "beq         3f                \n"
+
+                    "2:                            \n"// for(; remain != 0; remain--)
+                    "vld1.s8     {d2}, [%1]!       \n"// tmpr a00-a07    a(inch)(data)
+                    "vld1.s8     {d0}, [%2]        \n"// kptr k00        k(outch)(inch)
+                    "vmovl.s8    q1, d2            \n"
+                    "vmovl.s8    q0, d0            \n"
+                    "add         %2, #1            \n"
+
+                    "vmlal.s16   q6, d2, d0[0]     \n"// (a00-a07) * k00
+                    "vmlal.s16   q7, d3, d0[0]     \n"  
+
+                    "subs        r4, r4, #1        \n"
+                    "bne         2b                \n"
+
+                    "3:                            \n"// store the result to memory
+                    "vst1.s32    {d12-d15}, [%0]   \n"
+
+                    : "=r"(output), // %0
+                      "=r"(vb),     // %1
+                      "=r"(va)      // %2
+                    : "0"(output),
+                      "1"(vb),
+                      "2"(va),
+                      "r"(L)        // %6  
+                    : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
+                );
+#endif // __aarch64__                                         
+#else                
+                int sum[8] = {0};
+
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum[n] += (int)va[0] * vb[n];
+                        sum[n] += (int)va[1] * vb[n+8];
+                        sum[n] += (int)va[2] * vb[n+16];
+                        sum[n] += (int)va[3] * vb[n+24];
+                        sum[n] += (int)va[4] * vb[n+32];
+                        sum[n] += (int)va[5] * vb[n+40];
+                        sum[n] += (int)va[6] * vb[n+48];
+                        sum[n] += (int)va[7] * vb[n+56];
+                    }
+
+                    va += 8;
+                    vb += 64;    
+                }
+
+                for (; k<L; k++)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum[n] += (int)va[0] * vb[n];
+                    }
+
+                    va += 1;
+                    vb += 8;
+                }
+
+                for (int n=0; n<8; n++)
+                {
+                    output[n] = sum[n];
+                }
+#endif // __ARM_NEON
+                output += 8;
+            }
+
+            for (; j<N; j++)
+            {
+                int sum = 0;
+
+                signed char* vb = bottom_tm.channel(j/8 + j%8);
+#if __aarch64__
+                signed char* va = kernel_tm.channel(i/8 + (i%8)/4 + i%4);
+#else                
+                signed char* va = kernel_tm.channel(i/4 + i%4);
+#endif
+
+                for (int k=0; k<L; k++)
+                {
+                    sum += (int)va[0] * vb[0];
+
+                    va += 1;
+                    vb += 1;
+                }
+                output[0] = sum;
+
+                output++;
+            }
+        }
+    }
+}
diff --git a/src/layer/arm/convolutiondepthwise_3x3_int8.h b/src/layer/arm/convolutiondepthwise_3x3_int8.h
index 89556b8bf..a58377b26 100644
--- a/src/layer/arm/convolutiondepthwise_3x3_int8.h
+++ b/src/layer/arm/convolutiondepthwise_3x3_int8.h
@@ -16,347 +16,6 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
-#if __aarch64__
-static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
-{
-    int w = bottom_blob.w;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
-    int outch = top_blob.c;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p = 0; p < outch; p++)
-    {
-        Mat out = top_blob.channel(p);
-
-        const signed char* kernel = (const signed char *)_kernel + p*9;
-        
-        int* outptr0 = out;
-        int* outptr0n = outptr0 + outw;
-    
-        const signed char* img0 = bottom_blob.channel(p);
-        
-        const signed char* r0 = img0;
-        const signed char* r1 = img0 + w;
-        const signed char* r2 = img0 + w*2;
-        const signed char* r3 = img0 + w*3;
-
-        int i = 0;
-
-        int8x8_t _k0 = vdup_n_s8(kernel[0]);
-        int8x8_t _k1 = vdup_n_s8(kernel[1]);
-        int8x8_t _k2 = vdup_n_s8(kernel[2]);
-
-        int8x8_t _k3 = vdup_n_s8(kernel[3]);
-        int8x8_t _k4 = vdup_n_s8(kernel[4]);
-        int8x8_t _k5 = vdup_n_s8(kernel[5]);
-
-        int8x8_t _k6 = vdup_n_s8(kernel[6]);
-        int8x8_t _k7 = vdup_n_s8(kernel[7]);
-        int8x8_t _k8 = vdup_n_s8(kernel[8]);
-
-        for (; i+1 < outh; i+=2)
-        {
-            int nn = outw >> 3;
-            int remain = outw & 7;
-
-            for (; nn >0; nn--)
-            {
-                int8x8_t _r0 = vld1_s8(r0);
-                int8x8_t _r0n = vld1_s8(r0+8);
-                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
-                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
-
-                int16x8_t _sum0 = vmull_s8(_r0, _k0);
-                _sum0 = vmlal_s8(_sum0, _r01, _k1);
-                _sum0 = vmlal_s8(_sum0, _r02, _k2);
-
-                int8x8_t _r1 = vld1_s8(r1);
-                int8x8_t _r1n = vld1_s8(r1+8);
-                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
-                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                _sum0 = vmlal_s8(_sum0, _r1, _k3);
-                _sum0 = vmlal_s8(_sum0, _r11, _k4);
-                _sum0 = vmlal_s8(_sum0, _r12, _k5);
-
-                int16x8_t _sum1 = vmull_s8(_r1, _k0);
-                _sum1 = vmlal_s8(_sum1, _r11, _k1);
-                _sum1 = vmlal_s8(_sum1, _r12, _k2);
-
-                int8x8_t _r2 = vld1_s8(r2);
-                int8x8_t _r2n = vld1_s8(r2+8);
-                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
-                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                _sum0 = vmlal_s8(_sum0, _r2, _k6);
-                _sum0 = vmlal_s8(_sum0, _r21, _k7);
-                _sum0 = vmlal_s8(_sum0, _r22, _k8);
-
-                _sum1 = vmlal_s8(_sum1, _r2, _k3);
-                _sum1 = vmlal_s8(_sum1, _r21, _k4);
-                _sum1 = vmlal_s8(_sum1, _r22, _k5);
-
-                int8x8_t _r3 = vld1_s8(r3);
-                int8x8_t _r3n = vld1_s8(r3+8);
-                int8x8_t _r31 = vext_s8(_r3, _r3n, 1);
-                int8x8_t _r32 = vext_s8(_r3, _r3n, 2);
-                _sum1 = vmlal_s8(_sum1, _r3, _k6);
-                _sum1 = vmlal_s8(_sum1, _r31, _k7);
-                _sum1 = vmlal_s8(_sum1, _r32, _k8);
-
-                int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0));
-                int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0));
-
-                vst1q_s32(outptr0, sum0_s32);
-                vst1q_s32(outptr0+4, sum0n_s32);
-
-                int32x4_t sum1_s32 = vmovl_s16(vget_low_s16(_sum1));
-                int32x4_t sum1n_s32 = vmovl_s16(vget_high_s16(_sum1));
-
-                vst1q_s32(outptr0n, sum1_s32);
-                vst1q_s32(outptr0n+4, sum1n_s32);
-
-                r0 += 8;
-                r1 += 8;
-                r2 += 8;
-                r3 += 8;
-                outptr0 += 8;
-                outptr0n += 8;
-            }
-
-            for (; remain>0; remain--)
-            {
-                //Todo Neon
-
-                int sum0 = 0;
-                int sum0n = 0;
-
-                sum0 += (int)r0[0] * kernel[0];
-                sum0 += (int)r0[1] * kernel[1];
-                sum0 += (int)r0[2] * kernel[2];
-                sum0 += (int)r1[0] * kernel[3];
-                sum0 += (int)r1[1] * kernel[4];
-                sum0 += (int)r1[2] * kernel[5];
-                sum0 += (int)r2[0] * kernel[6];
-                sum0 += (int)r2[1] * kernel[7];
-                sum0 += (int)r2[2] * kernel[8];
-
-                sum0n += (int)r1[0] * kernel[0];
-                sum0n += (int)r1[1] * kernel[1];
-                sum0n += (int)r1[2] * kernel[2];
-                sum0n += (int)r2[0] * kernel[3];
-                sum0n += (int)r2[1] * kernel[4];
-                sum0n += (int)r2[2] * kernel[5];
-                sum0n += (int)r3[0] * kernel[6];
-                sum0n += (int)r3[1] * kernel[7];
-                sum0n += (int)r3[2] * kernel[8];
-
-                *outptr0 = sum0;
-                *outptr0n = sum0n;
-
-                r0++;
-                r1++;
-                r2++;
-                r3++;
-                outptr0++;
-                outptr0n++;
-            }
-
-            r0 += 2 + w;
-            r1 += 2 + w;
-            r2 += 2 + w;
-            r3 += 2 + w;
-
-            outptr0 += outw;
-            outptr0n += outw;
-        }
-
-        for (; i < outh; i++)
-        {
-            int nn = outw >> 3;
-            int remain = outw & 7;
-
-            for (; nn >0; nn--)
-            {
-                int8x8_t _r0 = vld1_s8(r0);
-                int8x8_t _r0n = vld1_s8(r0+8);
-                int8x8_t _r01 = vext_s8(_r0, _r0n, 1);
-                int8x8_t _r02 = vext_s8(_r0, _r0n, 2);
-
-                int16x8_t _sum0 = vmull_s8(_r0, _k0);
-                _sum0 = vmlal_s8(_sum0, _r01, _k1);
-                _sum0 = vmlal_s8(_sum0, _r02, _k2);
-
-                int8x8_t _r1 = vld1_s8(r1);
-                int8x8_t _r1n = vld1_s8(r1+8);
-                int8x8_t _r11 = vext_s8(_r1, _r1n, 1);
-                int8x8_t _r12 = vext_s8(_r1, _r1n, 2);
-                _sum0 = vmlal_s8(_sum0, _r1, _k3);
-                _sum0 = vmlal_s8(_sum0, _r11, _k4);
-                _sum0 = vmlal_s8(_sum0, _r12, _k5);
-
-                int8x8_t _r2 = vld1_s8(r2);
-                int8x8_t _r2n = vld1_s8(r2+8);
-                int8x8_t _r21 = vext_s8(_r2, _r2n, 1);
-                int8x8_t _r22 = vext_s8(_r2, _r2n, 2);
-                _sum0 = vmlal_s8(_sum0, _r2, _k6);
-                _sum0 = vmlal_s8(_sum0, _r21, _k7);
-                _sum0 = vmlal_s8(_sum0, _r22, _k8);
-
-                int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum0));
-                int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum0));
-
-                vst1q_s32(outptr0, sum0_s32);
-                vst1q_s32(outptr0+4, sum0n_s32);
-
-                r0 += 8;
-                r1 += 8;
-                r2 += 8;
-                outptr0 += 8;
-            }
-
-            for (; remain>0; remain--)
-            {
-                int sum = 0;
-
-                sum += (int)r0[0] * kernel[0];
-                sum += (int)r0[1] * kernel[1];
-                sum += (int)r0[2] * kernel[2];
-                sum += (int)r1[0] * kernel[3];
-                sum += (int)r1[1] * kernel[4];
-                sum += (int)r1[2] * kernel[5];
-                sum += (int)r2[0] * kernel[6];
-                sum += (int)r2[1] * kernel[7];
-                sum += (int)r2[2] * kernel[8];
-
-                *outptr0 = sum;
-
-                r0++;
-                r1++;
-                r2++;
-                outptr0++;
-            }   
-
-            r0 += 2;
-            r1 += 2;
-            r2 += 2;
-        }
-    }
-}
-
-static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
-{
-    int w = bottom_blob.w;
-
-    int outw = top_blob.w;
-    int outh = top_blob.h;
-    int outch = top_blob.c;
-
-    const int tailstep = w - 2*outw + w;
-
-    #pragma omp parallel for num_threads(opt.num_threads)
-    for (int p=0; p<outch; p++)
-    {
-        Mat out = top_blob.channel(p);
-
-        const signed char* kernel = (const signed char*)_kernel + p*9;
-
-        int* outptr = out;
-
-        const signed char* img = bottom_blob.channel(p);
-
-        const signed char* r0 = img;
-        const signed char* r1 = img + w;
-        const signed char* r2 = img + w*2;
-
-        int i = 0;
-
-        int8x8_t _k0 = vdup_n_s8(kernel[0]);
-        int8x8_t _k1 = vdup_n_s8(kernel[1]);
-        int8x8_t _k2 = vdup_n_s8(kernel[2]);
-        int8x8_t _k3 = vdup_n_s8(kernel[3]);
-        int8x8_t _k4 = vdup_n_s8(kernel[4]);
-        int8x8_t _k5 = vdup_n_s8(kernel[5]);
-        int8x8_t _k6 = vdup_n_s8(kernel[6]);
-        int8x8_t _k7 = vdup_n_s8(kernel[7]);
-        int8x8_t _k8 = vdup_n_s8(kernel[8]);
-
-        for (; i < outh; i++)
-        {           
-            int nn = outw >> 3;
-            int remain = outw & 7;
-
-            for (; nn > 0; nn--)
-            {
-                int8x8x2_t _r0 = vld2_s8(r0);
-                int8x8x2_t _r0n = vld2_s8(r0+16);
-                int8x8_t _r00 = _r0.val[0];
-                int8x8_t _r01 = _r0.val[1];
-                int8x8_t _r02 = vext_s8(_r00, _r0n.val[0], 1);
-
-                int16x8_t _sum = vmull_s8(_r00, _k0);
-                _sum = vmlal_s8(_sum, _r01, _k1);
-                _sum = vmlal_s8(_sum, _r02, _k2);
-
-                int8x8x2_t _r1 = vld2_s8(r1);
-                int8x8x2_t _r1n = vld2_s8(r1+16);
-                int8x8_t _r10 = _r1.val[0];
-                int8x8_t _r11 = _r1.val[1];
-                int8x8_t _r12 = vext_s8(_r10, _r1n.val[0], 1);
-                _sum = vmlal_s8(_sum, _r10, _k3);
-                _sum = vmlal_s8(_sum, _r11, _k4);
-                _sum = vmlal_s8(_sum, _r12, _k5);
-
-                int8x8x2_t _r2 = vld2_s8(r2);
-                int8x8x2_t _r2n = vld2_s8(r2+16);
-                int8x8_t _r20 = _r2.val[0];
-                int8x8_t _r21 = _r2.val[1];
-                int8x8_t _r22 = vext_s8(_r20, _r2n.val[0], 1);
-                _sum = vmlal_s8(_sum, _r20, _k6);
-                _sum = vmlal_s8(_sum, _r21, _k7);
-                _sum = vmlal_s8(_sum, _r22, _k8);
-
-                int32x4_t sum0_s32 = vmovl_s16(vget_low_s16(_sum));
-                int32x4_t sum0n_s32 = vmovl_s16(vget_high_s16(_sum));
-
-                vst1q_s32(outptr, sum0_s32);
-                vst1q_s32(outptr+4, sum0n_s32);
-
-                r0 += 16;
-                r1 += 16;
-                r2 += 16;
-                outptr += 8;
-            }       
-
-            for (; remain>0; remain--)
-            {
-                int sum = 0;
-                
-                sum += (int)r0[0] * kernel[0];
-                sum += (int)r0[1] * kernel[1];
-                sum += (int)r0[2] * kernel[2];
-                sum += (int)r1[0] * kernel[3];
-                sum += (int)r1[1] * kernel[4];
-                sum += (int)r1[2] * kernel[5];
-                sum += (int)r2[0] * kernel[6];
-                sum += (int)r2[1] * kernel[7];
-                sum += (int)r2[2] * kernel[8];
-
-                *outptr = sum;
-
-                r0 += 2;
-                r1 += 2;
-                r2 += 2;
-                outptr++;
-            }
-
-            r0 += tailstep;
-            r1 += tailstep;
-            r2 += tailstep;
-        }
-    }
-}
-#else // __aarch64__
 static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
     int w = bottom_blob.w;
@@ -824,5 +483,3 @@ static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const M
         }
     }
 }
-
-#endif
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
index a8b2c76d7..2a76f1333 100644
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -13,7 +13,7 @@
 // specific language governing permissions and limitations under the License.
 
 #include "convolutiondepthwise_arm.h"
-
+#include "benchmark.h"
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -147,6 +147,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
     Mat bottom_blob_unbordered = bottom_blob;
     if (use_int8_inference && elemsize != 1)
     {
+        // start = ncnn::get_current_time();
+
         Mat bottom_blob_int8;
         bottom_blob_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
         if (bottom_blob_int8.empty())
@@ -167,8 +169,8 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
             quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
         }
 
-        bottom_blob_unbordered = bottom_blob_int8;
-    }
+        bottom_blob_unbordered = bottom_blob_int8;    
+    }    
 
     Mat bottom_blob_bordered = bottom_blob_unbordered;
     if (pad_w > 0 || pad_h > 0)
@@ -211,25 +213,67 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
             {
                 if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
                 {
-                    if (stride_w == 1 && stride_h == 1)
+                    if (use_int8_requantize)
                     {
-                        convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
+                        Mat top_blob_tm;
+                        top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
+                        if (top_blob_tm.empty())
+                            return -100;
+                        
+                        top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
+                        if (top_blob.empty())
+                            return -100;
+
+                        if (stride_w == 1 && stride_h == 1)
+                        {
+                            convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
+                        }
+                        else if (stride_w == 2 && stride_h == 2)
+                        {
+                            convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob_tm, weight_data, opt);
+                        }                     
+
+                        // requantize top_blob_tm (4-byte elements) into top_blob (1-byte elements); not inplace
+                        #pragma omp parallel for num_threads(opt.num_threads)
+                        for (int g=0; g<group; g++)
+                        {
+                            ncnn::Option opt_g = opt;
+                            opt_g.num_threads = 1;
+                            opt_g.blob_allocator = top_blob.allocator;
+
+                            Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
+                            Mat top_blob_g = top_blob.channel_range(g, 1);
+                            requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
+                        }                
                     }
-                    else if (stride_w == 2 && stride_h == 2)
+                    else
                     {
-                        convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
-                    }
-
-                    // dequantize, reverse scale inplace
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int g=0; g<group; g++)
-                    {
-                        ncnn::Option opt_g = opt;
-                        opt_g.num_threads = 1;
-                        opt_g.blob_allocator = top_blob.allocator;
-
-                        Mat top_blob_g = top_blob.channel_range(g, 1);
-                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
+                        // start = ncnn::get_current_time();
+
+                        top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
+                        if (top_blob.empty())
+                            return -100;
+
+                        if (stride_w == 1 && stride_h == 1)
+                        {
+                            convdw3x3s1_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
+                        }
+                        else if (stride_w == 2 && stride_h == 2)
+                        {
+                            convdw3x3s2_int8_neon(bottom_blob_bordered, top_blob, weight_data, opt);
+                        }                        
+
+                        // dequantize, reverse scale inplace
+                        #pragma omp parallel for num_threads(opt.num_threads)
+                        for (int g=0; g<group; g++)
+                        {
+                            ncnn::Option opt_g = opt;
+                            opt_g.num_threads = 1;
+                            opt_g.blob_allocator = top_blob.allocator;
+
+                            Mat top_blob_g = top_blob.channel_range(g, 1);
+                            dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
+                        }           
                     }
 
                     return 0;
diff --git a/src/layer/arm/quantize_arm.cpp b/src/layer/arm/quantize_arm.cpp
index be7fb00da..5cbd3f782 100644
--- a/src/layer/arm/quantize_arm.cpp
+++ b/src/layer/arm/quantize_arm.cpp
@@ -31,19 +31,6 @@ static inline signed char float2int8(float v)
 
 int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
-#if !__aarch64__ && __ARM_NEON
-    int FPSCR_value = 0;
-
-    asm volatile(
-        "vmrs   %0, FPSCR               \n"
-        "bic    r10, %0, #0x00c00000    \n"
-        "vmsr   FPSCR, r10              \n"
-        : "=r"(FPSCR_value)
-        :
-        : "memory", "r10"
-    );
-#endif
-
     int dims = bottom_blob.dims;
 
     if (dims == 1)
@@ -200,15 +187,6 @@ int Quantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
         }
     }
 
-#if !__aarch64__ && __ARM_NEON
-    asm volatile(
-        "vmsr   FPSCR, %0           \n"
-        :
-        : "r"(FPSCR_value)
-        : "memory"
-    );
-#endif
-
     return 0;
 }
 
diff --git a/src/layer/arm/relu_arm.cpp b/src/layer/arm/relu_arm.cpp
index 9f3d541bc..f0fa4f80d 100644
--- a/src/layer/arm/relu_arm.cpp
+++ b/src/layer/arm/relu_arm.cpp
@@ -22,8 +22,92 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(ReLU_arm)
 
+int ReLU_arm::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h; // elements processed per channel
+    // In-place ReLU on signed char data: negative elements become 0. Only slope == 0 is implemented; always returns 0.
+    if (slope == 0.f)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            signed char* ptr = bottom_top_blob.channel(q);
+
+#if __ARM_NEON
+            int nn = size >> 4; // number of 16-element vector iterations
+            int remain = size - (nn << 4); // elements left over for the scalar tail loop
+#else
+            int remain = size;
+#endif // __ARM_NEON
+
+#if __ARM_NEON
+#if __aarch64__
+            int8x16_t _zero = vdupq_n_s8(0);
+            for (; nn>0; nn--)
+            {
+                int8x16_t _p = vld1q_s8(ptr);
+                _p = vmaxq_s8(_p, _zero); // max(x, 0) on 16 lanes
+                vst1q_s8(ptr, _p);
+
+                ptr += 16;
+            }
+#else
+            if (nn > 0)
+            {
+            asm volatile(
+                "veor       q1, q0, q0          \n" // q1 = 0 (x ^ x)
+                "0:                             \n"
+                "pld        [%1, #128]          \n"
+                "vld1.s8    {d0-d1}, [%1 :128]  \n" // load 16 int8 values
+                "vmax.s8    q0, q0, q1          \n" // max(x, 0)
+                "subs       %0, #1              \n"
+                "vst1.s8    {d0-d1}, [%1 :128]! \n" // store back and advance ptr by 16 bytes
+                "bne        0b                  \n"
+                : "=r"(nn),     // %0
+                  "=r"(ptr)     // %1
+                : "0"(nn),
+                  "1"(ptr)
+                : "cc", "memory", "q0", "q1"
+            );
+            }
+#endif // __aarch64__
+#endif // __ARM_NEON
+            for (; remain>0; remain--) // scalar tail (the whole channel when NEON is unavailable)
+            {
+                if (*ptr < 0)
+                    *ptr = 0;
+
+                ptr++;
+            }
+        }
+    }
+    else
+    {
+        // TODO: slope != 0 (leaky ReLU) is not implemented for int8; the blob is left unmodified and 0 is still returned
+        // #pragma omp parallel for num_threads(opt.num_threads)
+        // for (int q=0; q<channels; q++)
+        // {
+        //     float* ptr = bottom_top_blob.channel(q);
+
+        //     for (int i=0; i<size; i++)
+        //     {
+        //         if (ptr[i] < 0)
+        //             ptr[i] *= slope;
+        //     }
+        // }
+    }
+
+    return 0;
+}
+
 int ReLU_arm::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (bottom_top_blob.elemsize == 1u)
+        return ReLU_arm::forward_inplace_int8(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
diff --git a/src/layer/arm/relu_arm.h b/src/layer/arm/relu_arm.h
index 4403d61f1..9ed073904 100644
--- a/src/layer/arm/relu_arm.h
+++ b/src/layer/arm/relu_arm.h
@@ -23,6 +23,7 @@ class ReLU_arm : public ReLU
 {
 public:
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+    virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
 };
 
 } // namespace ncnn
diff --git a/src/layer/arm/requantize_arm.cpp b/src/layer/arm/requantize_arm.cpp
new file mode 100644
index 000000000..255277dd6
--- /dev/null
+++ b/src/layer/arm/requantize_arm.cpp
@@ -0,0 +1,325 @@
+// SenseNets is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "requantize_arm.h"
+
+#include <math.h>
+
+#if __ARM_NEON
+#include <arm_neon.h>
+#endif // __ARM_NEON
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Requantize_arm)
+
+static inline signed char float2int8(float v)
+{
+    // Round to nearest and saturate to [-128, 127]. Clamp in the float domain first: converting an out-of-range float or NaN to int is undefined behaviour.
+    if (v >= 127.f) return 127;
+    if (v <= -128.f) return -128;
+    return (v == v) ? (signed char)round(v) : 0; // NaN -> 0
+}
+
+int Requantize_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    // Requantize int32 -> int8: out = float2int8((in * scale_in [+ bias]) * scale_out); fusion_relu clamps negatives to 0.
+    int dims = bottom_blob.dims;
+
+    if (dims == 1)
+    {
+        int w = bottom_blob.w;
+        const int* intptr = bottom_blob;
+        signed char * ptr = top_blob; // top_blob is never allocated here; the caller must pre-size it
+
+        if (bias_term)
+        {
+            if (bias_data_size > 1)
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int i=0; i<w; i++)
+                {
+                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+            else
+            {
+                float bias = bias_data[0];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int i=0; i<w; i++)
+                {
+                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<w; i++)
+            {
+                ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
+                if (fusion_relu && ptr[i] < 0)
+                    ptr[i] = 0;
+            }
+        }
+    }
+
+    if (dims == 2)
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+
+        if (bias_term)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+
+                float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];
+
+                for (int j=0; j<w; j++)
+                {
+                    ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
+                    if (fusion_relu && ptr[j] < 0)
+                        ptr[j] = 0;
+                }
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+
+                for (int j=0; j<w; j++)
+                {
+                    ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
+                    if (fusion_relu && ptr[j] < 0)
+                        ptr[j] = 0;
+                }
+            }
+        }
+    }
+
+    if (dims == 3)
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+        int channels = bottom_blob.c;
+        int size = w * h;
+
+        if (bias_term)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q=0; q<channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+
+                float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];
+
+#if __ARM_NEON
+                int nn = size >> 3;
+                int remain = size & 7;
+
+#if __aarch64__
+                for (; nn>0; nn--)
+                {
+                    ptr[0] = float2int8(((intptr[0] * scale_in) + bias) * scale_out);
+                    ptr[1] = float2int8(((intptr[1] * scale_in) + bias) * scale_out);
+                    ptr[2] = float2int8(((intptr[2] * scale_in) + bias) * scale_out);
+                    ptr[3] = float2int8(((intptr[3] * scale_in) + bias) * scale_out);
+                    ptr[4] = float2int8(((intptr[4] * scale_in) + bias) * scale_out);
+                    ptr[5] = float2int8(((intptr[5] * scale_in) + bias) * scale_out);
+                    ptr[6] = float2int8(((intptr[6] * scale_in) + bias) * scale_out);
+                    ptr[7] = float2int8(((intptr[7] * scale_in) + bias) * scale_out);
+
+                    ptr += 8;
+                    intptr += 8;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%1, #256]          \n"
+                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
+                    "vdup.f32   q10, %6             \n" //q10 scale_in
+                    "vdup.f32   q11, %7             \n" //q11 scale_out
+                    "vdup.f32   q12, %8             \n" //q12 bias
+                    "0:                             \n"
+                    // top_s32 -> top_f32
+                    "vcvt.f32.s32 q0, q0            \n"
+                    "vcvt.f32.s32 q1, q1            \n"
+                    // top_f32 = top_f32 * scale_in
+                    "vmul.f32   q0, q0, q10         \n"
+                    "vmul.f32   q1, q1, q10         \n"
+                    // top_f32 = top_f32 + bias
+                    "vadd.f32   q0, q0, q12         \n"
+                    "vadd.f32   q1, q1, q12         \n"
+                    // top_f32 = top_f32 * scale_out
+                    "vmul.f32   q0, q0, q11         \n"
+                    "vmul.f32   q1, q1, q11         \n"
+                    // top_f32 -> top_s32 (vcvtr rounds with the current FPSCR rounding mode)
+                    "vcvtr.s32.f32 s0, s0           \n"
+                    "vcvtr.s32.f32 s1, s1           \n"
+                    "vcvtr.s32.f32 s2, s2           \n"
+                    "vcvtr.s32.f32 s3, s3           \n"
+                    "vcvtr.s32.f32 s4, s4           \n"
+                    "vcvtr.s32.f32 s5, s5           \n"
+                    "vcvtr.s32.f32 s6, s6           \n"
+                    "vcvtr.s32.f32 s7, s7           \n"
+                    // top_s32 -> top_s16
+                    "vqmovn.s32 d4, q0              \n"
+                    "vqmovn.s32 d5, q1              \n"
+                    "pld        [%1, #256]          \n"
+                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
+                    // top_s16 -> top_s8
+                    "vqmovn.s16   d4, q2            \n"
+                    // save top_s8
+                    "vst1.8     {d4}, [%2:64]!      \n"
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    "sub        %1, #32             \n" // undo the look-ahead load done in the last iteration
+                    : "=r"(nn),         // %0
+                      "=r"(intptr),     // %1
+                      "=r"(ptr)         // %2
+                    : "0"(nn),
+                      "1"(intptr),
+                      "2"(ptr),
+                      "r"(scale_in),    // %6
+                      "r"(scale_out),   // %7
+                      "r"(bias)         // %8
+                    : "cc", "memory", "q0", "q1", "q2", "q10", "q11", "q12"
+                );
+                }
+#endif // __aarch64__
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+                for (; remain > 0; remain--)
+                    *ptr++ = float2int8(((*intptr++ * scale_in) + bias) * scale_out);
+
+                // fused ReLU: one pass over the finished channel so every code path above is covered
+                signed char* outptr = top_blob.channel(q);
+                for (int i=0; fusion_relu && i<size; i++)
+                    if (outptr[i] < 0) outptr[i] = 0;
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q=0; q<channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+
+#if __ARM_NEON
+                int nn = size >> 3;
+                int remain = size & 7;
+
+#if __aarch64__
+                //TODO
+                for (; nn>0; nn--)
+                {
+                    ptr[0] = float2int8(intptr[0] * scale_in * scale_out);
+                    ptr[1] = float2int8(intptr[1] * scale_in * scale_out);
+                    ptr[2] = float2int8(intptr[2] * scale_in * scale_out);
+                    ptr[3] = float2int8(intptr[3] * scale_in * scale_out);
+                    ptr[4] = float2int8(intptr[4] * scale_in * scale_out);
+                    ptr[5] = float2int8(intptr[5] * scale_in * scale_out);
+                    ptr[6] = float2int8(intptr[6] * scale_in * scale_out);
+                    ptr[7] = float2int8(intptr[7] * scale_in * scale_out);
+
+                    ptr += 8;
+                    intptr += 8;
+                }
+#else
+                if (nn > 0)
+                {
+                asm volatile(
+                    "pld        [%1, #256]          \n"
+                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
+                    "vdup.f32   q10, %6             \n" //q10 scale_in
+                    "vdup.f32   q11, %7             \n" //q11 scale_out
+                    "0:                             \n"
+                    // top_s32 -> top_f32
+                    "vcvt.f32.s32 q0, q0            \n"
+                    "vcvt.f32.s32 q1, q1            \n"
+                    // top_f32 = top_f32 * scale_in
+                    "vmul.f32   q0, q0, q10         \n"
+                    "vmul.f32   q1, q1, q10         \n"
+                    // top_f32 = top_f32 * scale_out
+                    "vmul.f32   q0, q0, q11         \n"
+                    "vmul.f32   q1, q1, q11         \n"
+                    // top_f32 -> top_s32 (vcvtr rounds with the current FPSCR rounding mode)
+                    "vcvtr.s32.f32 s0, s0           \n"
+                    "vcvtr.s32.f32 s1, s1           \n"
+                    "vcvtr.s32.f32 s2, s2           \n"
+                    "vcvtr.s32.f32 s3, s3           \n"
+                    "vcvtr.s32.f32 s4, s4           \n"
+                    "vcvtr.s32.f32 s5, s5           \n"
+                    "vcvtr.s32.f32 s6, s6           \n"
+                    "vcvtr.s32.f32 s7, s7           \n"
+                    // top_s32 -> top_s16
+                    "vqmovn.s32 d4, q0              \n"
+                    "vqmovn.s32 d5, q1              \n"
+                    "pld        [%1, #256]          \n"
+                    "vld1.s32   {d0-d3}, [%1:128]!  \n" //q0-q1 data
+                    // top_s16 -> top_s8
+                    "vqmovn.s16   d4, q2            \n"
+                    // save top_s8
+                    "vst1.8     {d4}, [%2:64]!      \n"
+                    "subs       %0, #1              \n"
+                    "bne        0b                  \n"
+                    "sub        %1, #32             \n" // undo the look-ahead load done in the last iteration
+                    : "=r"(nn),         // %0
+                      "=r"(intptr),     // %1
+                      "=r"(ptr)         // %2
+                    : "0"(nn),
+                      "1"(intptr),
+                      "2"(ptr),
+                      "r"(scale_in),    // %6
+                      "r"(scale_out)    // %7
+                    : "cc", "memory", "q0", "q1", "q2", "q10", "q11"
+                );
+                }
+#endif // __aarch64__
+#else
+                int remain = size;
+#endif // __ARM_NEON
+
+                for (; remain > 0; remain--)
+                    *ptr++ = float2int8(*intptr++ * scale_in * scale_out);
+
+                // fused ReLU: one pass over the finished channel so every code path above is covered
+                signed char* outptr = top_blob.channel(q);
+                for (int i=0; fusion_relu && i<size; i++)
+                    if (outptr[i] < 0) outptr[i] = 0;
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/arm/requantize_arm.h b/src/layer/arm/requantize_arm.h
new file mode 100644
index 000000000..1bfd40068
--- /dev/null
+++ b/src/layer/arm/requantize_arm.h
@@ -0,0 +1,30 @@
+// SenseNets is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_ARM_H
+#define LAYER_REQUANTIZE_ARM_H
+
+#include "requantize.h"
+
+namespace ncnn {
+
+class Requantize_arm : public Requantize // ARM build of the Requantize layer; forward() is defined in requantize_arm.cpp
+{
+public:
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; // reads bottom_blob as int, writes top_blob as signed char
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_ARM_H
\ No newline at end of file
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index 29e031427..b1632a477 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -25,6 +25,7 @@ Convolution::Convolution()
     one_blob_only = true;
     support_inplace = false;
     support_vulkan = true;
+    use_int8_requantize = false;
 
 #if NCNN_VULKAN
     padding = 0;
@@ -42,7 +43,6 @@ Convolution::Convolution()
 #endif // NCNN_VULKAN
 
     quantize = 0;
-    dequantize = 0;
 }
 
 Convolution::~Convolution()
@@ -52,7 +52,14 @@ Convolution::~Convolution()
 #endif // NCNN_VULKAN
 
     delete quantize;
-    delete dequantize;
+
+    for (int i=0; i<(int)dequantize_ops.size(); i++)
+        delete dequantize_ops[i];
+    dequantize_ops.clear();
+
+    for (int i=0; i<(int)requantize_ops.size(); i++)
+        delete requantize_ops[i];
+    requantize_ops.clear();
 }
 
 int Convolution::load_param(const ParamDict& pd)
@@ -113,10 +120,18 @@ int Convolution::load_model(const ModelBin& mb)
 
     if (int8_scale_term)
     {
-        weight_data_int8_scale = mb.load(1, 1)[0];
+        weight_data_int8_scales = mb.load(num_output, 1);
         bottom_blob_int8_scale = mb.load(1, 1)[0];
     }
 
+    for (int i=0; i<(int)dequantize_ops.size(); i++)
+        delete dequantize_ops[i];
+    dequantize_ops.clear();
+
+    for (int i=0; i<(int)requantize_ops.size(); i++)
+        delete requantize_ops[i];
+    requantize_ops.clear();
+
     bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
     bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
 
@@ -126,27 +141,39 @@ int Convolution::load_model(const ModelBin& mb)
         return -1;
     }
 
+    // runtime quantize the weight data
     if (weight_data_is_float32 && use_int8_inference)
     {
         // quantize weight to int8
-        Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
+        Mat int8_weight_data(weight_data_size, (size_t)1u);
+        if (int8_weight_data.empty())
+            return -100;
 
-        ncnn::ParamDict pd;
-        pd.set(0, weight_data_int8_scale);// scale
+        const int weight_data_size_output = weight_data_size / num_output;
+
+        for (int n=0; n<num_output; n++)
+        {
+            Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
 
-        op->load_param(pd);
+            ncnn::ParamDict pd;
+            pd.set(0, weight_data_int8_scales[n]);// scale
 
-        Mat int8_weight_data;
-        op->forward(weight_data, int8_weight_data);
+            op->load_param(pd);
 
-        delete op;
+            ncnn::Option opt = ncnn::get_default_option();
+            opt.blob_allocator = int8_weight_data.allocator;
 
-        if (int8_weight_data.empty())
-            return -100;
+            const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
+            Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
+            op->forward(weight_data_n, int8_weight_data_n, opt);
+
+            delete op;
+        }
 
         weight_data = int8_weight_data;
     }
 
+    // initialize the quantize and dequantize op layers
     if (use_int8_inference)
     {
         quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
@@ -157,22 +184,74 @@ int Convolution::load_model(const ModelBin& mb)
             quantize->load_param(pd);
         }
 
-        dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
+        dequantize_ops.resize(num_output);
+        for (int n=0; n<num_output; n++)
         {
-            float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
+            dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);
+
+            float top_rescale = 1.f;
+
+            if (weight_data_int8_scales[n] == 0)
+                top_rescale = 0;
+            else
+                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
 
             ncnn::ParamDict pd;
             pd.set(0, top_rescale);// scale
-            pd.set(1, bias_term);// bias_term
-            pd.set(2, num_output);// bias_data_size
+            pd.set(1, bias_term);  // bias_term
+            pd.set(2, 1);          // bias_data_size
 
-            dequantize->load_param(pd);
+            dequantize_ops[n]->load_param(pd);
 
             ncnn::Mat weights[1];
-            weights[0] = bias_data;
+            weights[0] = bias_data.range(n, 1);
 
-            dequantize->load_model(ModelBinFromMatArray(weights));
+            dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
+        }
+    }
+
+    return 0;
+}
+
+int Convolution::create_requantize_op(void)
+{
+    if (!use_int8_requantize)
+    {
+        fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
+        return -1;
+    }
+
+    requantize_ops.resize(num_output);
+    for (int n=0; n<num_output; n++)
+    {
+        requantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Requantize);
+
+        float scale_in = 1.f;
+        float scale_out = 1.f;
+
+        if (weight_data_int8_scales[n] == 0)
+        {
+            scale_in = 0;
+        }
+        else
+        {
+            scale_in = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
         }
+
+        scale_out = top_blob_int8_scale;
+
+        ncnn::ParamDict pd;
+        pd.set(0, scale_in);   // scale in
+        pd.set(1, scale_out);  // scale_out
+        pd.set(2, bias_term);  // bias_term
+        pd.set(3, 1);          // bias_data_size
+
+        requantize_ops[n]->load_param(pd);
+
+        ncnn::Mat weights[1];
+        weights[0] = bias_data.range(n, 1);
+
+        requantize_ops[n]->load_model(ModelBinFromMatArray(weights));
     }
 
     return 0;
@@ -210,7 +289,7 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
 
             if (int8_scale_term)
             {
-                weights[2] = Mat(1, (size_t)4u, (void*)&weight_data_int8_scale);
+                weights[2] = weight_data_int8_scales;
                 weights[3] = Mat(1, (size_t)4u, (void*)&bottom_blob_int8_scale);
             }
 
@@ -309,50 +388,118 @@ int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& op
 
     if (use_int8_inference)
     {
-        // num_output
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int p=0; p<num_output; p++)
+        if (use_int8_requantize == true)
         {
-            int* outptr = top_blob.channel(p);
+            Mat top_blob_tm;
+            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
+            if (top_blob_tm.empty())
+                return -100;
+            
+            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100; 
 
-            for (int i = 0; i < outh; i++)
+            // num_output
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
             {
-                for (int j = 0; j < outw; j++)
-                {
-                    int sum = 0;
+                int* outptr = top_blob_tm.channel(p);
 
-                    const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
-
-                    // channels
-                    for (int q=0; q<channels; q++)
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
                     {
-                        const Mat m = bottom_blob_bordered.channel(q);
-                        const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
+                        int sum = 0;
+
+                        const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
 
-                        for (int k = 0; k < maxk; k++)
+                        // channels
+                        for (int q=0; q<channels; q++)
                         {
-                            int val = sptr[ space_ofs[k] ];
-                            int w = kptr[k];
-                            sum += val * w;
+                            const Mat m = bottom_blob_bordered.channel(q);
+                            const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                int val = sptr[ space_ofs[k] ];
+                                int w = kptr[k];
+                                sum += val * w;
+                            }
+
+                            kptr += maxk;
                         }
 
-                        kptr += maxk;
+                        outptr[j] = sum;
                     }
 
-                    outptr[j] = sum;
+                    outptr += outw;
                 }
 
-                outptr += outw;
+                // requantize this output channel: int32 sums in top_blob_tm -> int8 in top_blob
+                {
+                    ncnn::Option opt_g = opt;
+                    opt_g.num_threads = 1;
+                    opt_g.blob_allocator = top_blob.allocator;
+
+                    Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
+                    Mat top_blob_g = top_blob.channel_range(p, 1);
+                    requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
+                }                        
             }
         }
-
-        // dequantize, reverse scale inplace
+        else
         {
-            ncnn::Option opt_g = opt;
-            opt_g.blob_allocator = top_blob.allocator;
+            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100;
+      
+            // num_output
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
+            {
+                int* outptr = top_blob.channel(p);
 
-            dequantize->forward_inplace(top_blob, opt_g);
-        }
+                for (int i = 0; i < outh; i++)
+                {
+                    for (int j = 0; j < outw; j++)
+                    {
+                        int sum = 0;
+
+                        const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;
+
+                        // channels
+                        for (int q=0; q<channels; q++)
+                        {
+                            const Mat m = bottom_blob_bordered.channel(q);
+                            const signed char* sptr = m.row<signed char>(i*stride_h) + j*stride_w;
+
+                            for (int k = 0; k < maxk; k++)
+                            {
+                                int val = sptr[ space_ofs[k] ];
+                                int w = kptr[k];
+                                sum += val * w;
+                            }
+
+                            kptr += maxk;
+                        }
+
+                        outptr[j] = sum;
+                    }
+
+                    outptr += outw;
+                }
+
+                // dequantize, reverse scale inplace
+                {
+                    ncnn::Option opt_g = opt;
+                    opt_g.num_threads = 1;
+                    opt_g.blob_allocator = top_blob.allocator;
+
+                    Mat top_blob_g = top_blob.channel_range(p, 1);
+                    dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
+                }          
+            }   
+        }        
 
         return 0;
     }
diff --git a/src/layer/convolution.h b/src/layer/convolution.h
index 6c4566797..1c244daf1 100644
--- a/src/layer/convolution.h
+++ b/src/layer/convolution.h
@@ -29,6 +29,8 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
+    virtual int create_requantize_op(void);
+
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 #if NCNN_VULKAN
@@ -91,13 +93,16 @@ public:
     Pipeline* pipeline_innerproduct_pack4to1;
 #endif // NCNN_VULKAN
 
-    float weight_data_int8_scale;
+    Mat weight_data_int8_scales;
     float bottom_blob_int8_scale;
+    float top_blob_int8_scale;
 
     bool use_int8_inference;
+    bool use_int8_requantize;
 
     ncnn::Layer* quantize;
-    ncnn::Layer* dequantize;
+    std::vector<ncnn::Layer*> dequantize_ops;
+    std::vector<ncnn::Layer*> requantize_ops;
 };
 
 } // namespace ncnn
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index fe1ed1191..e01916eae 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -25,6 +25,7 @@ ConvolutionDepthWise::ConvolutionDepthWise()
     one_blob_only = true;
     support_inplace = false;
     support_vulkan = true;
+    use_int8_requantize = false;
 
 #if NCNN_VULKAN
     padding = 0;
@@ -58,6 +59,11 @@ ConvolutionDepthWise::~ConvolutionDepthWise()
         delete dequantize_ops[i];
 
     dequantize_ops.clear();
+
+    for (int i=0; i<(int)requantize_ops.size(); i++)
+        delete requantize_ops[i];
+
+    requantize_ops.clear();    
 }
 
 int ConvolutionDepthWise::load_param(const ParamDict& pd)
@@ -150,7 +156,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
     if (int8_scale_term == 1)
     {
         weight_data_int8_scales = mb.load(group, 1);
-        bottom_blob_int8_scales = mb.load(group, 1);
+        bottom_blob_int8_scales = mb.load(1, 1);
+
+        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
+        bottom_blob_int8_scales = Mat(group);
+        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
     }
     else if (int8_scale_term == 2)
     {
@@ -177,6 +187,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
 
     dequantize_ops.clear();
 
+    for (int i=0; i<(int)requantize_ops.size(); i++)
+        delete requantize_ops[i];
+
+    requantize_ops.clear();    
+
     bool weight_data_is_int8 = (weight_data.elemsize == (size_t)1u);
     bool weight_data_is_float32 = (weight_data.elemsize == (size_t)4u);
 
@@ -236,7 +251,11 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
         {
             dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);
 
-            float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
+            float top_rescale = 1.f;
+            if (weight_data_int8_scales[g] == 0)
+                top_rescale = 0;
+            else
+                top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
 
             ncnn::ParamDict pd;
             pd.set(0, top_rescale);// scale
@@ -255,6 +274,50 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
     return 0;
 }
 
+int ConvolutionDepthWise::create_requantize_op(void)
+{
+    // Build one Requantize op per group; returns 0, or -1 when use_int8_requantize is off.
+    if (!use_int8_requantize)
+    {
+        fprintf(stderr, "requantized op set but use_int8_requantize disabled\n");
+        return -1;
+    }
+
+    // free ops left by an earlier call, otherwise resize() below would overwrite and leak them
+    for (int i=0; i<(int)requantize_ops.size(); i++)
+        delete requantize_ops[i];
+    requantize_ops.clear();
+
+    requantize_ops.resize(group);
+    for (int g=0; g<group; g++)
+    {
+        requantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Requantize);
+
+        // scale_in is 1 / (input scale * weight scale); a zero weight scale would
+        // divide by zero, so the factor is forced to 0 for that group instead
+        float scale_in = 0.f;
+        if (weight_data_int8_scales[g] != 0)
+            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
+
+        // scale_out is the int8 scale of the top blob
+        float scale_out = top_blob_int8_scale;
+
+        ncnn::ParamDict pd;
+        pd.set(0, scale_in);   // scale in
+        pd.set(1, scale_out);  // scale_out
+        pd.set(2, bias_term);  // bias_term
+        pd.set(3, 1);          // bias_data_size
+        requantize_ops[g]->load_param(pd);
+
+        // every group op gets its own one-element slice of the bias
+        ncnn::Mat weights[1];
+        weights[0] = bias_data.range(g, 1);
+        requantize_ops[g]->load_model(ModelBinFromMatArray(weights));
+    }
+
+    return 0;
+}
+
 int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
diff --git a/src/layer/convolutiondepthwise.h b/src/layer/convolutiondepthwise.h
index b936cdc72..c8a90e606 100644
--- a/src/layer/convolutiondepthwise.h
+++ b/src/layer/convolutiondepthwise.h
@@ -29,6 +29,8 @@ public:
 
     virtual int load_model(const ModelBin& mb);
 
+    virtual int create_requantize_op(void);
+
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 
 #if NCNN_VULKAN
@@ -92,11 +94,14 @@ public:
 
     Mat weight_data_int8_scales;
     Mat bottom_blob_int8_scales;
+    float top_blob_int8_scale;
 
     bool use_int8_inference;
+    bool use_int8_requantize;
 
     std::vector<ncnn::Layer*> quantize_ops;
     std::vector<ncnn::Layer*> dequantize_ops;
+    std::vector<ncnn::Layer*> requantize_ops;
 };
 
 } // namespace ncnn
diff --git a/src/layer/innerproduct.cpp b/src/layer/innerproduct.cpp
index 44adf09cd..4ec945dfc 100644
--- a/src/layer/innerproduct.cpp
+++ b/src/layer/innerproduct.cpp
@@ -36,7 +36,6 @@ InnerProduct::InnerProduct()
 #endif // NCNN_VULKAN
 
     quantize = 0;
-    dequantize = 0;
 }
 
 InnerProduct::~InnerProduct()
@@ -46,7 +45,11 @@ InnerProduct::~InnerProduct()
 #endif // NCNN_VULKAN
 
     delete quantize;
-    delete dequantize;
+
+    for (int i=0; i<(int)dequantize_ops.size(); i++)
+        delete dequantize_ops[i];
+
+    dequantize_ops.clear();
 }
 
 int InnerProduct::load_param(const ParamDict& pd)
@@ -92,7 +95,7 @@ int InnerProduct::load_model(const ModelBin& mb)
 
     if (int8_scale_term)
     {
-        weight_data_int8_scale = mb.load(1, 1)[0];
+        weight_data_int8_scales = mb.load(num_output, 1);
         bottom_blob_int8_scale = mb.load(1, 1)[0];
     }
 
@@ -105,25 +108,71 @@ int InnerProduct::load_model(const ModelBin& mb)
         return -1;
     }
 
+    // initialize the quantize op and the per-output dequantize ops
     if (use_int8_inference)
     {
         quantize = ncnn::create_layer(ncnn::LayerType::Quantize);
-        dequantize = ncnn::create_layer(ncnn::LayerType::Dequantize);
+        {
+            ncnn::ParamDict pd;
+            pd.set(0, bottom_blob_int8_scale);// scale
+
+            quantize->load_param(pd);
+        }
+
+        dequantize_ops.resize(num_output);
+        for (int n=0; n<num_output; n++)
+        {
+            dequantize_ops[n] = ncnn::create_layer(ncnn::LayerType::Dequantize);
+
+            float top_rescale = 1.f;
+
+            if (weight_data_int8_scales[n] == 0)
+                top_rescale = 0;
+            else
+                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[n]);
+
+            ncnn::ParamDict pd;
+            pd.set(0, top_rescale);// scale
+            pd.set(1, bias_term);  // bias_term
+            pd.set(2, 1);          // bias_data_size
+
+            dequantize_ops[n]->load_param(pd);
+
+            ncnn::Mat weights[1];
+            weights[0] = bias_data.range(n, 1);
+
+            dequantize_ops[n]->load_model(ModelBinFromMatArray(weights));
+        }
     }
 
+    // runtime quantize the weight data
     if (weight_data_is_float32 && use_int8_inference)
     {
         // quantize weight to int8
-        ncnn::ParamDict pd;
-        pd.set(0, weight_data_int8_scale);// scale
+        Mat int8_weight_data(weight_data_size, (size_t)1u);
+        if (int8_weight_data.empty())
+            return -100;
 
-        quantize->load_param(pd);
+        const int weight_data_size_output = weight_data_size / num_output;
 
-        Mat int8_weight_data;
-        quantize->forward(weight_data, int8_weight_data);
+        for (int n=0; n<num_output; n++)
+        {
+            Layer* op = ncnn::create_layer(ncnn::LayerType::Quantize);
 
-        if (int8_weight_data.empty())
-            return -100;
+            ncnn::ParamDict pd;
+            pd.set(0, weight_data_int8_scales[n]);// scale
+
+            op->load_param(pd);
+
+            ncnn::Option opt = ncnn::get_default_option();
+            opt.blob_allocator = int8_weight_data.allocator;
+
+            const Mat weight_data_n = weight_data.range(weight_data_size_output * n, weight_data_size_output);
+            Mat int8_weight_data_n = int8_weight_data.range(weight_data_size_output * n, weight_data_size_output);
+            op->forward(weight_data_n, int8_weight_data_n, opt);
+
+            delete op;
+        }
 
         weight_data = int8_weight_data;
     }
@@ -152,12 +201,10 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
 
         // quantize, scale and round to nearest
         {
-            ncnn::ParamDict pd;
-            pd.set(0, bottom_blob_int8_scale);// scale
+            ncnn::Option opt_g = opt;
+            opt_g.blob_allocator = bottom_blob_int8.allocator;
 
-            quantize->load_param(pd);
-
-            quantize->forward(bottom_blob, bottom_blob_int8, opt);
+            quantize->forward(bottom_blob, bottom_blob_int8, opt_g);
         }
 
         // num_output
@@ -179,26 +226,24 @@ int InnerProduct::forward(const Mat& bottom_blob, Mat& top_blob, const Option& o
                 }
             }
 
-            out[p] = sum;
+            out[p] = sum;       
         }
 
-        // dequantize, reverse scale inplace
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<num_output; p++)
         {
-            float top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scale);
-
-            ncnn::ParamDict pd;
-            pd.set(0, top_rescale);// scale
-            pd.set(1, bias_term);// bias_term
-            pd.set(2, num_output);// bias_data_size
-
-            dequantize->load_param(pd);
-
-            ncnn::Mat weights[1];
-            weights[0] = bias_data;
-
-            dequantize->load_model(ModelBinFromMatArray(weights));
-
-            dequantize->forward_inplace(top_blob, opt);
+            int* out_s32 = top_blob;
+            float* out_f32 = top_blob;
+            float top_rescale = 1.f;
+            if (weight_data_int8_scales[p] == 0)
+                top_rescale = 0;
+            else
+                top_rescale = 1.f / (bottom_blob_int8_scale * weight_data_int8_scales[p]);
+
+            if (bias_term)
+                out_f32[p] = out_s32[p] * top_rescale + bias_data[p];
+            else
+                out_f32[p] = out_s32[p] * top_rescale;
         }
 
         return 0;
diff --git a/src/layer/innerproduct.h b/src/layer/innerproduct.h
index 0f9b151af..8ac451f43 100644
--- a/src/layer/innerproduct.h
+++ b/src/layer/innerproduct.h
@@ -76,13 +76,13 @@ public:
     Pipeline* pipeline_innerproduct_pack4to1;
 #endif // NCNN_VULKAN
 
-    float weight_data_int8_scale;
+    Mat weight_data_int8_scales;
     float bottom_blob_int8_scale;
 
     bool use_int8_inference;
 
     ncnn::Layer* quantize;
-    ncnn::Layer* dequantize;
+    std::vector<ncnn::Layer*> dequantize_ops;
 };
 
 } // namespace ncnn
diff --git a/src/layer/relu.cpp b/src/layer/relu.cpp
index e1c6bd2ae..dfffd8918 100644
--- a/src/layer/relu.cpp
+++ b/src/layer/relu.cpp
@@ -38,8 +38,51 @@ int ReLU::load_param(const ParamDict& pd)
     return 0;
 }
 
+int ReLU::forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const
+{
+    int w = bottom_top_blob.w;
+    int h = bottom_top_blob.h;
+    int channels = bottom_top_blob.c;
+    int size = w * h;
+    // int8 ReLU: each channel is read as signed char and negative values are set to 0 in place
+    if (slope == 0.f)
+    {
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<channels; q++)
+        {
+            signed char* ptr = bottom_top_blob.channel(q);
+
+            for (int i=0; i<size; i++)
+            {
+                if (ptr[i] < 0)
+                    ptr[i] = 0;
+            }
+        }
+    }
+    else
+    {
+        // TODO: leaky ReLU (slope != 0) is not implemented for int8; NOTE(review): the blob is left unchanged and 0 is still returned
+        // #pragma omp parallel for num_threads(opt.num_threads)
+        // for (int q=0; q<channels; q++)
+        // {
+        //     float* ptr = bottom_top_blob.channel(q);
+
+        //     for (int i=0; i<size; i++)
+        //     {
+        //         if (ptr[i] < 0)
+        //             ptr[i] *= slope;
+        //     }
+        // }
+    }
+
+    return 0;
+}
+
 int ReLU::forward_inplace(Mat& bottom_top_blob, const Option& opt) const
 {
+    if (bottom_top_blob.elemsize == 1u)
+        return ReLU::forward_inplace_int8(bottom_top_blob, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
diff --git a/src/layer/relu.h b/src/layer/relu.h
index aa62683ca..678a6a6e9 100644
--- a/src/layer/relu.h
+++ b/src/layer/relu.h
@@ -27,6 +27,7 @@ public:
     virtual int load_param(const ParamDict& pd);
 
     virtual int forward_inplace(Mat& bottom_top_blob, const Option& opt) const;
+    virtual int forward_inplace_int8(Mat& bottom_top_blob, const Option& opt) const;
 
 #if NCNN_VULKAN
     virtual int create_pipeline();
diff --git a/src/layer/requantize.cpp b/src/layer/requantize.cpp
new file mode 100644
index 000000000..16bf5ee3a
--- /dev/null
+++ b/src/layer/requantize.cpp
@@ -0,0 +1,195 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "requantize.h"
+
+#include <math.h>
+
+namespace ncnn {
+
+DEFINE_LAYER_CREATOR(Requantize)
+
+Requantize::Requantize()
+{
+    one_blob_only = true;
+    support_inplace = false;  // forward() reads int and writes signed char, so input and output are separate blobs
+    fusion_relu = false;      // off until load_param() reads param id 4
+}
+
+static inline signed char float2int8(float v)
+{
+    // Round to nearest and saturate to [-128, 127]. The range is checked on the float first,
+    // because converting an out-of-range, Inf or NaN float to int is undefined; NaN yields -128.
+    if (!(v > -128.f)) return -128;
+    if (v >= 127.f) return 127;
+    return (signed char)(int)round(v);
+}
+
+int Requantize::load_param(const ParamDict& pd)
+{
+    scale_in = pd.get(0, 1.f);	// multiplier for the int32 input; the depthwise caller passes 1 / (bottom_blob_scale * weight_scale)
+	scale_out = pd.get(1, 1.f);	// multiplier applied after the bias (top_blob_scale)
+    bias_term = pd.get(2, 0);       // non-zero: load_model() reads a bias and forward() adds it
+    bias_data_size = pd.get(3, 0);  // number of bias values; forward() shares bias_data[0] unless this is > 1
+    fusion_relu = pd.get(4, 0);     // non-zero: forward() also sets negative outputs to 0
+    // always succeeds; missing ids fall back to the defaults given above
+    return 0;
+}
+
+int Requantize::load_model(const ModelBin& mb)
+{
+    if (bias_term)
+    {
+        bias_data = mb.load(bias_data_size, 1);
+        if (bias_data.empty())
+            return -100;  // the bias could not be loaded
+    }
+    // without a bias term nothing is read from the model
+    return 0;
+}
+
+int Requantize::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{ 
+    int dims = bottom_blob.dims;
+    // int32 -> int8: out = float2int8((in * scale_in + bias) * scale_out), then negatives set to 0 if fusion_relu
+    if (dims == 1)
+    {
+        int w = bottom_blob.w;
+        // NOTE(review): top_blob is written without being created here - presumably the caller allocates it; confirm
+        const int* intptr = bottom_blob;
+        signed char * ptr = top_blob;
+
+        if (bias_term)
+        {
+            if (bias_data_size > 1) // one bias value per element
+            {
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int i=0; i<w; i++)
+                {
+                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias_data[i]) * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+            else // a single bias value shared by every element
+            {
+                float bias = bias_data[0];
+                #pragma omp parallel for num_threads(opt.num_threads)
+                for (int i=0; i<w; i++)
+                {
+                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<w; i++)
+            {
+                ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
+                if (fusion_relu && ptr[i] < 0)
+                    ptr[i] = 0;
+            }
+        }
+    }
+
+    if (dims == 2)
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+
+        if (bias_term)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+                // one bias per row, or the single shared value
+                float bias = bias_data_size > 1 ? bias_data[i] : bias_data[0];
+
+                for (int j=0; j<w; j++)
+                {
+                    ptr[j] = float2int8(((intptr[j] * scale_in) + bias) * scale_out);
+                    if (fusion_relu && ptr[j] < 0)
+                        ptr[j] = 0;
+                }
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i=0; i<h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+
+                for (int j=0; j<w; j++)
+                {
+                    ptr[j] = float2int8(intptr[j] * scale_in * scale_out);
+                    if (fusion_relu && ptr[j] < 0)
+                        ptr[j] = 0;
+                }
+            }
+        }
+    }
+
+    if (dims == 3)
+    {
+        int w = bottom_blob.w;
+        int h = bottom_blob.h;
+        int channels = bottom_blob.c;
+        int size = w * h;      
+
+        if (bias_term)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q=0; q<channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+                // one bias per channel, or the single shared value
+                float bias = bias_data_size > 1 ? bias_data[q] : bias_data[0];
+
+                for (int i=0; i<size; i++)
+                {
+                    ptr[i] = float2int8(((intptr[i] * scale_in) + bias) * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+        }
+        else
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q=0; q<channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+
+                for (int i=0; i<size; i++)
+                {
+                    ptr[i] = float2int8(intptr[i] * scale_in * scale_out);
+                    if (fusion_relu && ptr[i] < 0)
+                        ptr[i] = 0;
+                }
+            }
+        }    
+    }
+    // any other dims value leaves top_blob untouched; the function returns 0 in every case
+    return 0;
+}
+
+} // namespace ncnn
\ No newline at end of file
diff --git a/src/layer/requantize.h b/src/layer/requantize.h
new file mode 100644
index 000000000..ca04293f6
--- /dev/null
+++ b/src/layer/requantize.h
@@ -0,0 +1,46 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#ifndef LAYER_REQUANTIZE_H
+#define LAYER_REQUANTIZE_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+class Requantize : public Layer
+{
+public: // converts an int32 blob to int8: out = float2int8((in * scale_in + bias) * scale_out)
+    Requantize();
+    // reads param ids 0..4 into scale_in, scale_out, bias_term, bias_data_size, fusion_relu
+    virtual int load_param(const ParamDict& pd);
+    // loads bias_data (bias_data_size elements) when bias_term is set; returns -100 if that load comes back empty
+    virtual int load_model(const ModelBin& mb);
+    // NOTE(review): writes into top_blob without creating it - presumably the caller passes an allocated blob
+    virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+public:
+    float scale_in;	// multiplier for the int32 input; the depthwise caller passes 1 / (bottom_blob_scale * weight_scale)
+	float scale_out;// multiplier applied after the bias; the depthwise caller passes its top_blob_int8_scale
+    int bias_term;       // non-zero when a bias is added before scale_out is applied
+    int bias_data_size;  // number of bias values; forward() uses bias_data[0] for everything unless this is > 1
+
+    bool fusion_relu;    // when set, negative int8 results are replaced by 0
+
+    Mat bias_data;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_REQUANTIZE_H
\ No newline at end of file
diff --git a/src/layer/x86/convolution_3x3.h b/src/layer/x86/convolution_3x3.h
index cc7ecd3bf..48afab45e 100644
--- a/src/layer/x86/convolution_3x3.h
+++ b/src/layer/x86/convolution_3x3.h
@@ -1,6 +1,7 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
@@ -138,3 +139,496 @@ static void conv3x3s1_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& _ker
     }
 
 }
+
+static void conv3x3s1_winograd23_transform_kernel_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
+{
+    kernel_tm.create(4*4, inch, outch);
+    // Winograd F(2,3) kernel transform: every 3x3 float kernel becomes 16 floats stored in row q of channel p.
+    // G
+    const float ktm[4][3] = {
+        {   1.0f,     0.0f,     0.0f},
+        { 1.0f/2,   1.0f/2,   1.0f/2},
+        { 1.0f/2,  -1.0f/2,   1.0f/2},
+        {   0.0f,     0.0f,     1.0f}
+    };
+
+    #pragma omp parallel for
+    for (int p = 0; p<outch; p++)
+    {
+        for (int q = 0; q<inch; q++)
+        {
+            const float* kernel0 = (const float*)kernel + p*inch * 9 + q * 9; // kernel is read as outch x inch x 9 contiguous floats
+            float* kernel_tm0 = kernel_tm.channel(p).row(q);
+
+            // transform kernel
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 3;
+            const float* k2 = kernel0 + 6;
+
+            // h: tmp[i][r] = dot(G row i, kernel row r)
+            float tmp[4][3];
+            for (int i=0; i<4; i++)
+            {
+                tmp[i][0] = k0[0] * ktm[i][0] + k0[1] * ktm[i][1] + k0[2] * ktm[i][2];
+                tmp[i][1] = k1[0] * ktm[i][0] + k1[1] * ktm[i][1] + k1[2] * ktm[i][2];
+                tmp[i][2] = k2[0] * ktm[i][0] + k2[1] * ktm[i][1] + k2[2] * ktm[i][2];
+            }
+
+            // U: kernel_tm0[j*4 + i] = dot(tmp[j], G row i)
+            for (int j=0; j<4; j++)
+            {
+                float* tmpp = &tmp[j][0];
+
+                for (int i=0; i<4; i++)
+                {
+                    kernel_tm0[j*4 + i] = tmpp[0] * ktm[i][0] + tmpp[1] * ktm[i][1] + tmpp[2] * ktm[i][2];
+                }
+            }
+        }
+    }
+}
+
+static void conv3x3s1_winograd23_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Mat& _bias, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+    // Winograd F(2,3) 3x3 convolution: transform input tiles, multiply with kernel_tm, transform back, crop.
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    // pad to 2n+2, winograd F(2,3)
+    Mat bottom_blob_bordered = bottom_blob;
+
+    outw = (outw + 1) / 2 * 2; // round the output size up to a multiple of 2
+    outh = (outh + 1) / 2 * 2;
+
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);
+
+    const float* bias = _bias;    
+
+    // BEGIN transform input
+    Mat bottom_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tiles along the height
+        int nRowBlocks = w_tm/4; // number of tiles along the width
+
+        const int tiles = nColBlocks * nRowBlocks;
+
+        bottom_blob_tm.create(4*4, tiles, inch, 4u, opt.workspace_allocator);
+
+        // BT
+        // const float itm[4][4] = {
+        //     {1.0f,  0.0f, -1.0f,  0.0f},
+        //     {0.0f,  1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  0.00f, 1.0f}
+        // };        
+
+        for (int q=0; q<inch; q++)
+        {
+            const float* img = bottom_blob_bordered.channel(q);
+            float* out_tm0 = bottom_blob_tm.channel(q);
+
+            for (int j = 0; j < nColBlocks; j++)
+            {
+                const float* r0 = img + w * j * 2; // tiles are 4 rows high and start every 2 rows
+                const float* r1 = r0 + w;
+                const float* r2 = r1 + w;
+                const float* r3 = r2 + w;
+
+                for (int i = 0; i < nRowBlocks; i++)
+                {
+                    float d0[4],d1[4],d2[4],d3[4];
+                    float w0[4],w1[4],w2[4],w3[4];
+                    float t0[4],t1[4],t2[4],t3[4];
+                    // load
+                    for (int n = 0; n < 4; n++)
+                    {
+                        d0[n] = r0[n];
+                        d1[n] = r1[n];
+                        d2[n] = r2[n];
+                        d3[n] = r3[n];
+                    }                                  
+                    // w = B_t * d
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        w0[n] = d0[n] - d2[n];
+                        w1[n] = d1[n] + d2[n];
+                        w2[n] = d2[n] - d1[n];
+                        w3[n] = d3[n] - d1[n];
+                    }                                
+                    // transpose w into t
+                    {
+                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
+                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
+                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
+                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
+                    }
+                    // d = B_t * t
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        d0[n] = t0[n] - t2[n];
+                        d1[n] = t1[n] + t2[n];
+                        d2[n] = t2[n] - t1[n];
+                        d3[n] = t3[n] - t1[n];
+                    }
+                    // save to out_tm
+                    for (int n = 0; n < 4; n++)
+                    {
+                        out_tm0[n   ] = d0[n];
+                        out_tm0[n+ 4] = d1[n];
+                        out_tm0[n+ 8] = d2[n];
+                        out_tm0[n+12] = d3[n];
+                    }                  
+                    // neighbouring tiles overlap: the source window moves by 2 columns
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+
+                    out_tm0 += 16;
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat(); // the padded input is no longer needed
+
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tiles along the height
+        int nRowBlocks = w_tm/4; // number of tiles along the width
+
+        const int tiles = nColBlocks * nRowBlocks; 
+
+        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
+
+        int nn_outch = outch >> 2;
+        int remain_outch_start = nn_outch << 2;
+        // first pass: four output channels per iteration; the leftover channels are handled further down
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = pp * 4;
+
+            Mat out0_tm = top_blob_tm.channel(p);
+            Mat out1_tm = top_blob_tm.channel(p+1);
+            Mat out2_tm = top_blob_tm.channel(p+2);
+            Mat out3_tm = top_blob_tm.channel(p+3);
+
+            const Mat kernel0_tm = kernel_tm.channel(p);
+            const Mat kernel1_tm = kernel_tm.channel(p+1);
+            const Mat kernel2_tm = kernel_tm.channel(p+2);
+            const Mat kernel3_tm = kernel_tm.channel(p+3);
+
+            for (int i=0; i<tiles; i++)
+            {
+                float* output0_tm = out0_tm.row(i);
+                float* output1_tm = out1_tm.row(i);
+                float* output2_tm = out2_tm.row(i);
+                float* output3_tm = out3_tm.row(i);
+
+                float sum0[16] = {0.0f};
+                float sum1[16] = {0.0f};
+                float sum2[16] = {0.0f};
+                float sum3[16] = {0.0f};
+
+                int q = 0;
+                for (; q+3<inch; q+=4)
+                {   
+                    const float* r0 = bottom_blob_tm.channel(q).row(i);
+                    const float* r1 = bottom_blob_tm.channel(q+1).row(i);
+                    const float* r2 = bottom_blob_tm.channel(q+2).row(i);
+                    const float* r3 = bottom_blob_tm.channel(q+3).row(i);
+
+                    const float* k0 = kernel0_tm.row(q);
+                    const float* k1 = kernel1_tm.row(q);
+                    const float* k2 = kernel2_tm.row(q);
+                    const float* k3 = kernel3_tm.row(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += r0[n] * k0[n];
+                        k0 += 16; // next input channel's row; assumes kernel_tm rows are 16 contiguous floats
+                        sum0[n] += r1[n] * k0[n];
+                        k0 += 16;
+                        sum0[n] += r2[n] * k0[n];
+                        k0 += 16;
+                        sum0[n] += r3[n] * k0[n];
+                        k0 -= 16 * 3; // back to row q for the next n
+
+                        sum1[n] += r0[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += r1[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += r2[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += r3[n] * k1[n];
+                        k1 -= 16 * 3;
+
+                        sum2[n] += r0[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += r1[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += r2[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += r3[n] * k2[n];
+                        k2 -= 16 * 3;
+
+                        sum3[n] += r0[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += r1[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += r2[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += r3[n] * k3[n];
+                        k3 -= 16 * 3;
+                    }
+                }
+                // leftover input channels, one at a time
+                for (; q<inch; q++)
+                {
+                    const float* r0 = bottom_blob_tm.channel(q).row(i);
+
+                    const float* k0 = kernel0_tm.row(q);
+                    const float* k1 = kernel1_tm.row(q);
+                    const float* k2 = kernel2_tm.row(q);
+                    const float* k3 = kernel3_tm.row(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += r0[n] * k0[n];
+                        sum1[n] += r0[n] * k1[n];
+                        sum2[n] += r0[n] * k2[n];
+                        sum3[n] += r0[n] * k3[n];
+                    }
+                }
+
+                for (int n=0; n<16; n++)
+                {
+                    output0_tm[n] = sum0[n];
+                    output1_tm[n] = sum1[n];
+                    output2_tm[n] = sum2[n];
+                    output3_tm[n] = sum3[n];
+                }
+            }
+        }
+        // second pass: output channels left over after the groups of four
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=remain_outch_start; p<outch; p++)
+        {
+            Mat out0_tm = top_blob_tm.channel(p);
+            const Mat kernel0_tm = kernel_tm.channel(p);
+
+            for (int i=0; i<tiles; i++)
+            {
+                float* output0_tm = out0_tm.row(i);
+
+                float sum0[16] = {0.0f};
+
+                int q = 0;
+                for (; q+3<inch; q+=4)
+                {   
+                    const float* r0 = bottom_blob_tm.channel(q).row(i);
+                    const float* r1 = bottom_blob_tm.channel(q+1).row(i);
+                    const float* r2 = bottom_blob_tm.channel(q+2).row(i);
+                    const float* r3 = bottom_blob_tm.channel(q+3).row(i);
+
+                    const float* k0 = kernel0_tm.row(q);
+                    const float* k1 = kernel0_tm.row(q+1);
+                    const float* k2 = kernel0_tm.row(q+2);
+                    const float* k3 = kernel0_tm.row(q+3);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += r0[n] * k0[n];
+                        sum0[n] += r1[n] * k1[n];
+                        sum0[n] += r2[n] * k2[n];
+                        sum0[n] += r3[n] * k3[n];
+                    }
+                }
+
+                for (; q<inch; q++)
+                {
+                    const float* r0 = bottom_blob_tm.channel(q).row(i);
+                    const float* k0 = kernel0_tm.row(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += r0[n] * k0[n];
+                    }             
+                }
+
+                for (int n=0; n<16; n++)
+                {
+                    output0_tm[n] = sum0[n];
+                }
+            }
+        }
+    }
+    bottom_blob_tm = Mat(); // the transformed input is no longer needed
+    // END dot
+
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
+    {
+        // AT
+        // const float itm[2][4] = {
+        //     {1.0f,  1.0f,  1.0f,  0.0f},
+        //     {0.0f,  1.0f, -1.0f,  1.0f}
+        // }; 
+
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tiles along the height
+        int nRowBlocks = w_tm/4; // number of tiles along the width
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<outch; p++)
+        {
+            Mat out_tm = top_blob_tm.channel(p);
+            Mat out = top_blob_bordered.channel(p);
+
+            const float bias0 = bias ? bias[p] : 0.f;
+
+            for (int j=0; j<nColBlocks; j++)
+            {
+                float* outRow0 = out.row(j*2);
+                float* outRow1 = out.row(j*2+1);
+
+                for(int i=0; i<nRowBlocks; i++)
+                {
+                    float* out_tile = out_tm.row(j*nRowBlocks + i);
+
+                    float s0[4],s1[4],s2[4],s3[4];
+                    float w0[4],w1[4];
+                    float d0[2],d1[2],d2[2],d3[2];
+                    float o0[2],o1[2];
+                    // load
+                    for (int n = 0; n < 4; n++)
+                    {
+                        s0[n] = out_tile[n];
+                        s1[n] = out_tile[n+ 4];
+                        s2[n] = out_tile[n+ 8];
+                        s3[n] = out_tile[n+12];
+                    }
+                    // w = A_T * s
+                    for (int n = 0; n < 4; n++)
+                    {
+                        w0[n] = s0[n] + s1[n] + s2[n];
+                        w1[n] = s1[n] - s2[n] + s3[n];
+                    }
+                    // transpose w into d
+                    {
+                        d0[0] = w0[0]; d0[1] = w1[0];
+                        d1[0] = w0[1]; d1[1] = w1[1];
+                        d2[0] = w0[2]; d2[1] = w1[2];
+                        d3[0] = w0[3]; d3[1] = w1[3];
+                    }
+                    // Y = A_T * d, plus the channel bias
+                    for (int n = 0; n < 2; n++)
+                    {
+                        o0[n] = d0[n] + d1[n] + d2[n] + bias0;
+                        o1[n] = d1[n] - d2[n] + d3[n] + bias0;
+                    }
+                    // store the 2x2 output tile
+                    outRow0[0] = o0[0];
+                    outRow0[1] = o0[1];
+                    outRow1[0] = o1[0];
+                    outRow1[1] = o1[1];
+
+                    outRow0 += 2;
+                    outRow1 += 2;      
+                }
+            }
+        }        
+    }
+    // END transform output 
+
+    // cut result pad
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);
+}
+
+static void conv3x3s2_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Mat& _bias, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+    // Plain 3x3 stride-2 float convolution; the output size is taken from top_blob, which is not created here.
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2 * outw + w; // rest of the current input row plus one full row (vertical stride 2)
+
+    const float* kernel = _kernel;
+    const float* bias = _bias;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        const float bias0 = bias ? bias[p] : 0.f;
+        // start from the bias, then accumulate every input channel's contribution
+        out.fill(bias0);
+
+        for (int q = 0; q < inch; q++)
+        {
+            float *outptr = out;
+
+            const float *img = bottom_blob.channel(q);
+            const float* kernel0 = kernel + p*inch*9  + q*9; // kernel is read as outch x inch x 9 contiguous floats
+
+            const float *r0 = img;
+            const float *r1 = img + w;
+            const float *r2 = img + w * 2;
+
+            const float* k0 = kernel0;
+            const float* k1 = kernel0 + 3;
+            const float* k2 = kernel0 + 6;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    float sum = 0;
+
+                    sum += r0[0] * k0[0];
+                    sum += r0[1] * k0[1];
+                    sum += r0[2] * k0[2];
+                    sum += r1[0] * k1[0];
+                    sum += r1[1] * k1[1];
+                    sum += r1[2] * k1[2];
+                    sum += r2[0] * k2[0];
+                    sum += r2[1] * k2[1];
+                    sum += r2[2] * k2[2];
+
+                    *outptr += sum;
+                    // horizontal stride 2
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    outptr++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/layer/x86/convolution_3x3_int8.h b/src/layer/x86/convolution_3x3_int8.h
index 4fd8f0ec8..4f1fc6407 100644
--- a/src/layer/x86/convolution_3x3_int8.h
+++ b/src/layer/x86/convolution_3x3_int8.h
@@ -11,12 +11,6 @@
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.
-static inline short saturate2int16(int v)
-{
-    if (v > 32767) return 32767;
-    if (v < -32768) return -32768;
-    return (short)v;
-}
 
 static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
@@ -84,6 +78,424 @@ static void conv3x3s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat
     }
 }
 
+// Pre-transform int8 3x3 weights for winograd F(2,3): each (outch, inch) kernel becomes 16 shorts in kernel_tm.
+static void conv3x3s1_winograd23_transform_kernel_int8_sse(const Mat& kernel, Mat& kernel_tm, int inch, int outch)
+{
+    kernel_tm.create(4*4, inch, outch, 2ul);
+
+    // G with every entry doubled so the transform stays in integers
+    const short G2[4][3] = {
+        { 2,  0, 0},
+        { 1,  1, 1},
+        { 1, -1, 1},
+        { 0,  0, 2}
+    };
+
+    const signed char* weights = (const signed char*)kernel;
+
+    #pragma omp parallel for
+    for (int oc = 0; oc < outch; oc++)
+    {
+        Mat dst_channel = kernel_tm.channel(oc);
+
+        for (int ic = 0; ic < inch; ic++)
+        {
+            const signed char* g = weights + oc*inch * 9 + ic * 9;
+            short* dst = dst_channel.row<short>(ic);
+
+            // half[a][r] = dot(kernel row r, G2 row a)
+            short half[4][3];
+            for (int a = 0; a < 4; a++)
+            {
+                for (int r = 0; r < 3; r++)
+                {
+                    const signed char* grow = g + r * 3;
+                    half[a][r] = (short)grow[0] * G2[a][0] + grow[1] * G2[a][1] + grow[2] * G2[a][2];
+                }
+            }
+
+            // dst[a*4 + b] = dot(half[a], G2 row b)
+            for (int a = 0; a < 4; a++)
+            {
+                for (int b = 0; b < 4; b++)
+                {
+                    dst[a*4 + b] = half[a][0] * G2[b][0] + half[a][1] * G2[b][1] + half[a][2] * G2[b][2];
+                }
+            }
+        }
+    }
+}
+
+static void conv3x3s1_winograd23_int8_sse(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel_tm, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+    // Stages: pad input -> transform each 4x4 input tile -> per-tile multiply-accumulate against kernel_tm -> reduce each 4x4 result to a 2x2 output -> crop.
+    // winograd F(2,3): round the output size up to even, then pad the input at the bottom/right to (output + 2)
+    Mat bottom_blob_bordered = bottom_blob;
+
+    outw = (outw + 1) / 2 * 2;
+    outh = (outh + 1) / 2 * 2;
+    // each 4x4 input tile yields a 2x2 output and tiles advance by 2, hence input = output + 2
+    w = outw + 2;
+    h = outh + 2;
+    copy_make_border(bottom_blob, bottom_blob_bordered, 0, h - bottom_blob.h, 0, w - bottom_blob.w, 0, 0.f, opt.workspace_allocator, opt.num_threads);  
+
+    // BEGIN transform input
+    Mat bottom_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tile rows (outh / 2)
+        int nRowBlocks = w_tm/4;
+
+        const int tiles = nColBlocks * nRowBlocks;
+        // per input channel: one row of 16 shorts for every tile
+        bottom_blob_tm.create(4*4, tiles, inch, 2u, opt.workspace_allocator);
+
+        // BT
+        // const float itm[4][4] = {
+        //     {1.0f,  0.0f, -1.0f,  0.0f},
+        //     {0.0f,  1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  1.00f, 0.0f},
+        //     {0.0f, -1.0f,  0.00f, 1.0f}
+        // };
+        // applied below with integer adds/subtracts only: rows become (r0-r2, r1+r2, r2-r1, r3-r1)
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q=0; q<inch; q++)
+        {
+            const signed char* img = bottom_blob_bordered.channel(q);
+            short* out_tm0 = bottom_blob_tm.channel(q);
+
+            for (int j = 0; j < nColBlocks; j++)
+            {
+                const signed char* r0 = img + w * j * 2;
+                const signed char* r1 = r0 + w;
+                const signed char* r2 = r1 + w;
+                const signed char* r3 = r2 + w;
+
+                for (int i = 0; i < nRowBlocks; i++)
+                {
+                    short d0[4],d1[4],d2[4],d3[4];
+                    short w0[4],w1[4],w2[4],w3[4];
+                    short t0[4],t1[4],t2[4],t3[4];
+                    // load a 4x4 int8 tile, widened to short
+                    for (int n = 0; n < 4; n++)
+                    {
+                        d0[n] = r0[n];
+                        d1[n] = r1[n];
+                        d2[n] = r2[n];
+                        d3[n] = r3[n];
+                    }                                  
+                    // w = B_t * d
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        w0[n] = d0[n] - d2[n];
+                        w1[n] = d1[n] + d2[n];
+                        w2[n] = d2[n] - d1[n];
+                        w3[n] = d3[n] - d1[n];
+                    }                                
+                    // transpose w into t
+                    {
+                        t0[0]=w0[0]; t1[0]=w0[1]; t2[0]=w0[2]; t3[0]=w0[3];
+                        t0[1]=w1[0]; t1[1]=w1[1]; t2[1]=w1[2]; t3[1]=w1[3];
+                        t0[2]=w2[0]; t1[2]=w2[1]; t2[2]=w2[2]; t3[2]=w2[3];
+                        t0[3]=w3[0]; t1[3]=w3[1]; t2[3]=w3[2]; t3[3]=w3[3];
+                    }
+                    // second pass of B_t on the transposed data; d0..d3 are reused for the result
+                    for (int n = 0; n < 4; n++)
+                    {   
+                        d0[n] = t0[n] - t2[n];
+                        d1[n] = t1[n] + t2[n];
+                        d2[n] = t2[n] - t1[n];
+                        d3[n] = t3[n] - t1[n];
+                    }                
+                    // save the 16 transformed values of this tile to out_tm
+                    for (int n = 0; n < 4; n++)
+                    {
+                        out_tm0[n   ] = d0[n];
+                        out_tm0[n+ 4] = d1[n];
+                        out_tm0[n+ 8] = d2[n];
+                        out_tm0[n+12] = d3[n];
+                    }                  
+                    // tiles overlap: the next one starts 2 input columns further right
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+
+                    out_tm0 += 16;
+                }
+            }
+        }
+    }
+    bottom_blob_bordered = Mat();
+    
+    // BEGIN dot
+    Mat top_blob_tm;
+    {
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tile rows (outh / 2)
+        int nRowBlocks = w_tm/4;
+
+        const int tiles = nColBlocks * nRowBlocks; 
+        // per output channel: one row of 16 int accumulators for every tile
+        top_blob_tm.create(16, tiles, outch, 4u, opt.workspace_allocator);
+        // output channels are handled four at a time; the outch % 4 leftovers get their own loop below
+        int nn_outch = outch >> 2;
+        int remain_outch_start = nn_outch << 2;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = pp * 4;
+
+            Mat out0_tm = top_blob_tm.channel(p);
+            Mat out1_tm = top_blob_tm.channel(p+1);
+            Mat out2_tm = top_blob_tm.channel(p+2);
+            Mat out3_tm = top_blob_tm.channel(p+3);
+
+            const Mat kernel0_tm = kernel_tm.channel(p);
+            const Mat kernel1_tm = kernel_tm.channel(p+1);
+            const Mat kernel2_tm = kernel_tm.channel(p+2);
+            const Mat kernel3_tm = kernel_tm.channel(p+3);
+
+            for (int i=0; i<tiles; i++)
+            {
+                int* output0_tm = out0_tm.row<int>(i);
+                int* output1_tm = out1_tm.row<int>(i);
+                int* output2_tm = out2_tm.row<int>(i);
+                int* output3_tm = out3_tm.row<int>(i);
+
+                int sum0[16] = {0};
+                int sum1[16] = {0};
+                int sum2[16] = {0};
+                int sum3[16] = {0};
+                // input channels unrolled by 4: k0..k3 step +16 shorts per input channel and are rewound for the next n (assumes kernel_tm rows are 16 contiguous shorts)
+                int q = 0;
+                for (; q+3<inch; q+=4)
+                {   
+                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
+                    const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
+                    const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
+                    const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);
+
+                    const short* k0 = kernel0_tm.row<short>(q);
+                    const short* k1 = kernel1_tm.row<short>(q);
+                    const short* k2 = kernel2_tm.row<short>(q);
+                    const short* k3 = kernel3_tm.row<short>(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += (int)r0[n] * k0[n];
+                        k0 += 16;
+                        sum0[n] += (int)r1[n] * k0[n];
+                        k0 += 16;
+                        sum0[n] += (int)r2[n] * k0[n];
+                        k0 += 16;
+                        sum0[n] += (int)r3[n] * k0[n];
+                        k0 -= 16 * 3;
+
+                        sum1[n] += (int)r0[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += (int)r1[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += (int)r2[n] * k1[n];
+                        k1 += 16;
+                        sum1[n] += (int)r3[n] * k1[n];
+                        k1 -= 16 * 3;
+
+                        sum2[n] += (int)r0[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += (int)r1[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += (int)r2[n] * k2[n];
+                        k2 += 16;
+                        sum2[n] += (int)r3[n] * k2[n];
+                        k2 -= 16 * 3;
+
+                        sum3[n] += (int)r0[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += (int)r1[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += (int)r2[n] * k3[n];
+                        k3 += 16;
+                        sum3[n] += (int)r3[n] * k3[n];
+                        k3 -= 16 * 3;
+                    }
+                }
+                // remaining input channels (inch % 4)
+                for (; q<inch; q++)
+                {
+                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
+
+                    const short* k0 = kernel0_tm.row<short>(q);
+                    const short* k1 = kernel1_tm.row<short>(q);
+                    const short* k2 = kernel2_tm.row<short>(q);
+                    const short* k3 = kernel3_tm.row<short>(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += (int)r0[n] * k0[n];
+                        sum1[n] += (int)r0[n] * k1[n];
+                        sum2[n] += (int)r0[n] * k2[n];
+                        sum3[n] += (int)r0[n] * k3[n];
+                    }
+                }
+
+                for (int n=0; n<16; n++)
+                {
+                    output0_tm[n] = sum0[n];
+                    output1_tm[n] = sum1[n];
+                    output2_tm[n] = sum2[n];
+                    output3_tm[n] = sum3[n];
+                }
+            }
+        }
+        // leftover output channels, one at a time
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=remain_outch_start; p<outch; p++)
+        {
+            Mat out0_tm = top_blob_tm.channel(p);
+            const Mat kernel0_tm = kernel_tm.channel(p);
+
+            for (int i=0; i<tiles; i++)
+            {
+                int* output0_tm = out0_tm.row<int>(i);
+
+                int sum0[16] = {0};
+
+                int q = 0;
+                for (; q+3<inch; q+=4)
+                {   
+                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
+                    const short* r1 = bottom_blob_tm.channel(q+1).row<short>(i);
+                    const short* r2 = bottom_blob_tm.channel(q+2).row<short>(i);
+                    const short* r3 = bottom_blob_tm.channel(q+3).row<short>(i);
+
+                    const short* k0 = kernel0_tm.row<short>(q);
+                    const short* k1 = kernel0_tm.row<short>(q+1);
+                    const short* k2 = kernel0_tm.row<short>(q+2);
+                    const short* k3 = kernel0_tm.row<short>(q+3);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += (int)r0[n] * k0[n];
+                        sum0[n] += (int)r1[n] * k1[n];
+                        sum0[n] += (int)r2[n] * k2[n];
+                        sum0[n] += (int)r3[n] * k3[n];
+                    }
+                }
+
+                for (; q<inch; q++)
+                {
+                    const short* r0 = bottom_blob_tm.channel(q).row<short>(i);
+                    const short* k0 = kernel0_tm.row<short>(q);
+
+                    for (int n=0; n<16; n++)
+                    {
+                        sum0[n] += (int)r0[n] * k0[n];
+                    }             
+                }
+
+                for (int n=0; n<16; n++)
+                {
+                    output0_tm[n] = sum0[n];
+                }
+            }
+        }
+    }
+    bottom_blob_tm = Mat();
+    // END dot
+
+    // BEGIN transform output
+    Mat top_blob_bordered;
+    top_blob_bordered.create(outw, outh, outch, 4u, opt.workspace_allocator);
+    {
+        // AT
+        // const float itm[2][4] = {
+        //     {1.0f,  1.0f,  1.0f,  0.0f},
+        //     {0.0f,  1.0f, -1.0f,  1.0f}
+        // };
+
+        int w_tm = outw / 2 * 4;
+        int h_tm = outh / 2 * 4;
+
+        int nColBlocks = h_tm/4; // number of tile rows (outh / 2)
+        int nRowBlocks = w_tm/4;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<outch; p++)
+        {
+            Mat out_tm = top_blob_tm.channel(p);
+            Mat out = top_blob_bordered.channel(p);
+
+            for (int j=0; j<nColBlocks; j++)
+            {
+                int* outRow0 = out.row<int>(j*2);
+                int* outRow1 = out.row<int>(j*2+1);
+
+                for(int i=0; i<nRowBlocks; i++)
+                {
+                    int* out_tile = out_tm.row<int>(j*nRowBlocks + i);
+
+                    int s0[4],s1[4],s2[4],s3[4];
+                    int w0[4],w1[4];
+                    int d0[2],d1[2],d2[2],d3[2];
+                    int o0[2],o1[2];
+                    // load the 16 accumulated values of this tile as four rows
+                    for (int n = 0; n < 4; n++)
+                    {
+                        s0[n] = out_tile[n];
+                        s1[n] = out_tile[n+ 4];
+                        s2[n] = out_tile[n+ 8];
+                        s3[n] = out_tile[n+12];
+                    }
+                    // w = A_T * W
+                    for (int n = 0; n < 4; n++)
+                    {
+                        w0[n] = s0[n] + s1[n] + s2[n];
+                        w1[n] = s1[n] - s2[n] + s3[n];
+                    }
+                    // transpose w to w_t
+                    {
+                        d0[0] = w0[0]; d0[1] = w1[0];
+                        d1[0] = w0[1]; d1[1] = w1[1];
+                        d2[0] = w0[2]; d2[1] = w1[2];
+                        d3[0] = w0[3]; d3[1] = w1[3];
+                    }
+                    // Y = A_T * w_t
+                    for (int n = 0; n < 2; n++)
+                    {
+                        o0[n] = d0[n] + d1[n] + d2[n];
+                        o1[n] = d1[n] - d2[n] + d3[n];
+                    }
+                    // store the 2x2 result; >> 2 divides by 4 to undo the doubled G (G' = G*2) applied twice in the kernel transform
+                    outRow0[0] = o0[0] >> 2;
+                    outRow0[1] = o0[1] >> 2;
+                    outRow1[0] = o1[0] >> 2;
+                    outRow1[1] = o1[1] >> 2;
+
+                    outRow0 += 2;
+                    outRow1 += 2;           
+                }
+            }
+        }        
+    }
+    // END transform output
+
+    // crop the rounded-up bottom/right rows and columns so the result matches top_blob's size
+    copy_cut_border(top_blob_bordered, top_blob, 0, top_blob_bordered.h - top_blob.h, 0, top_blob_bordered.w - top_blob.w, opt.blob_allocator, opt.num_threads);  
+}
+
 static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
 {
     int w = bottom_blob.w;
@@ -122,23 +534,19 @@ static void conv3x3s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat
 
                 for (; remain > 0; remain--)
                 {
-                    short sum0 = 0;
-                    short sum1 = 0;
-                    short sum2 = 0;
-
-                    sum0 += (short)r0[0] * kernel0[0];
-                    sum0 += (short)r0[1] * kernel0[1];
-                    sum0 += (short)r0[2] * kernel0[2];
-                    sum1 += (short)r1[0] * kernel0[3];
-                    sum1 += (short)r1[1] * kernel0[4];
-                    sum1 += (short)r1[2] * kernel0[5];
-                    sum2 += (short)r2[0] * kernel0[6];
-                    sum2 += (short)r2[1] * kernel0[7];
-                    sum2 += (short)r2[2] * kernel0[8];
-
-                    *outptr0 = saturate2int16(*outptr0 + sum0);
-                    *outptr0 = saturate2int16(*outptr0 + sum1);
-                    *outptr0 = saturate2int16(*outptr0 + sum2);
+                    int sum0 = 0;
+
+                    sum0 += (int)r0[0] * kernel0[0];
+                    sum0 += (int)r0[1] * kernel0[1];
+                    sum0 += (int)r0[2] * kernel0[2];
+                    sum0 += (int)r1[0] * kernel0[3];
+                    sum0 += (int)r1[1] * kernel0[4];
+                    sum0 += (int)r1[2] * kernel0[5];
+                    sum0 += (int)r2[0] * kernel0[6];
+                    sum0 += (int)r2[1] * kernel0[7];
+                    sum0 += (int)r2[2] * kernel0[8];
+
+                    *outptr0 += sum0;
 
                     r0 += 2;
                     r1 += 2;
diff --git a/src/layer/x86/convolution_5x5_int8.h b/src/layer/x86/convolution_5x5_int8.h
new file mode 100644
index 000000000..662034faf
--- /dev/null
+++ b/src/layer/x86/convolution_5x5_int8.h
@@ -0,0 +1,35 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2019 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// 5x5 int8 convolution with stride 1, expressed as a call to the generic im2col + gemm routine.
+static void conv5x5s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    // square window, same step in both directions
+    const int ksize = 5;
+    const int step = 1;
+
+    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel,
+                               ksize, ksize, step, step, opt);
+}
+
+// 5x5 int8 convolution with stride 2, expressed as a call to the generic im2col + gemm routine.
+static void conv5x5s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    // square window, same step in both directions
+    const int ksize = 5;
+    const int step = 2;
+
+    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel,
+                               ksize, ksize, step, step, opt);
+}
diff --git a/src/layer/x86/convolution_7x7_int8.h b/src/layer/x86/convolution_7x7_int8.h
new file mode 100644
index 000000000..1704c41f3
--- /dev/null
+++ b/src/layer/x86/convolution_7x7_int8.h
@@ -0,0 +1,35 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+// 7x7 int8 convolution with stride 1, expressed as a call to the generic im2col + gemm routine.
+static void conv7x7s1_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    // square window, same step in both directions
+    const int ksize = 7;
+    const int step = 1;
+
+    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel,
+                               ksize, ksize, step, step, opt);
+}
+
+// 7x7 int8 convolution with stride 2, expressed as a call to the generic im2col + gemm routine.
+static void conv7x7s2_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    // square window, same step in both directions
+    const int ksize = 7;
+    const int step = 2;
+
+    conv_im2col_sgemm_int8_sse(bottom_blob, top_blob, _kernel,
+                               ksize, ksize, step, step, opt);
+}
\ No newline at end of file
diff --git a/src/layer/x86/convolution_sgemm_int8.h b/src/layer/x86/convolution_sgemm_int8.h
new file mode 100644
index 000000000..753a08f54
--- /dev/null
+++ b/src/layer/x86/convolution_sgemm_int8.h
@@ -0,0 +1,381 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv_im2col_sgemm_int8_sse(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, \
+            const int kernel_w, const int kernel_h, const int stride_w, const int stride_h, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+    // Stages: im2col -> pack input columns in groups of 8 -> pack kernel rows in groups of 4 -> int8 gemm accumulated in int.
+    // im2col: one row of outw*outh samples per (input channel, kernel row, kernel column)
+    Mat bottom_im2col(outw*outh, kernel_h*kernel_w*inch, 1UL, opt.workspace_allocator);
+    {
+        const int stride = kernel_h*kernel_w*outw*outh;
+        signed char* ret = (signed char*)bottom_im2col;
+    
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=0; p<inch; p++)
+        {
+            const signed char* input = bottom_blob.channel(p);
+            int retID = stride * p;
+            for (int u=0; u<kernel_h; u++)
+            {
+                for (int v=0; v<kernel_w; v++)
+                {
+                    for (int i=0; i<outh; i++)
+                    {
+                        for (int j=0; j<outw; j++)
+                        {
+                            int row = u + i * stride_h;
+                            int col = v + j * stride_w;
+                            int index = row * w + col;
+                            ret[retID] = input[index];
+                            retID++;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    int kernel_size = kernel_w * kernel_h;
+    int out_size = outw * outh;
+
+    // pack bottom_im2col: every full group of 8 output positions shares one channel (8 bytes per im2col row); each leftover position gets its own channel
+    Mat bottom_tm(8*kernel_size, inch, out_size/8 + out_size%8, (size_t)1u, opt.workspace_allocator);
+    {
+        int nn_size = out_size >> 3;
+        int remain_size_start = nn_size << 3;
+
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int ii=0; ii<nn_size; ii++)
+        {
+            int i = ii * 8;
+
+            const signed char* img0 = bottom_im2col.channel(0);
+            img0 += i;
+
+            signed char* tmpptr = bottom_tm.channel(i/8);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                tmpptr[0] = img0[0];
+                tmpptr[1] = img0[1];
+                tmpptr[2] = img0[2];
+                tmpptr[3] = img0[3];
+                tmpptr[4] = img0[4];
+                tmpptr[5] = img0[5];
+                tmpptr[6] = img0[6];
+                tmpptr[7] = img0[7];
+
+                tmpptr += 8;
+                img0 += out_size;
+            }
+        }
+        // leftover output positions: one byte per im2col row, stored in the channels after the full groups
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i=remain_size_start; i<out_size; i++)
+        {
+            const signed char* img0 = bottom_im2col.channel(0);
+            img0 += i;
+
+            signed char* tmpptr = bottom_tm.channel(i/8 + i%8);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                tmpptr[0] = img0[0];
+
+                tmpptr += 1;
+                img0 += out_size;
+            }
+        }       
+    }
+
+    // pack kernel: every full group of 4 output channels is interleaved into one channel (4 bytes per weight index); each leftover output channel gets its own channel
+    Mat kernel_tm(4*kernel_size, inch, outch/4 + outch%4, (size_t)1u, opt.workspace_allocator);
+    {
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+        nn_outch = outch >> 2;
+        remain_outch_start = nn_outch << 2;
+        
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int p = pp * 4;
+
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+            const signed char* k1 = kernel + (p+1)*inch*kernel_size;
+            const signed char* k2 = kernel + (p+2)*inch*kernel_size;
+            const signed char* k3 = kernel + (p+3)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp[1] = k1[0];
+                ktmp[2] = k2[0];
+                ktmp[3] = k3[0];
+                ktmp += 4;
+
+                k0 += 1;
+                k1 += 1;
+                k2 += 1;
+                k3 += 1;
+            }
+        }
+        // leftover output channels are copied unchanged into their own channel
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int p=remain_outch_start; p<outch; p++)
+        {
+            const signed char* k0 = kernel + (p+0)*inch*kernel_size;
+
+            signed char* ktmp = kernel_tm.channel(p/4 + p%4);
+
+            for (int q=0; q<inch*kernel_size; q++)
+            {
+                ktmp[0] = k0[0];
+                ktmp++;
+                k0++;
+            }
+        }
+    }
+
+    // int8 gemm, int accumulation: top_blob (outch x N) = kernel_tm (outch x L) * bottom_tm (L x N)
+    {
+        // int M = outch;  // outch
+        int N = outw * outh; // outsize or out stride
+        int L = kernel_w * kernel_h * inch; // ksize * inch
+
+        int nn_outch = 0;
+        int remain_outch_start = 0;
+
+        nn_outch = outch >> 2;
+        remain_outch_start = nn_outch << 2;
+        
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int pp=0; pp<nn_outch; pp++)
+        {
+            int i = pp * 4;
+
+            int* output0 = top_blob.channel(i);
+            int* output1 = top_blob.channel(i+1);
+            int* output2 = top_blob.channel(i+2);
+            int* output3 = top_blob.channel(i+3);
+
+            int j=0;
+            for (; j+7<N; j=j+8)
+            {
+                signed char* vb = bottom_tm.channel(j/8);
+                signed char* va = kernel_tm.channel(i/4);
+                
+                int sum0[8] = {0};
+                int sum1[8] = {0};
+                int sum2[8] = {0};
+                int sum3[8] = {0};
+                // shared dimension unrolled by 8; va advances inside the n loop and is rewound by 28 at its end
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+8];
+                        sum1[n] += (int)va[1] * vb[n+8];
+                        sum2[n] += (int)va[2] * vb[n+8];
+                        sum3[n] += (int)va[3] * vb[n+8];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+16];
+                        sum1[n] += (int)va[1] * vb[n+16];
+                        sum2[n] += (int)va[2] * vb[n+16];
+                        sum3[n] += (int)va[3] * vb[n+16];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+24];
+                        sum1[n] += (int)va[1] * vb[n+24];
+                        sum2[n] += (int)va[2] * vb[n+24];
+                        sum3[n] += (int)va[3] * vb[n+24];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+32];
+                        sum1[n] += (int)va[1] * vb[n+32];
+                        sum2[n] += (int)va[2] * vb[n+32];
+                        sum3[n] += (int)va[3] * vb[n+32];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+40];
+                        sum1[n] += (int)va[1] * vb[n+40];
+                        sum2[n] += (int)va[2] * vb[n+40];
+                        sum3[n] += (int)va[3] * vb[n+40];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+48];
+                        sum1[n] += (int)va[1] * vb[n+48];
+                        sum2[n] += (int)va[2] * vb[n+48];
+                        sum3[n] += (int)va[3] * vb[n+48];
+                        va += 4;
+
+                        sum0[n] += (int)va[0] * vb[n+56];
+                        sum1[n] += (int)va[1] * vb[n+56];
+                        sum2[n] += (int)va[2] * vb[n+56];
+                        sum3[n] += (int)va[3] * vb[n+56];
+                        va -= 28;
+                    }
+                    // step past the 8 k values just consumed (8 * 4 kernel bytes, 8 * 8 input bytes)
+                    va += 32;
+                    vb += 64;
+                }
+                // remaining k values (L % 8)
+                for (; k<L; k++)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum0[n] += (int)va[0] * vb[n];
+                        sum1[n] += (int)va[1] * vb[n];
+                        sum2[n] += (int)va[2] * vb[n];
+                        sum3[n] += (int)va[3] * vb[n];
+                    }
+                    
+                    va += 4;
+                    vb += 8;
+                }
+
+                for (int n=0; n<8; n++)
+                {
+                    output0[n] = sum0[n];
+                    output1[n] = sum1[n];
+                    output2[n] = sum2[n];
+                    output3[n] = sum3[n];
+                }
+                output0 += 8;
+                output1 += 8;
+                output2 += 8;
+                output3 += 8;
+            }
+            // remaining output positions (N % 8), one at a time
+            for (; j<N; j++)
+            {                
+                int sum0 = 0;
+                int sum1 = 0;
+                int sum2 = 0;
+                int sum3 = 0;
+
+                signed char* vb = bottom_tm.channel(j/8 + j%8);
+                signed char* va = kernel_tm.channel(i/4);
+
+                for (int k=0; k<L; k++)
+                {
+                    sum0 += (int)va[0] * vb[0];
+                    sum1 += (int)va[1] * vb[0];
+                    sum2 += (int)va[2] * vb[0];
+                    sum3 += (int)va[3] * vb[0];
+
+                    va += 4;
+                    vb += 1;
+                }
+                
+                output0[0] = sum0;
+                output1[0] = sum1;
+                output2[0] = sum2;
+                output3[0] = sum3;
+
+                output0++;
+                output1++;
+                output2++;
+                output3++;
+            }
+        }
+        // remaining output channels (outch % 4), one at a time
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int i=remain_outch_start; i<outch; i++)
+        {
+            int* output = top_blob.channel(i);
+
+            int j=0;
+            for (; j+7<N; j=j+8)
+            {
+                signed char* vb = bottom_tm.channel(j/8);
+                signed char* va = kernel_tm.channel(i/4 + i%4);
+                int sum[8] = {0};
+
+                int k=0;
+                for (; k+7<L; k=k+8)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum[n] += (int)va[0] * vb[n];
+                        sum[n] += (int)va[1] * vb[n+8];
+                        sum[n] += (int)va[2] * vb[n+16];
+                        sum[n] += (int)va[3] * vb[n+24];
+                        sum[n] += (int)va[4] * vb[n+32];
+                        sum[n] += (int)va[5] * vb[n+40];
+                        sum[n] += (int)va[6] * vb[n+48];
+                        sum[n] += (int)va[7] * vb[n+56];
+                    }
+                    va += 8;
+                    vb += 64;
+                }
+
+                for (; k<L; k++)
+                {
+                    for (int n=0; n<8; n++)
+                    {
+                        sum[n] += (int)va[0] * vb[n];
+                    }
+                    va += 1;
+                    vb += 8;
+                }
+
+                for (int n=0; n<8; n++)
+                {
+                    output[n] = sum[n];
+                }
+                output += 8;
+            }
+
+            for (; j<N; j++)
+            {
+                int sum = 0;
+
+                signed char* vb = bottom_tm.channel(j/8 + j%8);
+                signed char* va = kernel_tm.channel(i/4 + i%4);
+
+                for (int k=0; k<L; k++)
+                {
+                    sum += (int)va[0] * vb[0];
+
+                    va += 1;
+                    vb += 1;
+                }
+                output[0] = sum;
+
+                output++;
+            }
+        }
+    }
+}
diff --git a/src/layer/x86/convolution_x86.cpp b/src/layer/x86/convolution_x86.cpp
index 9190423a2..0e36b1b22 100644
--- a/src/layer/x86/convolution_x86.cpp
+++ b/src/layer/x86/convolution_x86.cpp
@@ -14,17 +14,61 @@
 
 #include "convolution_x86.h"
 
+#include "layer_type.h"
+#include "benchmark.h"
+
 namespace ncnn {
 
 #include "convolution_1x1.h"
 #include "convolution_3x3.h"
 #include "convolution_5x5.h"
 
+#include "convolution_sgemm_int8.h"
 #include "convolution_1x1_int8.h"
 #include "convolution_3x3_int8.h"
+#include "convolution_5x5_int8.h"
+#include "convolution_7x7_int8.h"
 
 DEFINE_LAYER_CREATOR(Convolution_x86)
 
+// Parse the layer parameters, then decide whether the 3x3 winograd path will be used.
+int Convolution_x86::load_param(const ParamDict& pd)
+{
+    const int status = Convolution::load_param(pd);
+    if (status != 0)
+        return status;
+
+    // winograd is only considered for 3x3 kernels with unit stride and unit dilation
+    const bool is_plain_3x3 = kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1;
+    use_winograd3x3 = false;
+    if (pd.use_winograd_convolution && is_plain_3x3)
+    {
+        // winograd is slow on small channel count
+        const int num_input = weight_data_size / 9 / num_output;
+        use_winograd3x3 = (num_input >= 16 && num_output >= 16);
+    }
+    return 0;
+}
+
+// Load the weights through the base class, then run the winograd kernel transform if it was selected.
+int Convolution_x86::load_model(const ModelBin& mb)
+{
+    const int status = Convolution::load_model(mb);
+    if (status != 0)
+        return status;
+
+    // nothing more to prepare unless load_param selected the winograd path
+    if (!use_winograd3x3)
+        return 0;
+
+    const int num_input = weight_data_size / 9 / num_output;
+    if (use_int8_inference) // int8 and float weights go through separate transform routines
+        conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
+    else
+        conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
+    return 0;
+}
+
 int Convolution_x86::forwardDilation(const Mat& bottom_blob, Mat& top_blob, conv_func conv, const Option& opt) const
 {
     int w = bottom_blob.w;
@@ -147,7 +191,7 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     const int kernel_size = kernel_w;
     const int stride = stride_w;
 
-    if (kernel_size > 5 || stride > 5 || dilation_w != dilation_h)
+    if (kernel_size > 7 || stride > 4 || dilation_w != dilation_h) // limits must match the [7][4] kernel_size x stride tables below
     {
         return Convolution::forward(bottom_blob, top_blob, opt);
     }
@@ -155,26 +199,23 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
-    conv_func conv_func_table[5][5] =
+    conv_func conv_func_table[7][4] =
     {
         {
             conv1x1s1_sse,
             conv1x1s2_sse,
             0,
-            0,
             0
         }, // kernel_size = 1
         {
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 2
         {
             conv3x3s1_sse,
-            0,
-            0,
+            conv3x3s2_sse,
             0,
             0
         }, // kernel_size = 3
@@ -182,35 +223,43 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 4
         {
             conv5x5s1_sse,
             0,
+            0,
+            0
+        }, // kernel_size = 5
+        {
             0,
             0,
+            0,
+            0
+        }, // kernel_size = 6
+        {
+            0,          
+            0,          
+            0,
             0
-        }  // kernel_size = 5
+        }  // kernel_size = 7        
     };
 
     typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
-    conv_int8_func conv_int8_func_table[5][5] =
+    conv_int8_func conv_int8_func_table[7][4] =
     {
         {
             conv1x1s1_int8_sse,
             conv1x1s2_int8_sse,
             0,
-            0,
             0
         }, // kernel_size = 1
         {
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 2
         {
@@ -218,22 +267,31 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
             conv3x3s2_int8_sse,
             0,
             0,
-            0
         }, // kernel_size = 3
         {
             0,
             0,
             0,
-            0,
             0
         }, // kernel_size = 4
+        {        
+            conv5x5s1_int8_sse,
+            conv5x5s2_int8_sse,    
+            0,
+            0
+        }, // kernel_size = 5
         {
             0,
             0,
             0,
+            0
+        }, // kernel_size = 6
+        {
+            conv7x7s1_int8_sse,          
+            conv7x7s2_int8_sse, 
             0,
             0
-        }  // kernel_size = 5
+        }  // kernel_size = 7
     };
 
     conv_func conv = 0;
@@ -322,21 +380,69 @@ int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 
     if (use_int8_inference)
     {
-        conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
-
-        // dequantize, reverse scale inplace
+        if (use_int8_requantize == true) // requantize path: top_blob is created with 1-byte elements
         {
-            ncnn::Option opt_g = opt;
-            opt_g.blob_allocator = top_blob.allocator;
+            Mat top_blob_tm; // 4-byte-per-element scratch that receives the raw convolution result
+            top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
+            if (top_blob_tm.empty())
+                return -100;
+            
+            top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100; 
+
+            if (use_winograd3x3)
+                conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_tm, weight_3x3_winograd23_data, opt);
+            else
+                conv_int8(bottom_blob_bordered, top_blob_tm, weight_data, opt);
 
-            dequantize->forward_inplace(top_blob, opt_g);
+            // requantize each output channel from top_blob_tm into top_blob (not in place)
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
+            {
+                ncnn::Option opt_g = opt;
+                opt_g.num_threads = 1; // the enclosing loop is already parallel over channels
+                opt_g.blob_allocator = top_blob.allocator;
+
+                Mat top_blob_tm_g = top_blob_tm.channel_range(p, 1);
+                Mat top_blob_g = top_blob.channel_range(p, 1);
+                requantize_ops[p]->forward(top_blob_tm_g, top_blob_g, opt_g);
+            }                                       
         }
+        else
+        {
+            top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
+            if (top_blob.empty())
+                return -100; 
 
+            if (use_winograd3x3)
+                conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, opt);
+            else
+                conv_int8(bottom_blob_bordered, top_blob, weight_data, opt);
+
+            // dequantize each output channel of top_blob in place with its own op
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int p=0; p<num_output; p++)
+            {
+                ncnn::Option opt_g = opt;
+                opt_g.num_threads = 1; // the enclosing loop is already parallel over channels
+                opt_g.blob_allocator = top_blob.allocator;
+
+                Mat top_blob_g = top_blob.channel_range(p, 1);
+                dequantize_ops[p]->forward_inplace(top_blob_g, opt_g);
+            }                    
+        } 
+      
         return 0;
     }
 
-    conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
-
+    if (use_winograd3x3)
+    {
+        conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt);
+    }    
+    else
+        conv(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
+        
     return 0;
 }
 
diff --git a/src/layer/x86/convolution_x86.h b/src/layer/x86/convolution_x86.h
index e72c14aca..0062548af 100644
--- a/src/layer/x86/convolution_x86.h
+++ b/src/layer/x86/convolution_x86.h
@@ -24,8 +24,16 @@ typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option
 class Convolution_x86 : public Convolution
 {
 public:
+    virtual int load_param(const ParamDict& pd); // also decides whether use_winograd3x3 is set
+
+    virtual int load_model(const ModelBin& mb); // also runs the winograd kernel transform when use_winograd3x3 is set
+
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     virtual int forwardDilation(const Mat& bottom_blob, Mat &top_blob, conv_func conv, const Option& opt) const;
+
+public:
+    bool use_winograd3x3; // assigned in load_param; selects the conv3x3s1_winograd23_* kernels in forward
+    Mat weight_3x3_winograd23_data; // handed to the winograd kernel transform in load_model and to the winograd kernels in forward
 };
 
 } // namespace ncnn
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index a440c1fac..d1911e8c7 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -134,7 +134,7 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     }
 
     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
-    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;     
 
     Mat bottom_blob_unbordered = bottom_blob;
     if (use_int8_inference && elemsize != 1)
@@ -159,8 +159,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
             quantize_ops[g]->forward(bottom_blob_g, bottom_blob_int8_g, opt_g);
         }
 
-        bottom_blob_unbordered = bottom_blob_int8;
-    }
+        bottom_blob_unbordered = bottom_blob_int8;       
+    }     
 
     Mat bottom_blob_bordered = bottom_blob_unbordered;
     if (pad_w > 0 || pad_h > 0)
@@ -203,25 +203,65 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
             {
                 if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
                 {
-                    if (stride_w == 1 && stride_h == 1)
+                    if (use_int8_requantize) // requantize path: top_blob is created with 1-byte elements
                     {
-                        convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
+                        Mat top_blob_tm; // 4-byte-per-element scratch that receives the raw convolution result
+                        top_blob_tm.create(outw, outh, num_output, (size_t)4u, opt.workspace_allocator);
+                        if (top_blob_tm.empty())
+                            return -100;
+                        
+                        top_blob.create(outw, outh, num_output, (size_t)1u, opt.blob_allocator);
+                        if (top_blob.empty())
+                            return -100;
+
+                        if (stride_w == 1 && stride_h == 1)
+                        {
+                            convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
+                        }
+                        else if (stride_w == 2 && stride_h == 2)
+                        {
+                            convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob_tm, weight_data, opt);
+                        }
+
+                        // requantize each group from top_blob_tm into top_blob (not in place)
+                        #pragma omp parallel for num_threads(opt.num_threads)
+                        for (int g=0; g<group; g++)
+                        {
+                            ncnn::Option opt_g = opt;
+                            opt_g.num_threads = 1; // the enclosing loop is already parallel over groups
+                            opt_g.blob_allocator = top_blob.allocator;
+
+                            Mat top_blob_tm_g = top_blob_tm.channel_range(g, 1);
+                            Mat top_blob_g = top_blob.channel_range(g, 1);
+                            requantize_ops[g]->forward(top_blob_tm_g, top_blob_g, opt_g);
+                        }                      
                     }
-                    else if (stride_w == 2 && stride_h == 2)
+                    else
                     {
-                        convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
-                    }
-
-                    // dequantize, reverse scale inplace
-                    #pragma omp parallel for num_threads(opt.num_threads)
-                    for (int g=0; g<group; g++)
-                    {
-                        ncnn::Option opt_g = opt;
-                        opt_g.num_threads = 1;
-                        opt_g.blob_allocator = top_blob.allocator;
-
-                        Mat top_blob_g = top_blob.channel(g);
-                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
+                        top_blob.create(outw, outh, num_output, (size_t)4u, opt.blob_allocator);
+                        if (top_blob.empty())
+                            return -100;                       
+
+                        if (stride_w == 1 && stride_h == 1)
+                        {
+                            convdw3x3s1_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
+                        }
+                        else if (stride_w == 2 && stride_h == 2)
+                        {
+                            convdw3x3s2_int8_sse(bottom_blob_bordered, top_blob, weight_data, opt);
+                        }
+
+                        // dequantize each group of top_blob in place with its own op
+                        #pragma omp parallel for num_threads(opt.num_threads)
+                        for (int g=0; g<group; g++)
+                        {
+                            ncnn::Option opt_g = opt;
+                            opt_g.num_threads = 1; // the enclosing loop is already parallel over groups
+                            opt_g.blob_allocator = top_blob.allocator;
+
+                            Mat top_blob_g = top_blob.channel(g);
+                            dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
+                        }
                     }
 
                     return 0;
diff --git a/src/net.cpp b/src/net.cpp
index fc3565734..0c129d12e 100644
--- a/src/net.cpp
+++ b/src/net.cpp
@@ -16,6 +16,9 @@
 #include "layer_type.h"
 #include "modelbin.h"
 #include "paramdict.h"
+#include "convolution.h"
+#include "convolutiondepthwise.h"
+#include "relu.h"
 
 #include <stdarg.h>
 #include <stdio.h>
@@ -679,6 +682,8 @@ int Net::load_model(FILE* fp)
     }
 #endif // NCNN_VULKAN
 
+    fuse_network();
+
     return ret;
 }
 
@@ -898,6 +903,119 @@ int Net::load_model(const unsigned char* _mem)
     return mem - _mem;
 }
 
+#if NCNN_STRING && NCNN_REQUANT
+// True when layer is a Convolution or ConvolutionDepthWise whose use_int8_inference flag is set.
+static bool is_int8_conv_layer(Layer* layer)
+{
+    if (layer->type == "Convolution")
+        return ((Convolution*)layer)->use_int8_inference;
+
+    if (layer->type == "ConvolutionDepthWise")
+        return ((ConvolutionDepthWise*)layer)->use_int8_inference;
+
+    return false;
+}
+
+// Input quantization scale of a conv layer; only call when is_int8_conv_layer(layer) holds.
+static float int8_conv_input_scale(Layer* layer)
+{
+    if (layer->type == "Convolution")
+        return ((Convolution*)layer)->bottom_blob_int8_scale;
+
+    return ((ConvolutionDepthWise*)layer)->bottom_blob_int8_scales[0];
+}
+
+// Turn on requantized output for a conv layer, casting to its real type before touching members.
+static void enable_int8_requantize(Layer* layer, float next_input_scale)
+{
+    if (layer->type == "Convolution")
+    {
+        Convolution* conv = (Convolution*)layer;
+        conv->use_int8_requantize = true;
+        conv->top_blob_int8_scale = next_input_scale;
+        conv->create_requantize_op();
+    }
+    else
+    {
+        ConvolutionDepthWise* convdw = (ConvolutionDepthWise*)layer;
+        convdw->use_int8_requantize = true;
+        convdw->top_blob_int8_scale = next_input_scale;
+        convdw->create_requantize_op();
+    }
+}
+#endif // NCNN_STRING && NCNN_REQUANT
+
+// Scan the loaded graph for int8 Convolution/ConvolutionDepthWise -> ReLU -> int8 conv chains
+// (optionally through a Split) and enable requantized output on the producing layer.
+// The body is compiled out unless both NCNN_STRING and NCNN_REQUANT are enabled.
+void Net::fuse_network()
+{
+#if NCNN_STRING && NCNN_REQUANT
+    for (size_t i=0; i<layers.size(); i++)
+    {
+        Layer* layer = layers[i];
+
+        // only int8 conv layers can be switched to requantized output
+        if (!is_int8_conv_layer(layer) || layer->tops.empty())
+            continue;
+
+        const std::vector<int>& consumers = blobs[layer->tops[0]].consumers;
+        for (size_t n=0; n<consumers.size(); n++)
+        {
+            Layer* layer_next = layers[consumers[n]];
+
+            // ToDo: followers other than ReLU (e.g. Pooling) are not fused yet
+            if (layer_next->type != "ReLU" || layer_next->tops.empty())
+                continue;
+
+            // the ReLU output may have no consumer at all, so never index [0] blindly
+            const std::vector<int>& relu_consumers = blobs[layer_next->tops[0]].consumers;
+            if (relu_consumers.empty())
+                continue;
+
+            Layer* layer_next_2 = layers[relu_consumers[0]];
+
+            if (is_int8_conv_layer(layer_next_2))
+            {
+                enable_int8_requantize(layer, int8_conv_input_scale(layer_next_2));
+            }
+            else if (layer_next_2->type == "Split" && layer_next_2->tops.size() >= size_t(2))
+            {
+                // every branch must be an int8 conv layer or a PriorBox
+                bool all_conv = true;
+                bool has_scale = false;
+                float scale = 0.f;
+                for (size_t k=0; k<layer_next_2->tops.size(); k++)
+                {
+                    const std::vector<int>& branch_consumers = blobs[layer_next_2->tops[k]].consumers;
+                    if (branch_consumers.empty())
+                    {
+                        all_conv = false;
+                        break;
+                    }
+
+                    Layer* layer_next_3 = layers[branch_consumers[0]];
+                    if (is_int8_conv_layer(layer_next_3))
+                    {
+                        // when several conv branches exist the last one provides the scale
+                        scale = int8_conv_input_scale(layer_next_3);
+                        has_scale = true;
+                    }
+                    else if (layer_next_3->type != "PriorBox")
+                    {
+                        all_conv = false;
+                        break;
+                    }
+                }
+
+                if (all_conv && has_scale)
+                    enable_int8_requantize(layer, scale);
+            }
+        }
+    }
+#endif
+}
+
 void Net::clear()
 {
     blobs.clear();
diff --git a/src/net.h b/src/net.h
index 6c877e4e9..f957aa1e7 100644
--- a/src/net.h
+++ b/src/net.h
@@ -76,6 +76,10 @@ public:
     // return bytes consumed
     int load_model(const unsigned char* mem);
 
+    // parse the structure of network
+    // fuse int8 op dequantize and quantize by requantize
+    void fuse_network();
+
     // unload network structure and weight data
     void clear();
 
diff --git a/src/platform.h.in b/src/platform.h.in
index d94484068..505f141cc 100644
--- a/src/platform.h.in
+++ b/src/platform.h.in
@@ -22,5 +22,7 @@
 #cmakedefine01 NCNN_PIXEL
 #cmakedefine01 NCNN_PIXEL_ROTATE
 #cmakedefine01 NCNN_VULKAN
+#cmakedefine01 NCNN_REQUANT
+#cmakedefine01 NCNN_IM2COL_SGEMM
 
 #endif // NCNN_PLATFORM_H
diff --git a/tools/caffe/caffe2ncnn.cpp b/tools/caffe/caffe2ncnn.cpp
index 2451b5367..0d0b15c76 100644
--- a/tools/caffe/caffe2ncnn.cpp
+++ b/tools/caffe/caffe2ncnn.cpp
@@ -685,7 +685,7 @@ int main(int argc, char** argv)
 
                 if (int8_scale_term)
                 {
-                    if ((int)weight_int8scale.size() == num_group && (int)blob_int8scale.size() == num_group)
+                    if ((int)weight_int8scale.size() == num_group)
                     {
                         fprintf(pp, " 8=1");
                     }