diff --git a/src/layer/arm/convolution_1x1_int8.h b/src/layer/arm/convolution_1x1_int8.h
index c5985295e..0d135a51b 100644
--- a/src/layer/arm/convolution_1x1_int8.h
+++ b/src/layer/arm/convolution_1x1_int8.h
@@ -16,6 +16,91 @@
 #include <arm_neon.h>
 #endif // __ARM_NEON
 
+#if __aarch64__
+static void conv1x1s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const float *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        int q = 0;
+
+        for (; q+7<inch; q+=8)
+        {
+            int* outptr0 = out0;
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            const signed char *r0 = bottom_blob.channel(q);
+            const signed char *r1 = bottom_blob.channel(q + 1);
+            const signed char *r2 = bottom_blob.channel(q + 2);
+            const signed char *r3 = bottom_blob.channel(q + 3);
+            const signed char *r4 = bottom_blob.channel(q + 4);
+            const signed char *r5 = bottom_blob.channel(q + 5);
+            const signed char *r6 = bottom_blob.channel(q + 6);
+            const signed char *r7 = bottom_blob.channel(q + 7);
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain > 0; remain--)
+            {
+                //ToDo Neon
+                int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
+                           (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
+                           (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
+                           (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
+
+                *outptr0 += sum0;
+
+                r0++;
+                r1++;
+                r2++;
+                r3++;
+                r4++;
+                r5++;
+                r6++;
+                r7++;
+                outptr0++;
+            }
+        }
+
+        for (; q<inch; q++)
+        {
+            int* outptr0 = out0;
+
+            const signed char *r0 = bottom_blob.channel(q);
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+            const signed char k0 = kernel0[0];
+
+            int size = outw * outh;
+            int remain = size;
+
+            for (; remain > 0; remain--)
+            {
+                int sum0 = (int)(*r0) * (int)k0;
+
+                *outptr0 += sum0;
+
+                r0++;
+                outptr0++;
+            }
+        }
+    }
+}
+#else // __aarch64__
 /*
  * Convolution 1x1 quantized with int8,unroll 8 x 4
  */
@@ -1366,7 +1451,7 @@ static void conv1x1s1_neon_s8_left4(const Mat& bottom_blob, Mat& top_blob, const
     }
 }
 
-static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
+static void conv1x1s1_int8_neon(const Mat& bottom_blob, Mat& top_blob, const Mat& _kernel, const Option& opt)
 {
     int size = top_blob.h * top_blob.w;
     int remain = size & 7;
@@ -1391,3 +1476,105 @@ static void conv1x1s1_neon_s8_inter(const Mat& bottom_blob, Mat& top_blob, const
 
     return;
 }
+#endif // __aarch64__
+
+static void conv1x1s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2*outw + w;
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        int q = 0;
+
+        for (; q+7<inch; q+=8)
+        {
+            int* outptr0 = out0;
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            const signed char *r0 = bottom_blob.channel(q);
+            const signed char *r1 = bottom_blob.channel(q + 1);
+            const signed char *r2 = bottom_blob.channel(q + 2);
+            const signed char *r3 = bottom_blob.channel(q + 3);
+            const signed char *r4 = bottom_blob.channel(q + 4);
+            const signed char *r5 = bottom_blob.channel(q + 5);
+            const signed char *r6 = bottom_blob.channel(q + 6);
+            const signed char *r7 = bottom_blob.channel(q + 7);
+
+            for(int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    //ToDo Neon
+                    int sum0 = (int)*r0 * (int)kernel0[0] + (int)*r1 * (int)kernel0[1] +
+                            (int)*r2 * (int)kernel0[2] + (int)*r3 * (int)kernel0[3] +
+                            (int)*r4 * (int)kernel0[4] + (int)*r5 * (int)kernel0[5] +
+                            (int)*r6 * (int)kernel0[6] + (int)*r7 * (int)kernel0[7];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    r3 += 2;
+                    r4 += 2;
+                    r5 += 2;
+                    r6 += 2;
+                    r7 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+                r3 += tailstep;
+                r4 += tailstep;
+                r5 += tailstep;
+                r6 += tailstep;
+                r7 += tailstep;
+            }
+        }
+
+        for (; q<inch; q++)
+        {
+            int* outptr0 = out0;
+
+            const signed char *r0 = bottom_blob.channel(q);
+
+            const signed char *kernel0 = (const signed char *)kernel + p * inch + q;
+
+            for(int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    //ToDo Neon
+                    int sum0 = (int)*r0 * (int)kernel0[0];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+            }
+        }
+    }
+}
diff --git a/src/layer/arm/convolution_3x3_int8.h b/src/layer/arm/convolution_3x3_int8.h
new file mode 100644
index 000000000..974931fb9
--- /dev/null
+++ b/src/layer/arm/convolution_3x3_int8.h
@@ -0,0 +1,149 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void conv3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;
+
+        for (int q = 0; q < inch; q++)
+        {
+            int *outptr0 = out0;
+
+            const signed char *img0 = bottom_blob.channel(q);
+
+            const signed char *r0 = img0;
+            const signed char *r1 = img0 + w;
+            const signed char *r2 = img0 + w * 2;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    int sum0 = 0;
+
+                    sum0 += (int)r0[0] * kernel0[0];
+                    sum0 += (int)r0[1] * kernel0[1];
+                    sum0 += (int)r0[2] * kernel0[2];
+                    sum0 += (int)r1[0] * kernel0[3];
+                    sum0 += (int)r1[1] * kernel0[4];
+                    sum0 += (int)r1[2] * kernel0[5];
+                    sum0 += (int)r2[0] * kernel0[6];
+                    sum0 += (int)r2[1] * kernel0[7];
+                    sum0 += (int)r2[2] * kernel0[8];
+
+                    *outptr0 += sum0;
+
+                    r0++;
+                    r1++;
+                    r2++;
+                    outptr0++;
+                }
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+            }
+
+            kernel0 += 9;
+        }
+    }
+}
+
+static void conv3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2 * outw + w;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out0 = top_blob.channel(p);
+
+        out0.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * inch * 9;
+
+        for (int q = 0; q < inch; q++)
+        {
+            int *outptr0 = out0;
+
+            const signed char *img0 = bottom_blob.channel(q);
+
+            const signed char *r0 = img0;
+            const signed char *r1 = img0 + w;
+            const signed char *r2 = img0 + w * 2;
+
+            for (int i = 0; i < outh; i++)
+            {
+                int remain = outw;
+
+                for (; remain > 0; remain--)
+                {
+                    int sum0 = 0;
+
+                    sum0 += (int)r0[0] * (int)kernel0[0];
+                    sum0 += (int)r0[1] * (int)kernel0[1];
+                    sum0 += (int)r0[2] * (int)kernel0[2];
+                    sum0 += (int)r1[0] * (int)kernel0[3];
+                    sum0 += (int)r1[1] * (int)kernel0[4];
+                    sum0 += (int)r1[2] * (int)kernel0[5];
+                    sum0 += (int)r2[0] * (int)kernel0[6];
+                    sum0 += (int)r2[1] * (int)kernel0[7];
+                    sum0 += (int)r2[2] * (int)kernel0[8];
+
+                    *outptr0 += sum0;
+
+                    r0 += 2;
+                    r1 += 2;
+                    r2 += 2;
+                    outptr0++;
+                }
+
+                r0 += tailstep;
+                r1 += tailstep;
+                r2 += tailstep;
+            }
+
+            kernel0 += 9;
+        }
+    }
+}
diff --git a/src/layer/arm/convolution_arm.cpp b/src/layer/arm/convolution_arm.cpp
index f241092d1..4e6e339f2 100644
--- a/src/layer/arm/convolution_arm.cpp
+++ b/src/layer/arm/convolution_arm.cpp
@@ -23,9 +23,8 @@ namespace ncnn {
 #include "convolution_5x5.h"
 #include "convolution_7x7.h"
 
-#if !__aarch64__
 #include "convolution_1x1_int8.h"
-#endif // !__aarch64__
+#include "convolution_3x3_int8.h"
 
 DEFINE_LAYER_CREATOR(Convolution_arm)
 
@@ -221,19 +220,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         return Convolution::forward(bottom_blob, top_blob, opt);
     }
 
-#if __aarch64__
-    if (use_int8_inference)
-    {
-        // TODO
-        return Convolution::forward(bottom_blob, top_blob, opt);
-    }
-#else
-    if (use_int8_inference && (kernel_size != 1 || stride != 1))
-    {
-        return Convolution::forward(bottom_blob, top_blob, opt);
-    }
-#endif
-
     typedef void (*conv_func)(const Mat&, Mat&, const Mat&, const Mat&, const Option&);
 
     // kernel_size x stride
@@ -283,10 +269,58 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
         }  // kernel_size = 7
     };
 
+    typedef void (*conv_int8_func)(const Mat&, Mat&, const Mat&, const Option&);
+
+    // kernel_size x stride
+    conv_int8_func conv_int8_func_table[5][5] =
+    {
+        {
+            conv1x1s1_int8_neon,
+            conv1x1s2_int8_neon,
+            0,
+            0,
+            0
+        }, // kernel_size = 1
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 2
+        {
+            conv3x3s1_int8_neon,
+            conv3x3s2_int8_neon,
+            0,
+            0,
+            0
+        }, // kernel_size = 3
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }, // kernel_size = 4
+        {
+            0,
+            0,
+            0,
+            0,
+            0
+        }  // kernel_size = 5
+    };
+
     conv_func conv = 0;
+    conv_int8_func conv_int8 = 0;
 
     if (use_int8_inference)
     {
+        conv_int8 = conv_int8_func_table[kernel_size-1][stride-1];
+        if (!conv_int8)
+        {
+            return Convolution::forward(bottom_blob, top_blob, opt);
+        }
     }
     else
     {
@@ -339,11 +373,8 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
     if (top_blob.empty())
         return -100;
 
-#if !__aarch64__
     if (use_int8_inference)
     {
-        // kernel_size = 1
-        // stride = 1
         Mat bottom_blob_bordered_int8;
         bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
         if (bottom_blob_bordered_int8.empty())
@@ -359,7 +390,7 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
             quantize->forward(bottom_blob_bordered, bottom_blob_bordered_int8, opt);
         }
 
-        conv1x1s1_neon_s8_inter(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+        conv_int8(bottom_blob_bordered_int8, top_blob, weight_data, opt);
 
         // dequantize, reverse scale inplace
         {
@@ -382,7 +413,6 @@ int Convolution_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option
 
         return 0;
     }
-#endif
 
     if (use_winograd3x3 && w <= 120 && h <= 120)
     {
diff --git a/src/layer/arm/convolutiondepthwise_3x3_int8.h b/src/layer/arm/convolutiondepthwise_3x3_int8.h
new file mode 100644
index 000000000..73c11d3ed
--- /dev/null
+++ b/src/layer/arm/convolutiondepthwise_3x3_int8.h
@@ -0,0 +1,142 @@
+// SenseNets is pleased to support the open source community by supporting ncnn available.
+//
+// Copyright (C) 2018 SenseNets Technology Ltd. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+static void convdw3x3s1_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    //int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+
+        out.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * 9;
+
+        int *outptr = out;
+
+        const signed char *img0 = bottom_blob.channel(p);
+
+        const signed char *r0 = img0;
+        const signed char *r1 = img0 + w;
+        const signed char *r2 = img0 + w * 2;
+
+        int i = 0;
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain > 0; remain--)
+            {
+
+                int sum = 0;
+
+                sum += (int)r0[0] * (int)kernel0[0];
+                sum += (int)r0[1] * (int)kernel0[1];
+                sum += (int)r0[2] * (int)kernel0[2];
+                sum += (int)r1[0] * (int)kernel0[3];
+                sum += (int)r1[1] * (int)kernel0[4];
+                sum += (int)r1[2] * (int)kernel0[5];
+                sum += (int)r2[0] * (int)kernel0[6];
+                sum += (int)r2[1] * (int)kernel0[7];
+                sum += (int)r2[2] * (int)kernel0[8];
+
+                *outptr += sum;
+
+                r0++;
+                r1++;
+                r2++;
+                outptr++;
+            }
+
+            r0 += 2;
+            r1 += 2;
+            r2 += 2;
+        }
+    }
+}
+
+static void convdw3x3s2_int8_neon(const Mat &bottom_blob, Mat &top_blob, const Mat &_kernel, const Option& opt)
+{
+    int w = bottom_blob.w;
+    //int h = bottom_blob.h;
+    //int inch = bottom_blob.c;
+
+    int outw = top_blob.w;
+    int outh = top_blob.h;
+    int outch = top_blob.c;
+
+    const int tailstep = w - 2 * outw + w;
+
+    const signed char *kernel = _kernel;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = 0; p < outch; p++)
+    {
+        Mat out = top_blob.channel(p);
+        out.fill(0);
+
+        const signed char *kernel0 = (const signed char *)kernel + p * 9;
+
+        int *outptr = out;
+
+        const signed char *img0 = bottom_blob.channel(p);
+
+        const signed char *r0 = img0;
+        const signed char *r1 = img0 + w;
+        const signed char *r2 = img0 + w * 2;
+
+        int i = 0;
+
+        for (; i < outh; i++)
+        {
+            int remain = outw;
+
+            for (; remain > 0; remain--)
+            {
+                int sum = 0;
+
+                sum += (int)r0[0] * (int)kernel0[0];
+                sum += (int)r0[1] * (int)kernel0[1];
+                sum += (int)r0[2] * (int)kernel0[2];
+                sum += (int)r1[0] * (int)kernel0[3];
+                sum += (int)r1[1] * (int)kernel0[4];
+                sum += (int)r1[2] * (int)kernel0[5];
+                sum += (int)r2[0] * (int)kernel0[6];
+                sum += (int)r2[1] * (int)kernel0[7];
+                sum += (int)r2[2] * (int)kernel0[8];
+
+                *outptr += sum;
+
+                r0 += 2;
+                r1 += 2;
+                r2 += 2;
+                outptr++;
+            }
+
+            r0 += tailstep;
+            r1 += tailstep;
+            r2 += tailstep;
+        }
+    }
+}
diff --git a/src/layer/arm/convolutiondepthwise_arm.cpp b/src/layer/arm/convolutiondepthwise_arm.cpp
index 1d938912f..e48820b38 100644
--- a/src/layer/arm/convolutiondepthwise_arm.cpp
+++ b/src/layer/arm/convolutiondepthwise_arm.cpp
@@ -24,6 +24,8 @@ namespace ncnn {
 
 #include "convolutiondepthwise_3x3.h"
 
+#include "convolutiondepthwise_3x3_int8.h"
+
 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_arm)
 
 ConvolutionDepthWise_arm::ConvolutionDepthWise_arm()
@@ -56,7 +58,13 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
     if (channels == group && group == num_output)
     {
         // depth-wise specific
-        return 0;
+        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
+        {
+            if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
+            {
+                return 0;
+            }
+        }
     }
 
     const int channels_g = channels / group;
@@ -66,7 +74,7 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
 
     for (int g=0; g<group; g++)
     {
-        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const float*)weight_data + maxk * channels_g * num_output_g * g));
+        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
         Mat bias_data_g;
         if (bias_term)
             bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
@@ -87,6 +95,12 @@ int ConvolutionDepthWise_arm::load_model(const ModelBin& mb)
         pd.set(5, bias_term);
         pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
 
+        if (use_int8_inference)
+        {
+            pd.set(8, weight_data_int8_scales[g]);
+            pd.set(9, bottom_blob_int8_scales[g]);
+        }
+
         op->load_param(pd);
 
         // set weights
@@ -107,12 +121,6 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
     // convolv with NxN kernel
     // value = value + bias
 
-    if (use_int8_inference)
-    {
-        // TODO
-        return ConvolutionDepthWise::forward(bottom_blob, top_blob, opt);
-    }
-
     int w = bottom_blob.w;
     int h = bottom_blob.h;
     int channels = bottom_blob.c;
@@ -159,75 +167,91 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
     if (top_blob.empty())
         return -100;
 
-    const int maxk = kernel_w * kernel_h;
-
     // depth-wise
     if (channels == group && group == num_output)
     {
-        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
+        if (use_int8_inference)
         {
-            if (stride_w == 1 && stride_h == 1)
+            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
             {
-                convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
-                return 0;
+                if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
+                {
+                    Mat bottom_blob_bordered_int8;
+                    bottom_blob_bordered_int8.create(w, h, channels, (size_t)1u, opt.workspace_allocator);
+                    if (bottom_blob_bordered_int8.empty())
+                        return -100;
+
+                    // quantize, scale and round to nearest
+                    #pragma omp parallel for num_threads(opt.num_threads)
+                    for (int g=0; g<group; g++)
+                    {
+                        ncnn::Option opt_g = opt;
+                        opt_g.num_threads = 1;
+                        opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
+
+                        const Mat bottom_blob_bordered_g = bottom_blob_bordered.channel(g);
+                        Mat bottom_blob_bordered_int8_g = bottom_blob_bordered_int8.channel(g);
+                        quantize_ops[g]->forward(bottom_blob_bordered_g, bottom_blob_bordered_int8_g, opt_g);
+                    }
+
+                    if (stride_w == 1 && stride_h == 1)
+                    {
+                        convdw3x3s1_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+                    }
+                    else if (stride_w == 2 && stride_h == 2)
+                    {
+                        convdw3x3s2_int8_neon(bottom_blob_bordered_int8, top_blob, weight_data, opt);
+                    }
+
+                    // dequantize, reverse scale inplace
+                    #pragma omp parallel for num_threads(opt.num_threads)
+                    for (int g=0; g<group; g++)
+                    {
+                        ncnn::Option opt_g = opt;
+                        opt_g.num_threads = 1;
+                        opt_g.blob_allocator = top_blob.allocator;
+
+                        Mat top_blob_g = top_blob.channel(g);
+                        dequantize_ops[g]->forward_inplace(top_blob_g, opt_g);
+                    }
+
+                    return 0;
+                }
             }
-            else if (stride_w == 2 && stride_h == 2)
+        }
+        else
+        {
+            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
             {
-                convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
-                return 0;
+                if (stride_w == 1 && stride_h == 1)
+                {
+                    convdw3x3s1_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
+                    return 0;
+                }
+                else if (stride_w == 2 && stride_h == 2)
+                {
+                    convdw3x3s2_neon(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);
+                    return 0;
+                }
             }
         }
 
-#ifdef _OPENMP
-        int nested_current = omp_get_nested();
-        omp_set_nested(0);
-#endif
-
         #pragma omp parallel for num_threads(opt.num_threads)
         for (int g=0; g<group; g++)
         {
             Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
             Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
-            Mat weight_data_g(maxk, (void*)((const float*)weight_data + maxk * g));
-            Mat bias_data_g;
-            if (bias_term)
-                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));
-
-            // call Convolution
-            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
-
-            // set param
-            ncnn::ParamDict pd;
-            pd.set(0, 1);// num_output
-            pd.set(1, kernel_w);
-            pd.set(11, kernel_h);
-            pd.set(2, dilation_w);
-            pd.set(12, dilation_h);
-            pd.set(3, stride_w);
-            pd.set(13, stride_h);
-            pd.set(4, 0);// pad_w
-            pd.set(14, 0);// pad_h
-            pd.set(5, bias_term);
-            pd.set(6, maxk);// weight_data_size
-
-            op->load_param(pd);
-
-            // set weights
-            ncnn::Mat weights[2];
-            weights[0] = weight_data_g;
-            weights[1] = bias_data_g;
-
-            op->load_model(ModelBinFromMatArray(weights));
 
-            // forward
-            op->forward(bottom_blob_bordered_g, top_blob_g, opt);
+            const ncnn::Layer* op = group_ops[g];
 
-            delete op;
+            ncnn::Option opt_g = opt;
+            opt_g.num_threads = 1;
+            opt_g.blob_allocator = top_blob.allocator;
+
+            // forward
+            op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
         }
 
-#ifdef _OPENMP
-        omp_set_nested(nested_current);
-#endif
         return 0;
     }
 
@@ -241,8 +265,11 @@ int ConvolutionDepthWise_arm::forward(const Mat& bottom_blob, Mat& top_blob, con
 
         const ncnn::Layer* op = group_ops[g];
 
+        ncnn::Option opt_g = opt;
+        opt_g.blob_allocator = top_blob.allocator;
+
         // forward
-        op->forward(bottom_blob_bordered_g, top_blob_g, opt);
+        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
     }
 
     return 0;
diff --git a/src/layer/convolution.cpp b/src/layer/convolution.cpp
index b6466934a..ac6e98dac 100644
--- a/src/layer/convolution.cpp
+++ b/src/layer/convolution.cpp
@@ -53,6 +53,9 @@ int Convolution::load_param(const ParamDict& pd)
 
     use_int8_inference = pd.use_int8_inference;
 
+    if (weight_data_int8_scale == 0.f || bottom_blob_int8_scale == 0.f)
+        use_int8_inference = false;
+
     return 0;
 }
 
diff --git a/src/layer/convolutiondepthwise.cpp b/src/layer/convolutiondepthwise.cpp
index 5d416a82c..cc375a8ab 100644
--- a/src/layer/convolutiondepthwise.cpp
+++ b/src/layer/convolutiondepthwise.cpp
@@ -64,6 +64,9 @@ int ConvolutionDepthWise::load_param(const ParamDict& pd)
         return -100;
     }
 
+    if (weight_data_int8_scales.empty() || bottom_blob_int8_scales.empty())
+        use_int8_inference = false;
+
     // extend group if only one provided
     if (weight_data_int8_scales.w == 1)
     {
@@ -121,7 +124,30 @@ int ConvolutionDepthWise::load_model(const ModelBin& mb)
         for (int g=0; g<group; g++)
         {
             quantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Quantize);
+
+            ncnn::ParamDict pd;
+            pd.set(0, bottom_blob_int8_scales[g]);// scale
+
+            quantize_ops[g]->load_param(pd);
+        }
+
+        for (int g=0; g<group; g++)
+        {
             dequantize_ops[g] = ncnn::create_layer(ncnn::LayerType::Dequantize);
+
+            float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
+
+            ncnn::ParamDict pd;
+            pd.set(0, top_rescale);// scale
+            pd.set(1, bias_term);// bias_term
+            pd.set(2, 1);// bias_data_size
+
+            dequantize_ops[g]->load_param(pd);
+
+            ncnn::Mat weights[1];
+            weights[0] = Mat(1, (void*)((const float*)bias_data + g));
+
+            dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
         }
     }
 
@@ -252,11 +278,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
             {
                 // quantize, scale and round to nearest
                 {
-                    ncnn::ParamDict pd;
-                    pd.set(0, bottom_blob_int8_scales[g]);// scale
-
-                    quantize_ops[g]->load_param(pd);
-
                     ncnn::Option opt_g = opt;
                     opt_g.num_threads = 1;
                     opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -293,20 +314,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
 
                 // dequantize, reverse scale inplace
                 {
-                    float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
-
-                    ncnn::ParamDict pd;
-                    pd.set(0, top_rescale);// scale
-                    pd.set(1, bias_term);// bias_term
-                    pd.set(2, 1);// bias_data_size
-
-                    dequantize_ops[g]->load_param(pd);
-
-                    ncnn::Mat weights[1];
-                    weights[0] = Mat(1, (void*)((const float*)bias_data + g));
-
-                    dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
-
                     ncnn::Option opt_g = opt;
                     opt_g.num_threads = 1;
                     opt_g.blob_allocator = top_blob.allocator;
@@ -325,11 +332,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
             #pragma omp parallel for num_threads(opt.num_threads)
             for (int g=0; g<group; g++)
             {
-                ncnn::ParamDict pd;
-                pd.set(0, bottom_blob_int8_scales[g]);// scale
-
-                quantize_ops[g]->load_param(pd);
-
                 ncnn::Option opt_g = opt;
                 opt_g.num_threads = 1;
                 opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -387,20 +389,6 @@ int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const O
             #pragma omp parallel for num_threads(opt.num_threads)
             for (int g=0; g<group; g++)
             {
-                float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
-
-                ncnn::ParamDict pd;
-                pd.set(0, top_rescale);// scale
-                pd.set(1, bias_term);// bias_term
-                pd.set(2, num_output_g);// bias_data_size
-
-                dequantize_ops[g]->load_param(pd);
-
-                ncnn::Mat weights[1];
-                weights[0] = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
-
-                dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
-
                 ncnn::Option opt_g = opt;
                 opt_g.num_threads = 1;
                 opt_g.blob_allocator = top_blob.allocator;
diff --git a/src/layer/x86/convolutiondepthwise_x86.cpp b/src/layer/x86/convolutiondepthwise_x86.cpp
index ce09b013b..ce2eaf207 100644
--- a/src/layer/x86/convolutiondepthwise_x86.cpp
+++ b/src/layer/x86/convolutiondepthwise_x86.cpp
@@ -28,6 +28,94 @@ namespace ncnn {
 
 DEFINE_LAYER_CREATOR(ConvolutionDepthWise_x86)
 
+ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
+{
+}
+
+ConvolutionDepthWise_x86::~ConvolutionDepthWise_x86()
+{
+    for (int i=0; i<(int)group_ops.size(); i++)
+        delete group_ops[i];
+
+    group_ops.clear();
+}
+
+int ConvolutionDepthWise_x86::load_model(const ModelBin& mb)
+{
+    int ret = ConvolutionDepthWise::load_model(mb);
+    if (ret != 0)
+        return ret;
+
+    // create Convolution op for each group
+    const int maxk = kernel_w * kernel_h;
+    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
+
+    for (int i=0; i<(int)group_ops.size(); i++)
+        delete group_ops[i];
+
+    group_ops.clear();
+
+    if (channels == group && group == num_output)
+    {
+        // depth-wise specific
+        if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1)
+        {
+            if ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2))
+            {
+                return 0;
+            }
+        }
+    }
+
+    const int channels_g = channels / group;
+    const int num_output_g = num_output / group;
+
+    group_ops.resize(group);
+
+    for (int g=0; g<group; g++)
+    {
+        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
+        Mat bias_data_g;
+        if (bias_term)
+            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
+
+        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
+
+        // set param
+        ncnn::ParamDict pd;
+        pd.set(0, num_output_g);// num_output
+        pd.set(1, kernel_w);
+        pd.set(11, kernel_h);
+        pd.set(2, dilation_w);
+        pd.set(12, dilation_h);
+        pd.set(3, stride_w);
+        pd.set(13, stride_h);
+        pd.set(4, 0);// pad_w
+        pd.set(14, 0);// pad_h
+        pd.set(5, bias_term);
+        pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
+
+        if (use_int8_inference)
+        {
+            pd.set(8, weight_data_int8_scales[g]);
+            pd.set(9, bottom_blob_int8_scales[g]);
+        }
+
+        op->load_param(pd);
+
+        // set weights
+        ncnn::Mat weights[2];
+        weights[0] = weight_data_g;
+        weights[1] = bias_data_g;
+
+        op->load_model(ModelBinFromMatArray(weights));
+
+        group_ops[g] = op;
+    }
+
+    return 0;
+}
+
 int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 {
     // convolv with NxN kernel
@@ -79,8 +167,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     if (top_blob.empty())
         return -100;
 
-    const int maxk = kernel_w * kernel_h;
-
     // depth-wise
     if (channels == group && group == num_output)
     {
@@ -99,11 +185,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     #pragma omp parallel for num_threads(opt.num_threads)
                     for (int g=0; g<group; g++)
                     {
-                        ncnn::ParamDict pd;
-                        pd.set(0, bottom_blob_int8_scales[g]);// scale
-
-                        quantize_ops[g]->load_param(pd);
-
                         ncnn::Option opt_g = opt;
                         opt_g.num_threads = 1;
                         opt_g.blob_allocator = bottom_blob_bordered_int8.allocator;
@@ -126,20 +207,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
                     #pragma omp parallel for num_threads(opt.num_threads)
                     for (int g=0; g<group; g++)
                     {
-                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);
-
-                        ncnn::ParamDict pd;
-                        pd.set(0, top_rescale);// scale
-                        pd.set(1, bias_term);// bias_term
-                        pd.set(2, 1);// bias_data_size
-
-                        dequantize_ops[g]->load_param(pd);
-
-                        ncnn::Mat weights[1];
-                        weights[0] = Mat(1, (void*)((const float*)bias_data + g));
-
-                        dequantize_ops[g]->load_model(ModelBinFromMatArray(weights));
-
                         ncnn::Option opt_g = opt;
                         opt_g.num_threads = 1;
                         opt_g.blob_allocator = top_blob.allocator;
@@ -174,42 +241,8 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
         {
             Mat bottom_blob_bordered_g(w, h, 1, bottom_blob_bordered.channel(g));
             Mat top_blob_g(outw, outh, 1, top_blob.channel(g), top_blob.elemsize, top_blob.allocator);
-            Mat weight_data_g(maxk, (void*)((const unsigned char*)weight_data + maxk * g * weight_data.elemsize), weight_data.elemsize);
-            Mat bias_data_g;
-            if (bias_term)
-                bias_data_g = Mat(1, (void*)((const float*)bias_data + g));
-
-            // call Convolution
-            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
-
-            // set param
-            ncnn::ParamDict pd;
-            pd.set(0, 1);// num_output
-            pd.set(1, kernel_w);
-            pd.set(11, kernel_h);
-            pd.set(2, dilation_w);
-            pd.set(12, dilation_h);
-            pd.set(3, stride_w);
-            pd.set(13, stride_h);
-            pd.set(4, 0);// pad_w
-            pd.set(14, 0);// pad_h
-            pd.set(5, bias_term);
-            pd.set(6, maxk);// weight_data_size
-
-            if (use_int8_inference)
-            {
-                pd.set(8, weight_data_int8_scales[g]);
-                pd.set(9, bottom_blob_int8_scales[g]);
-            }
-
-            op->load_param(pd);
-
-            // set weights
-            ncnn::Mat weights[2];
-            weights[0] = weight_data_g;
-            weights[1] = bias_data_g;
 
-            op->load_model(ModelBinFromMatArray(weights));
+            const ncnn::Layer* op = group_ops[g];
 
             ncnn::Option opt_g = opt;
             opt_g.num_threads = 1;
@@ -217,8 +250,6 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
 
             // forward
             op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
-
-            delete op;
         }
 
         return 0;
@@ -231,50 +262,14 @@ int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, con
     {
         Mat bottom_blob_bordered_g(w, h, channels_g, bottom_blob_bordered.channel(channels_g * g));
         Mat top_blob_g(outw, outh, num_output_g, top_blob.channel(num_output_g * g), top_blob.elemsize, top_blob.allocator);
-        Mat weight_data_g(maxk * channels_g * num_output_g, (void*)((const unsigned char*)weight_data + maxk * channels_g * num_output_g * g * weight_data.elemsize), weight_data.elemsize);
-        Mat bias_data_g;
-        if (bias_term)
-            bias_data_g = Mat(num_output_g, (void*)((const float*)bias_data + num_output_g * g));
-
-        // call Convolution
-        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);
 
-        // set param
-        ncnn::ParamDict pd;
-        pd.set(0, num_output_g);// num_output
-        pd.set(1, kernel_w);
-        pd.set(11, kernel_h);
-        pd.set(2, dilation_w);
-        pd.set(12, dilation_h);
-        pd.set(3, stride_w);
-        pd.set(13, stride_h);
-        pd.set(4, 0);// pad_w
-        pd.set(14, 0);// pad_h
-        pd.set(5, bias_term);
-        pd.set(6, maxk * channels_g * num_output_g);// weight_data_size
-
-        if (use_int8_inference)
-        {
-            pd.set(8, weight_data_int8_scales[g]);
-            pd.set(9, bottom_blob_int8_scales[g]);
-        }
-
-        op->load_param(pd);
-
-        // set weights
-        ncnn::Mat weights[2];
-        weights[0] = weight_data_g;
-        weights[1] = bias_data_g;
-
-        op->load_model(ModelBinFromMatArray(weights));
+        const ncnn::Layer* op = group_ops[g];
 
         ncnn::Option opt_g = opt;
         opt_g.blob_allocator = top_blob.allocator;
 
         // forward
         op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
-
-        delete op;
     }
 
     return 0;
diff --git a/src/layer/x86/convolutiondepthwise_x86.h b/src/layer/x86/convolutiondepthwise_x86.h
index 82352312e..6cec2538b 100644
--- a/src/layer/x86/convolutiondepthwise_x86.h
+++ b/src/layer/x86/convolutiondepthwise_x86.h
@@ -22,7 +22,15 @@ namespace ncnn {
 class ConvolutionDepthWise_x86 : public ConvolutionDepthWise
 {
 public:
+    ConvolutionDepthWise_x86();
+    virtual ~ConvolutionDepthWise_x86();
+
+    virtual int load_model(const ModelBin& mb);
+
     virtual int forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+
+public:
+    std::vector<ncnn::Layer*> group_ops;
 };
 
 } // namespace ncnn