diff --git a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
index 11a27b1b4d..dcfaa8115c 100644
--- a/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
+++ b/mindspore/lite/nnacl/assembly/opt/MatmulDpInt8.S
@@ -421,13 +421,37 @@ End3:
     smax v17.4s, v17.4s, v7.4s
     smax v18.4s, v18.4s, v7.4s
     smax v19.4s, v19.4s, v7.4s
-
-    // Apply the act_min bound
+    smax v20.4s, v20.4s, v7.4s
+    smax v21.4s, v21.4s, v7.4s
+    smax v22.4s, v22.4s, v7.4s
+    smax v23.4s, v23.4s, v7.4s
+    smax v24.4s, v24.4s, v7.4s
+    smax v25.4s, v25.4s, v7.4s
+    smax v26.4s, v26.4s, v7.4s
+    smax v27.4s, v27.4s, v7.4s
+    smax v28.4s, v28.4s, v7.4s
+    smax v29.4s, v29.4s, v7.4s
+    smax v30.4s, v30.4s, v7.4s
+    smax v31.4s, v31.4s, v7.4s
+
+    // Apply the act_max bound
     dup v6.4s, w9
     smin v16.4s, v16.4s, v6.4s
     smin v17.4s, v17.4s, v6.4s
     smin v18.4s, v18.4s, v6.4s
     smin v19.4s, v19.4s, v6.4s
+    smin v20.4s, v20.4s, v6.4s
+    smin v21.4s, v21.4s, v6.4s
+    smin v22.4s, v22.4s, v6.4s
+    smin v23.4s, v23.4s, v6.4s
+    smin v24.4s, v24.4s, v6.4s
+    smin v25.4s, v25.4s, v6.4s
+    smin v26.4s, v26.4s, v6.4s
+    smin v27.4s, v27.4s, v6.4s
+    smin v28.4s, v28.4s, v6.4s
+    smin v29.4s, v29.4s, v6.4s
+    smin v30.4s, v30.4s, v6.4s
+    smin v31.4s, v31.4s, v6.4s

     // int32 -> int16
     sqxtn v0.4h, v16.4s
diff --git a/mindspore/lite/nnacl/opt_op_handler.c b/mindspore/lite/nnacl/opt_op_handler.c
index a3fc07a1d8..294f6af837 100644
--- a/mindspore/lite/nnacl/opt_op_handler.c
+++ b/mindspore/lite/nnacl/opt_op_handler.c
@@ -57,6 +57,6 @@ void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst,
                                   int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini,
                                   int32_t maxi, bool per_channel) {
   return MatmulInt8DpNeon64(a, b, dst, UP_ROUND(row, 8), UP_ROUND(col, 8), deep_4, input_sum, bias, mini, maxi,
-                            output_zp, multiplier[0], left_shift[0], right_shift[0], row, col, col);
+                            output_zp, multiplier[0], left_shift[0], right_shift[0], row, col, stride);
 }
 #endif
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
index 110b0a5d4b..942c11fd07 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc
@@ -38,7 +38,7 @@ Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() {
     matmul_param_ = nullptr;
   }
   if (packed_weight_ != nullptr) {
-    delete packed_weight_;
+    free(packed_weight_);
     packed_weight_ = nullptr;
   }
   FreeResizeBuf();
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
index a1e47bd2a4..fdb3cd0ddb 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc
@@ -60,6 +60,7 @@ int ConvolutionDepthwiseInt8CPUKernel::InitWeightBias() {
   for (int i = 0; i < weight_tensor->ElementsNum(); i++) {
     packed_weight_[i] = (int16_t)(tmp_weight[i] - weight_zp);
   }
+  free(tmp_weight);

   bias_data_ = reinterpret_cast<int32_t *>(malloc(channel * sizeof(int32_t)));
   if (bias_data_ == nullptr) {
diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
index 1c069ce764..211bad75dc 100644
--- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc
@@ -402,7 +402,7 @@
kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector
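Note on the two memory-management fixes above: packed_weight_ in convolution_1x1_int8.cc is allocated with malloc, so releasing it with delete was undefined behavior and is replaced by free(); convolution_depthwise_int8.cc leaked tmp_weight after repacking it into packed_weight_, which the added free(tmp_weight) closes. The C++ sketch below illustrates the ownership pattern only; the struct and method names (Int8ConvWeights, Init) are hypothetical, not the actual MindSpore kernel code.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical sketch (not the MindSpore kernel) of the ownership rules
// the patch enforces: malloc pairs with free, and temporaries are
// released as soon as their contents have been repacked.
struct Int8ConvWeights {
  int16_t *packed_weight_ = nullptr;

  bool Init(const int8_t *origin, int count, int32_t weight_zp) {
    // Temporary working copy of the origin weights.
    int8_t *tmp_weight = static_cast<int8_t *>(malloc(count));
    if (tmp_weight == nullptr) return false;
    memcpy(tmp_weight, origin, count);

    packed_weight_ = static_cast<int16_t *>(malloc(count * sizeof(int16_t)));
    if (packed_weight_ == nullptr) {
      free(tmp_weight);  // release the temporary on the error path too
      return false;
    }
    for (int i = 0; i < count; i++) {
      packed_weight_[i] = static_cast<int16_t>(tmp_weight[i] - weight_zp);
    }
    free(tmp_weight);  // done with the temporary: omitting this was the leak
    return true;
  }

  ~Int8ConvWeights() {
    if (packed_weight_ != nullptr) {
      free(packed_weight_);      // malloc'd memory must go back through free,
      packed_weight_ = nullptr;  // never delete: allocators must match
    }
  }
};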