!12181 [MSLITE] int8 matmul base

From: @ling_qiao_min Reviewed-by: Signed-off-by:
4 years ago · 2f1d4f9ef9
--- a/mindspore/lite/nnacl/int8/matmul_int8.c
+++ b/mindspore/lite/nnacl/int8/matmul_int8.c
@@ -182,40 +182,6 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int
  return;
 }
 /* Scalar int8 matrix multiply with requantization.
  *
  * a is read in a row4x16-major packed layout and b in a row16x4-major packed layout (see the index
  * arithmetic below); dst is written row-major with `stride` elements between consecutive rows.
  * For every output element the accumulated product has the matching input_sum entry subtracted and
  * bias[c] added, is rescaled with MultiplyByQuantizedMultiplier, offset by output_zp, clamped to
  * [mini, maxi] and narrowed to int8.
  *
  * peroc selects per-output-channel quantization: when true, left_shift/right_shift/multiplier are
  * indexed by column and input_sum is read from a 4-column-tiled layout; when false, element 0 of the
  * quantization arrays is used for every column and input_sum is indexed by row.
  */
 void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                        size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                        int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                        bool peroc) {
  for (int row_i = 0; row_i < row; row_i++) {
    /* Position of this row inside its 4-row tile. */
    const int row_tile = row_i / C4NUM;
    const int row_lane = row_i % C4NUM;
    for (int col_i = 0; col_i < col; col_i++) {
      /* Position of this column inside its 4-column tile. */
      const int col_tile = col_i / C4NUM;
      const int col_lane = col_i % C4NUM;

      /* Dot product over the padded depth. */
      int32_t acc = 0;
      for (int k = 0; k < deep_16; k++) {
        const int k_tile = k / C16NUM;
        const int k_lane = k % C16NUM;
        size_t lhs_idx = row_tile * deep_16 * C4NUM + k_tile * C4NUM * C16NUM + row_lane * C16NUM + k_lane;
        size_t rhs_idx = col_tile * deep_16 * C4NUM + k_tile * C4NUM * C16NUM + col_lane * C16NUM + k_lane;
        acc = acc + a[lhs_idx] * b[rhs_idx];
      }

      /* Remove the precomputed input-sum term, then add the bias. */
      int32_t sum_term;
      if (peroc) {
        sum_term = input_sum[col_tile * UP_ROUND(row, C4NUM) * C4NUM + row_i * C4NUM + col_lane];
      } else {
        sum_term = input_sum[row_i];
      }
      acc -= sum_term;
      acc += bias[col_i];

      /* Per-channel mode picks the column's own scale; otherwise entry 0 is shared. */
      const int quant_idx = peroc ? col_i : 0;
      int32_t shift_left = left_shift[quant_idx];
      int32_t shift_right = right_shift[quant_idx];
      int32_t scale_mul = multiplier[quant_idx];
      acc = MultiplyByQuantizedMultiplier(acc, scale_mul, shift_left, shift_right) + output_zp;

      /* Clamp to the activation range before narrowing. */
      acc = MSMIN(maxi, acc);
      acc = MSMAX(mini, acc);
      dst[row_i * stride + col_i] = (int8_t)acc;
    }
  }
 }
 void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                      int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
@@ -353,6 +319,105 @@ void MatMulInt8_4x16_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row
  return;
 }
 #ifdef ENABLE_ARM64
 /* ARM64 NEON helper: packs four source rows into 4x4 int8 tiles and produces their input sums.
  *
  * src_ic       first byte of the first of four rows; consecutive rows are src_stride bytes apart.
  * pack_ic      destination for the packed tiles; 16 bytes are written per tile.
  * input_sum_r  receives four int32 values: each row's byte sum multiplied by filter_zp.
  * src_stride   byte distance between consecutive source rows.
  * ic_4div      number of leading bytes per row that are consumed in full groups of four.
  * ic_4res      trailing bytes per row after ic_4div; the code dispatches on the values 0, 1, 2 and 3.
  * filter_zp    scalar multiplied into every row sum before it is stored.
  *
  * The clobber list names "cc" because the code executes cmp, and "memory" because it stores through
  * pack_ic and input_sum_r (and loads through src_ic), none of which are declared as memory operands.
  */
 void PackInput4x4AndInputSumPert_arm64(const int8_t *src_ic, int8_t *pack_ic, int32_t *input_sum_r, size_t src_stride,
                                        size_t ic_4div, size_t ic_4res, int32_t filter_zp) {
  asm volatile(
    /* v2 = running per-row sums, v3 = filter_zp in every lane, x10 = src cursor, x11 = pack cursor,
     * x14 = sum output, x15 = bytes consumed per row so far. */
    "dup v2.4s, wzr \n"
    "mov x14, %[input_sum_r] \n"
    "dup v3.4s, %w[filter_zp]  \n"
    "mov x10, %[src_ic] \n"
    "mov x11, %[pack_ic] \n"
    "mov x15, #0 \n"
    "1: \n"
    /* Flags come from the cmp; the add/mov that follow do not alter them before blt. */
    "cmp x15, %[ic_4div] \n"
    "add x15, x15, #4\n"
    "mov x12, x10 \n"
    "add x10, x10, #4\n"
    "blt 2f \n"
    /* Full groups are done: choose the tail handler from ic_4res. */
    "cmp %[ic_4res], #0\n"
    "beq 6f \n"
    "cmp %[ic_4res], #1\n"
    "beq 3f \n"
    "cmp %[ic_4res], #2\n"
    "beq 4f \n"
    "cmp %[ic_4res], #3\n"
    "beq 5f \n"
    "2: \n" /* full group: 4 bytes from each of the 4 rows */
    "ld1 {v0.s}[0], [x12], %[src_stride]\n"
    "ld1 {v0.s}[1], [x12], %[src_stride]\n"
    "ld1 {v0.s}[2], [x12], %[src_stride]\n"
    "ld1 {v0.s}[3], [x12], %[src_stride]\n"
    "st1 {v0.16b}, [x11], #16\n"
    /* Two pairwise widening adds reduce each row's 4 bytes to one int32 lane. */
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 1b \n"
    "3: \n" /* ic res 1: one byte per row, remaining lanes stay zero */
    "dup v0.4s, wzr \n"
    "ld1 {v0.b}[0],  [x12], %[src_stride]\n"
    "ld1 {v0.b}[4],  [x12], %[src_stride]\n"
    "ld1 {v0.b}[8],  [x12], %[src_stride]\n"
    "ld1 {v0.b}[12], [x12], %[src_stride]\n"
    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"
    "4: \n" /* ic res 2: one halfword per row, remaining lanes stay zero */
    "dup v0.4s, wzr \n"
    "ld1 {v0.h}[0], [x12], %[src_stride]\n"
    "ld1 {v0.h}[2], [x12], %[src_stride]\n"
    "ld1 {v0.h}[4], [x12], %[src_stride]\n"
    "ld1 {v0.h}[6], [x12], %[src_stride]\n"
    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"
    "5: \n" /* ic res 3: halfword via x12 plus the third byte via x13 = x12 + 2 */
    "dup v0.4s, wzr \n"
    "add x13, x12, #2 \n"
    "ld1 {v0.h}[0], [x12], %[src_stride]\n"
    "ld1 {v0.b}[2], [x13], %[src_stride]\n"
    "ld1 {v0.h}[2], [x12], %[src_stride]\n"
    "ld1 {v0.b}[6], [x13], %[src_stride]\n"
    "ld1 {v0.h}[4], [x12], %[src_stride]\n"
    "ld1 {v0.b}[10], [x13], %[src_stride]\n"
    "ld1 {v0.h}[6], [x12], %[src_stride]\n"
    "ld1 {v0.b}[14], [x13], %[src_stride]\n"
    "st1 {v0.16b}, [x11], #16\n"
    "saddlp v1.8h, v0.16b \n"
    "saddlp v0.4s, v1.8h \n"
    "add v2.4s, v2.4s, v0.4s \n"
    "b 6f \n"
    "6: \n" /* scale the four row sums by filter_zp and store them */
    "mul v2.4s, v2.4s, v3.4s \n"
    "st1 {v2.4s}, [x14], #16 \n"
    :
    : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r),
      [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp)
    : "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3", "cc", "memory");
  return;
 }
 #endif
 void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
                                 size_t input_channel, size_t plane_size, int32_t filter_zp) {
  int ic4 = UP_ROUND(input_channel, C4NUM);
@@ -370,99 +435,7 @@ void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input,
 #ifdef ENABLE_ARM64
    size_t src_stride = input_channel;
    size_t ic_4res = input_channel - ic_4div;
    asm volatile(
      "dup v2.4s, wzr \n"
      "mov x14, %[input_sum_r] \n"
      "dup v3.4s, %w[filter_zp]  \n"
      "mov x10, %[src_ic] \n"
      "mov x11, %[pack_ic] \n"
      "mov x15, #0 \n"
      "1: \n"
      "cmp x15, %[ic_4div] \n"
      "add x15, x15, #4\n"
      "mov x12, x10 \n"
      "add x10, x10, #4\n"
      "blt 2f \n"
      "cmp %[ic_4res], #0\n"
      "beq 6f \n"
      "cmp %[ic_4res], #1\n"
      "beq 3f \n"
      "cmp %[ic_4res], #2\n"
      "beq 4f \n"
      "cmp %[ic_4res], #3\n"
      "beq 5f \n"
      "2: \n"
      "ld1 {v0.s}[0], [x12], %[src_stride]\n"
      "ld1 {v0.s}[1], [x12], %[src_stride]\n"
      "ld1 {v0.s}[2], [x12], %[src_stride]\n"
      "ld1 {v0.s}[3], [x12], %[src_stride]\n"
      "st1 {v0.16b}, [x11], #16\n"
      "saddlp v1.8h, v0.16b \n"
      "saddlp v0.4s, v1.8h \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "b 1b \n"
      "3: \n" /* ic res 1 */
      "dup v0.4s, wzr \n"
      "ld1 {v0.b}[0],  [x12], %[src_stride]\n"
      "ld1 {v0.b}[4],  [x12], %[src_stride]\n"
      "ld1 {v0.b}[8],  [x12], %[src_stride]\n"
      "ld1 {v0.b}[12], [x12], %[src_stride]\n"
      "st1 {v0.16b}, [x11], #16\n"
      "saddlp v1.8h, v0.16b \n"
      "saddlp v0.4s, v1.8h \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "b 6f \n"
      "4: \n" /* ic res 2 */
      "dup v0.4s, wzr \n"
      "ld1 {v0.h}[0], [x12], %[src_stride]\n"
      "ld1 {v0.h}[2], [x12], %[src_stride]\n"
      "ld1 {v0.h}[4], [x12], %[src_stride]\n"
      "ld1 {v0.h}[6], [x12], %[src_stride]\n"
      "st1 {v0.16b}, [x11], #16\n"
      "saddlp v1.8h, v0.16b \n"
      "saddlp v0.4s, v1.8h \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "b 6f \n"
      "5: \n" /* ic res 3 */
      "dup v0.4s, wzr \n"
      "add x13, x12, #2 \n"
      "ld1 {v0.h}[0], [x12], %[src_stride]\n"
      "ld1 {v0.b}[2], [x13], %[src_stride]\n"
      "ld1 {v0.h}[2], [x12], %[src_stride]\n"
      "ld1 {v0.b}[6], [x13], %[src_stride]\n"
      "ld1 {v0.h}[4], [x12], %[src_stride]\n"
      "ld1 {v0.b}[10], [x13], %[src_stride]\n"
      "ld1 {v0.h}[6], [x12], %[src_stride]\n"
      "ld1 {v0.b}[14], [x13], %[src_stride]\n"
      "st1 {v0.16b}, [x11], #16\n"
      "saddlp v1.8h, v0.16b \n"
      "saddlp v0.4s, v1.8h \n"
      "add v2.4s, v2.4s, v0.4s \n"
      "b 6f \n"
      "6: \n"
      "mul v2.4s, v2.4s, v3.4s \n"
      "st1 {v2.4s}, [x14], #16 \n"
      :
      : [ src_ic ] "r"(src_ic), [ pack_ic ] "r"(pack_ic), [ input_sum_r ] "r"(input_sum_r),
        [ src_stride ] "r"(src_stride), [ ic_4div ] "r"(ic_4div), [ ic_4res ] "r"(ic_4res), [ filter_zp ] "r"(filter_zp)
      : "x10", "x11", "x12", "x13", "x14", "x15", "v0", "v1", "v2", "v3");
    PackInput4x4AndInputSumPert_arm64(src_ic, pack_ic, input_sum_r, src_stride, ic_4div, ic_4res, filter_zp);
 #else
    int32_t tmp_sum_value[4] = {0};
    for (int ici = 0; ici < ic_4div; ici += C4NUM) {
--- a/mindspore/lite/nnacl/int8/matmul_int8.h
+++ b/mindspore/lite/nnacl/int8/matmul_int8.h
@@ -25,12 +25,9 @@
 extern "C" {
 #endif
 /* 4x16 16x4 -> 4x4 */
 /* matmul */
 void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16,
                     const int *input_sum, const int *bias);
 void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                       size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
                       int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi,
                       bool per_channel);
 void RowMajor2Row16x4MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Col16x4MajorInt8(int8_t *src, int row, int col, int8_t *dst);
 void CalcInputSums(int8_t *input, int row, int col, int weight_zp, int *dst, DataOrder order);
@@ -41,6 +38,7 @@ void MatmulInt8Opt(const int8_t *a, const int8_t *b, int8_t *dst, int row, int c
                   int32_t *right_shift, size_t stride, size_t filter_peroc, int32_t *filter_zp);
 /* 8x4 4x8 -> 8x8 */
 /* optimize conv */
 void RowMajor2Row8x4MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_4,
                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -48,6 +46,7 @@ void MatMulInt8_8x8_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                      size_t per_channel);
 /* 4x16 16x2 -> 4x2 */
 /* arm32 conv1x1 */
 void RowMajor2Row2x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16,
                      size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift,
@@ -55,6 +54,7 @@ void MatMulInt8_4x2_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row,
                      bool peroc);
 /* 4x4 4x16 -> 4x16 */
 /* optimize conv1x1 */
 void RowMajor2Row4x16MajorInt8(const int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void PackInput4x4AndInputSumPert(const int8_t *src_input, int8_t *packed_input, int32_t *input_sum,
                                 size_t input_channel, size_t plane_size, int32_t filter_zp);
--- a/mindspore/lite/nnacl/int8/quantize.h
+++ b/mindspore/lite/nnacl/int8/quantize.h
@@ -66,17 +66,6 @@ typedef struct PreluQuantArg {
  QuantArg out_quant_args_;
 } PreluQuantArg;
 /* Scalar quantization settings for a matmul: three QuantArg entries plus one set of int32 values.
  * NOTE(review): the per-field meanings below are inferred from the field names only; confirm against
  * the code that fills this struct. */
 typedef struct MatmulQuantArg {
  QuantArg input;           /* presumably the quantization of the input tensor */
  QuantArg weight;          /* presumably the quantization of the weight tensor */
  QuantArg output;          /* presumably the quantization of the output tensor */
  int32_t out_act_min;      /* presumably the lower clamp applied to the quantized output */
  int32_t out_act_max;      /* presumably the upper clamp applied to the quantized output */
  int32_t left_shift;       /* presumably the left shift used when requantizing */
  int32_t right_shift;      /* presumably the right shift used when requantizing */
  int32_t quant_multiplier; /* presumably the fixed-point multiplier used when requantizing */
 } MatmulQuantArg;
 typedef struct CropQuantArg {
  QuantArg in_args_;
  QuantArg out_args_;
--- a/mindspore/lite/nnacl/matmul_parameter.h
+++ b/mindspore/lite/nnacl/matmul_parameter.h
@@ -73,4 +73,15 @@ typedef struct MatmulQuantParameter {
  int32_t *quant_multiplier_;
 } MatmulQuantParameter;
 /* Scalar quantization settings for a matmul: three QuantArg entries plus one set of int32 values
  * (no per-channel arrays are held here).
  * NOTE(review): the per-field meanings below are inferred from the field names only; confirm against
  * the code that fills this struct. */
 typedef struct MatmulQuantArg {
  QuantArg input;           /* presumably the quantization of the input tensor */
  QuantArg weight;          /* presumably the quantization of the weight tensor */
  QuantArg output;          /* presumably the quantization of the output tensor */
  int32_t out_act_min;      /* presumably the lower clamp applied to the quantized output */
  int32_t out_act_max;      /* presumably the upper clamp applied to the quantized output */
  int32_t left_shift;       /* presumably the left shift used when requantizing */
  int32_t right_shift;      /* presumably the right shift used when requantizing */
  int32_t quant_multiplier; /* presumably the fixed-point multiplier used when requantizing */
 } MatmulQuantArg;
 #endif  // MINDSPORE_LITE_NNACL_MATMUL_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.cc
@@ -67,10 +67,5 @@ int FullconnectionCPUKernel::ReSize() {
  return MatmulFp32BaseCPUKernel::ReSize();
 }
 // Runs the fully-connected layer by delegating to the shared fp32 matmul implementation.
 // Returns the status reported by MatmulFp32BaseCPUKernel::Run() so that a failure in the base
 // kernel reaches the caller instead of being reported as RET_OK.
 int FullconnectionCPUKernel::Run() {
  return MatmulFp32BaseCPUKernel::Run();
 }
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_FullConnection, LiteKernelCreator<FullconnectionCPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection_fp32.h
@@ -33,7 +33,6 @@ class FullconnectionCPUKernel : public MatmulFp32BaseCPUKernel {
  ~FullconnectionCPUKernel() = default;
  int Init() override;
  int ReSize() override;
  int Run() override;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_FULLCONNECTION_H_
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc
@@ -15,258 +15,44 @@
 */
 #include "src/runtime/kernel/arm/int8/fullconnection_int8.h"
 #include "src/runtime/runtime_api.h"
 #include "src/kernel_registry.h"
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_FullConnection;
 namespace mindspore::kernel {
 // Releases every heap array held by quant_ and resets the owning pointers to nullptr,
 // which makes repeated calls harmless.
 void FullconnectionInt8CPUKernel::FreeQuantParam() {
  // free() accepts a null pointer, so one helper covers both the allocated and the empty case.
  auto release = [](auto *&buffer) {
    free(buffer);
    buffer = nullptr;
  };
  release(quant_.filter_scale_);
  release(quant_.filter_zp_);
  release(quant_.left_shift_);
  release(quant_.right_shift_);
  release(quant_.quant_multiplier_);
 }
 // Frees the packed operands, the sum buffers and the bias copy, leaving each pointer null.
 // free(nullptr) is a no-op, so buffers that were never allocated are handled as well.
 void FullconnectionInt8CPUKernel::FreeTmpBuffer() {
  free(pack_a_ptr_);
  pack_a_ptr_ = nullptr;

  free(pack_b_ptr_);
  pack_b_ptr_ = nullptr;

  free(input_sums_);
  input_sums_ = nullptr;

  free(weight_bias_sums_);
  weight_bias_sums_ = nullptr;

  free(bias_ptr_);
  bias_ptr_ = nullptr;
 }
 int FullconnectionInt8CPUKernel::MallocQuantParam() {
  auto weight_tensor = in_tensors_.at(1);
  auto weight_quant_params = weight_tensor->quant_params();
  int col = weight_tensor->shape().front();
  filter_per_channel_ = (weight_quant_params.size() > 1);
  int init_size = filter_per_channel_ ? col : 1;
 int FullconnectionInt8CPUKernel::Init() {
  param_->batch = 1;
  param_->a_transpose_ = false;
  param_->b_transpose_ = true;
  quant_.filter_scale_ = reinterpret_cast<float *>(malloc(init_size * sizeof(float)));
  if (quant_.filter_scale_ == nullptr) {
    return RET_ERROR;
  }
  quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
  if (quant_.filter_zp_ == nullptr) {
    return RET_ERROR;
  }
  quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
  if (quant_.left_shift_ == nullptr) {
    return RET_ERROR;
  }
  quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
  if (quant_.right_shift_ == nullptr) {
    return RET_ERROR;
  }
  quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(init_size * sizeof(int32_t)));
  if (quant_.quant_multiplier_ == nullptr) {
    return RET_ERROR;
  }
  return RET_OK;
 }
  InitParameter();
 int FullconnectionInt8CPUKernel::Init() {
  auto ret = MallocQuantParam();
  auto ret = MatmulBaseInt8CPUKernel::Init();
  if (ret != RET_OK) {
    FreeQuantParam();
    MS_LOG(ERROR) << "ParallelLaunch failed";
    return ret;
  }
  auto in_quant_params = in_tensors_.at(0)->quant_params();
  quant_.input_.zp_ = in_quant_params.front().zeroPoint;
  quant_.input_.scale_ = in_quant_params.front().scale;
  auto out_quant_params = out_tensors_.at(0)->quant_params();
  quant_.output_.zp_ = out_quant_params.front().zeroPoint;
  quant_.output_.scale_ = out_quant_params.front().scale;
  auto weight_tensor = in_tensors_.at(1);
  fc_param_->b_const_ = (weight_tensor->data_c() != nullptr);
  int weight_quant_num = filter_per_channel_ ? weight_tensor->shape().front() : 1;
  auto weight_quant_params = weight_tensor->quant_params();
  for (int i = 0; i < weight_quant_num; i++) {
    quant_.filter_zp_[i] = weight_quant_params[i].zeroPoint;
    quant_.filter_scale_[i] = weight_quant_params[i].scale;
  }
  for (int i = 0; i < weight_quant_num; ++i) {
    const double in_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[i]);
    double real_multiplier = in_scale / static_cast<double>(quant_.output_.scale_);
    QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_.quant_multiplier_[i], &quant_.left_shift_[i],
                                              &quant_.right_shift_[i]);
  }
  CalculateActivationRangeQuantized(fc_param_->act_type_ == ActType_Relu, fc_param_->act_type_ == ActType_Relu6,
                                    quant_.output_.zp_, quant_.output_.scale_, &quant_.out_act_min_,
                                    &quant_.out_act_max_);
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
 }
 void FullconnectionInt8CPUKernel::InitParam() {
 int FullconnectionInt8CPUKernel::ReSize() {
  int row = 1;
  for (size_t i = 0; i < out_tensors_.at(0)->shape().size() - 1; ++i) {
    row *= (out_tensors_.at(0)->shape()).at(i);
  }
  fc_param_->row_ = row;
  fc_param_->col_ = out_tensors_.at(0)->shape().back();
  fc_param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
  param_->row_ = row;
  param_->col_ = out_tensors_.at(0)->shape().back();
  param_->deep_ = (in_tensors_.at(1)->shape()).at(1);
  fc_param_->row_4_ = UP_ROUND(fc_param_->row_, C4NUM);
  fc_param_->col_4_ = UP_ROUND(fc_param_->col_, C4NUM);
  fc_param_->col_8_ = UP_ROUND(fc_param_->col_, C8NUM);
  fc_param_->deep_16_ = UP_ROUND(fc_param_->deep_, C16NUM);
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(fc_param_->col_4_, C4NUM));
  thread_stride_ = UP_DIV(UP_DIV(fc_param_->col_4_, C4NUM), thread_count_);
  return;
 }
 // Rebuilds all shape-dependent state: frees the old buffers, recomputes the padded dimensions,
 // allocates and zeroes the packing/sum buffers, copies the optional bias and, when the weight is
 // constant, packs it and precomputes its bias sums.
 // Returns RET_OK on success, RET_ERROR when a packing/sum buffer cannot be allocated and
 // RET_MEMORY_FAILED when the bias buffer cannot be allocated.
 int FullconnectionInt8CPUKernel::ReSize() {
  FreeTmpBuffer();
  InitParam();

  const auto pack_a_bytes = fc_param_->row_4_ * fc_param_->deep_16_ * sizeof(int8_t);
  const auto pack_b_bytes = fc_param_->col_4_ * fc_param_->deep_16_ * sizeof(int8_t);
  const auto input_sum_bytes = fc_param_->row_4_ * sizeof(int);
  const auto weight_sum_bytes = fc_param_->col_4_ * sizeof(int);

  // Allocate in a fixed order; any failure releases what was obtained so far.
  pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(pack_a_bytes));
  if (pack_a_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  pack_b_ptr_ = reinterpret_cast<int8_t *>(malloc(pack_b_bytes));
  if (pack_b_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  input_sums_ = reinterpret_cast<int *>(malloc(input_sum_bytes));
  if (input_sums_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  weight_bias_sums_ = reinterpret_cast<int *>(malloc(weight_sum_bytes));
  if (weight_bias_sums_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }

  // The buffers are larger than the real shapes (padded dimensions), so start from all zeros.
  memset(pack_a_ptr_, 0, pack_a_bytes);
  memset(pack_b_ptr_, 0, pack_b_bytes);
  memset(input_sums_, 0, input_sum_bytes);
  memset(weight_bias_sums_, 0, weight_sum_bytes);

  // A third input tensor, when present, is the bias; keep a private copy of its col_ values.
  if (in_tensors_.size() != 3) {
    bias_ptr_ = nullptr;
  } else {
    bias_ptr_ = reinterpret_cast<int *>(malloc(fc_param_->col_4_ * sizeof(int)));
    if (bias_ptr_ == nullptr) {
      MS_LOG(ERROR) << "Memory allocation failed";
      FreeTmpBuffer();
      return RET_MEMORY_FAILED;
    }
    memcpy(bias_ptr_, in_tensors_.at(2)->data_c(), fc_param_->col_ * sizeof(int));
  }

  // A constant weight is packed once here rather than on every Run().
  if (fc_param_->b_const_) {
    auto *weights = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
    RowMajor2Row16x4MajorInt8(weights, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
    CalcWeightBiasSums(weights, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
                       weight_bias_sums_, ColMajor, filter_per_channel_);
  }
  return RET_OK;
 }
 // Computes the slice of output columns owned by task_id.
 // Each task covers thread_stride_ * C4NUM columns; a task whose slice starts at or past col_ has
 // nothing to do. Always returns RET_OK.
 int FullconnectionInt8CPUKernel::RunImpl(int task_id) {
  const int cols_per_task = thread_stride_ * C4NUM;
  const int col_begin = task_id * cols_per_task;
  const int cols_left = fc_param_->col_ - col_begin;
  const int cur_cols = MSMIN(cols_per_task, cols_left);
  if (cur_cols <= 0) {
    return RET_OK;
  }

  // Per-channel quantization arrays are indexed by column; per-layer ones hold a single entry.
  const int quant_offset = filter_per_channel_ ? col_begin : 0;
  int32_t *task_left_shift = quant_.left_shift_ + quant_offset;
  int32_t *task_right_shift = quant_.right_shift_ + quant_offset;
  int32_t *task_multiplier = quant_.quant_multiplier_ + quant_offset;
  int32_t *task_filter_zp = quant_.filter_zp_ + quant_offset;

  MatmulInt8Opt(pack_a_ptr_, pack_b_ptr_ + col_begin * fc_param_->deep_16_, c_ptr_ + col_begin, fc_param_->row_,
                cur_cols, fc_param_->deep_16_, input_sums_, weight_bias_sums_ + col_begin, quant_.out_act_min_,
                quant_.out_act_max_, quant_.output_.zp_, task_multiplier, task_left_shift, task_right_shift,
                fc_param_->col_, filter_per_channel_, task_filter_zp);
  return RET_OK;
 }
 int FcInt8Run(void *cdata, int task_id) {
  auto fc = reinterpret_cast<FullconnectionInt8CPUKernel *>(cdata);
  auto ret = fc->RunImpl(task_id);
  auto ret = MatmulBaseInt8CPUKernel::ReSize();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "FcInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return ret;
  }
  return RET_OK;
 }
 int FullconnectionInt8CPUKernel::Run() {
  auto input_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
  RowMajor2Row16x4MajorInt8(input_ptr, pack_a_ptr_, fc_param_->row_, fc_param_->deep_);
  int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
  CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, tmp_weight_zp, input_sums_, RowMajor);
  if (!fc_param_->b_const_) {
    auto weight_data = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
    RowMajor2Row16x4MajorInt8(weight_data, pack_b_ptr_, fc_param_->col_, fc_param_->deep_);
    CalcWeightBiasSums(weight_data, fc_param_->deep_, fc_param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
                       weight_bias_sums_, ColMajor, filter_per_channel_);
  }
  c_ptr_ = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
  auto ret = ParallelLaunch(this->context_->thread_pool_, FcInt8Run, this, thread_count_);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ParallelLaunch failed";
    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed";
    return ret;
  }
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -18,52 +18,19 @@
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_FULLCONNECTION_INT8_H_
 #include <vector>
 #include "src/lite_kernel.h"
 #include "include/errorcode.h"
 #include "mindspore/lite/nnacl/int8/quantize.h"
 #include "nnacl/common_func.h"
 #include "nnacl/int8/common_func_int8.h"
 #include "nnacl/int8/matmul_int8.h"
 #include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
 namespace mindspore::kernel {
 class FullconnectionInt8CPUKernel : public LiteKernel {
 class FullconnectionInt8CPUKernel : public MatmulBaseInt8CPUKernel {
 public:
  FullconnectionInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                              const std::vector<lite::Tensor *> &outputs, const mindspore::lite::InnerContext *ctx,
                              const mindspore::lite::PrimitiveC *primitive)
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    fc_param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
  }
  ~FullconnectionInt8CPUKernel() override {
    FreeTmpBuffer();
    FreeQuantParam();
  }
      : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~FullconnectionInt8CPUKernel() override = default;
  int Init() override;
  int ReSize() override;
  int Run() override;
 public:
  int RunImpl(int task_id);
 private:
  void InitParam();
  void FreeTmpBuffer();
  void FreeQuantParam();
  int MallocQuantParam();
 private:
  MatMulParameter *fc_param_ = nullptr;
  MatmulQuantParameter quant_;
  int thread_count_ = 1;
  int thread_stride_ = 0;
  int8_t *pack_a_ptr_ = nullptr;
  int8_t *pack_b_ptr_ = nullptr;
  int8_t *c_ptr_ = nullptr;
  int *input_sums_ = nullptr;
  int *weight_bias_sums_ = nullptr;
  int *bias_ptr_ = nullptr;
  bool filter_per_channel_ = true;
 };
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.cc
@@ -0,0 +1,323 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
 #include "src/runtime/runtime_api.h"
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 namespace mindspore::kernel {
 int MatmulBaseInt8Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<MatmulBaseInt8CPUKernel *>(cdata);
  auto ret = op->RunImpl(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return ret;
  }
  return RET_OK;
 }
 // Computes the slice of output columns owned by task_id for the current batch pointers.
 // Each task covers thread_stride_ * C4NUM columns; a task whose slice starts at or past col_ has
 // nothing to do. Always returns RET_OK.
 int MatmulBaseInt8CPUKernel::RunImpl(int task_id) {
  const int cols_per_task = thread_stride_ * C4NUM;
  const int col_begin = task_id * cols_per_task;
  const int cols_left = param_->col_ - col_begin;
  const int cur_cols = MSMIN(cols_per_task, cols_left);
  if (cur_cols <= 0) {
    return RET_OK;
  }

  // Per-channel quantization arrays are indexed by column; per-layer ones hold a single entry.
  const int quant_offset = filter_per_channel_ ? col_begin : 0;
  int32_t *task_left_shift = quant_.left_shift_ + quant_offset;
  int32_t *task_right_shift = quant_.right_shift_ + quant_offset;
  int32_t *task_multiplier = quant_.quant_multiplier_ + quant_offset;
  int32_t *task_filter_zp = quant_.filter_zp_ + quant_offset;

  MatmulInt8Opt(pack_a_ptr_, batch_b_ptr_ + col_begin * param_->deep_16_, batch_c_ptr_ + col_begin, param_->row_,
                cur_cols, param_->deep_16_, input_sums_, weight_bias_sums_ + col_begin, quant_.out_act_min_,
                quant_.out_act_max_, quant_.output_.zp_, task_multiplier, task_left_shift, task_right_shift,
                param_->col_, filter_per_channel_, task_filter_zp);
  return RET_OK;
 }
 // Releases everything the kernel allocated: quantization arrays, temporary buffers and the bias copy.
 MatmulBaseInt8CPUKernel::~MatmulBaseInt8CPUKernel() {
  FreeQuantParam();
  FreeTmpBuffer();
  // FreeTmpBuffer() leaves the bias copy alone, so it is released here; free(nullptr) is a no-op.
  free(bias_ptr_);
  bias_ptr_ = nullptr;
 }
 // Releases every heap array held by quant_ and resets the owning pointers to nullptr,
 // which makes repeated calls harmless.
 void MatmulBaseInt8CPUKernel::FreeQuantParam() {
  // free() accepts a null pointer, so one helper covers both the allocated and the empty case.
  auto release = [](auto *&buffer) {
    free(buffer);
    buffer = nullptr;
  };
  release(quant_.filter_scale_);
  release(quant_.filter_zp_);
  release(quant_.left_shift_);
  release(quant_.right_shift_);
  release(quant_.quant_multiplier_);
 }
 // Decides between per-channel and per-layer weight quantization and allocates the quant_ arrays.
 // Per-channel is selected when the weight tensor carries more than one quant entry; the arrays then
 // hold one element per entry of the weight's first dimension, otherwise a single element.
 // Returns RET_OK, or RET_ERROR at the first failed allocation (arrays allocated before the failure
 // stay assigned in quant_).
 int MatmulBaseInt8CPUKernel::MallocQuantParam() {
  auto *weight = in_tensors_.at(1);
  auto weight_quant = weight->quant_params();
  const int channels = weight->shape().front();
  filter_per_channel_ = (weight_quant.size() > 1);

  int element_count = 1;
  if (filter_per_channel_) {
    element_count = channels;
  }
  const auto float_bytes = element_count * sizeof(float);
  const auto int32_bytes = element_count * sizeof(int32_t);

  quant_.filter_scale_ = reinterpret_cast<float *>(malloc(float_bytes));
  if (quant_.filter_scale_ == nullptr) {
    return RET_ERROR;
  }

  quant_.filter_zp_ = reinterpret_cast<int32_t *>(malloc(int32_bytes));
  if (quant_.filter_zp_ == nullptr) {
    return RET_ERROR;
  }

  quant_.left_shift_ = reinterpret_cast<int32_t *>(malloc(int32_bytes));
  if (quant_.left_shift_ == nullptr) {
    return RET_ERROR;
  }

  quant_.right_shift_ = reinterpret_cast<int32_t *>(malloc(int32_bytes));
  if (quant_.right_shift_ == nullptr) {
    return RET_ERROR;
  }

  quant_.quant_multiplier_ = reinterpret_cast<int32_t *>(malloc(int32_bytes));
  if (quant_.quant_multiplier_ == nullptr) {
    return RET_ERROR;
  }
  return RET_OK;
 }
 // Fills quant_ from the tensors' quantization entries: input/output zero point and scale, the
 // weight's zero point and scale (one per channel, or a single entry), the derived fixed-point
 // multiplier and shifts, and finally the clamped activation range.
 // The quant_ arrays must already have been allocated for the same channel count.
 void MatmulBaseInt8CPUKernel::InitQuantParam() {
  // Activations use the first quant entry of their tensor.
  auto input_quant = in_tensors_.at(0)->quant_params();
  quant_.input_.zp_ = input_quant.front().zeroPoint;
  quant_.input_.scale_ = input_quant.front().scale;

  auto output_quant = out_tensors_.at(0)->quant_params();
  quant_.output_.zp_ = output_quant.front().zeroPoint;
  quant_.output_.scale_ = output_quant.front().scale;

  auto *weight = in_tensors_.at(1);
  int channel_count = 1;
  if (filter_per_channel_) {
    channel_count = weight->shape().front();
  }
  auto weight_quant = weight->quant_params();

  // Copy the weight quantization first ...
  for (int ch = 0; ch < channel_count; ch++) {
    quant_.filter_zp_[ch] = weight_quant[ch].zeroPoint;
    quant_.filter_scale_[ch] = weight_quant[ch].scale;
  }

  // ... then turn input_scale * filter_scale / output_scale into a multiplier and shift pair.
  for (int ch = 0; ch < channel_count; ++ch) {
    const double combined_scale = static_cast<double>(quant_.input_.scale_ * quant_.filter_scale_[ch]);
    double requant_scale = combined_scale / static_cast<double>(quant_.output_.scale_);
    QuantizeRoundParameterWithDoublePrecision(requant_scale, &quant_.quant_multiplier_[ch], &quant_.left_shift_[ch],
                                              &quant_.right_shift_[ch]);
  }

  const bool is_relu = param_->act_type_ == ActType_Relu;
  const bool is_relu6 = param_->act_type_ == ActType_Relu6;
  CalculateActivationRangeQuantized(is_relu, is_relu6, quant_.output_.zp_, quant_.output_.scale_,
                                    &quant_.out_act_min_, &quant_.out_act_max_);
 }
 // Marks each matmul operand as constant when its tensor's data_c() is non-null at this point.
 void MatmulBaseInt8CPUKernel::InitParameter() {
  const bool lhs_has_data = in_tensors_[0]->data_c() != nullptr;
  const bool rhs_has_data = in_tensors_[1]->data_c() != nullptr;
  param_->a_const_ = lhs_has_data;
  param_->b_const_ = rhs_has_data;
 }
 // Derives the padded dimensions and the threading split from row_/col_/deep_:
 // rows and columns are rounded up to C4NUM, depth to C16NUM, and the 4-column tiles are divided
 // across at most thread_num_ tasks.
 void MatmulBaseInt8CPUKernel::ResizeParameter() {
  param_->row_align_ = UP_ROUND(param_->row_, C4NUM);
  param_->col_align_ = UP_ROUND(param_->col_, C4NUM);
  param_->deep_16_ = UP_ROUND(param_->deep_, C16NUM);

  // Never start more tasks than there are column tiles.
  const auto col_tiles = UP_DIV(param_->col_align_, C4NUM);
  thread_count_ = MSMIN(op_parameter_->thread_num_, col_tiles);
  thread_stride_ = UP_DIV(col_tiles, thread_count_);
 }
 // Frees the packed operands and the two sum buffers, leaving each pointer null.
 // free(nullptr) is a no-op, so buffers that were never allocated are handled as well.
 void MatmulBaseInt8CPUKernel::FreeTmpBuffer() {
  free(pack_a_ptr_);
  pack_a_ptr_ = nullptr;

  free(pack_b_ptr_);
  pack_b_ptr_ = nullptr;

  free(input_sums_);
  input_sums_ = nullptr;

  free(weight_bias_sums_);
  weight_bias_sums_ = nullptr;
 }
 // Packs the B operand of every batch into pack_b_ptr_ and computes the matching weight/bias sums.
 // The packing routine and the data order passed to CalcWeightBiasSums depend on b_transpose_.
 void MatmulBaseInt8CPUKernel::TransferB() {
  auto *weights = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
  for (int batch_i = 0; batch_i < param_->batch; batch_i++) {
    // Each batch owns deep_ * col_ source values and col_align_ * deep_16_ packed values.
    auto batch_src = weights + batch_i * param_->deep_ * param_->col_;
    auto batch_packed = pack_b_ptr_ + batch_i * param_->col_align_ * param_->deep_16_;
    auto batch_sums = weight_bias_sums_ + batch_i * param_->col_align_;

    if (!param_->b_transpose_) {
      RowMajor2Col16x4MajorInt8(batch_src, param_->deep_, param_->col_, batch_packed);
      CalcWeightBiasSums(batch_src, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
                         batch_sums, RowMajor, false);
      continue;
    }
    RowMajor2Row16x4MajorInt8(batch_src, batch_packed, param_->col_, param_->deep_);
    CalcWeightBiasSums(batch_src, param_->deep_, param_->col_, quant_.input_.zp_, quant_.filter_zp_, bias_ptr_,
                       batch_sums, ColMajor, filter_per_channel_);
  }
 }
 int MatmulBaseInt8CPUKernel::InitTmpBuffer() {
  pack_a_ptr_ = reinterpret_cast<int8_t *>(malloc(param_->row_align_ * param_->deep_16_ * sizeof(int8_t)));
  if (pack_a_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  pack_b_ptr_ =
    reinterpret_cast<int8_t *>(malloc(param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t)));
  if (pack_b_ptr_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  input_sums_ = reinterpret_cast<int *>(malloc(param_->row_align_ * sizeof(int)));
  if (input_sums_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  weight_bias_sums_ = reinterpret_cast<int *>(malloc(param_->batch * param_->col_align_ * sizeof(int)));
  if (weight_bias_sums_ == nullptr) {
    FreeTmpBuffer();
    return RET_ERROR;
  }
  memset(pack_a_ptr_, 0, param_->row_align_ * param_->deep_16_ * sizeof(int8_t));
  memset(pack_b_ptr_, 0, param_->batch * param_->col_align_ * param_->deep_16_ * sizeof(int8_t));
  memset(input_sums_, 0, param_->row_align_ * sizeof(int));
  memset(weight_bias_sums_, 0, param_->batch * param_->col_align_ * sizeof(int));
  return RET_OK;
 }
 // Copies the optional bias (third input tensor) into a private buffer whose
 // length is rounded up to a multiple of C4NUM. With no bias input, bias_ptr_
 // is set to nullptr.
 // Returns RET_OK on success, RET_MEMORY_FAILED when the buffer cannot be allocated.
 int MatmulBaseInt8CPUKernel::InitBias() {
  if (in_tensors_.size() != 3) {
    bias_ptr_ = nullptr;
    return RET_OK;
  }
  auto bias_tensor = in_tensors_[2];
  const auto bias_count = bias_tensor->ElementsNum();
  int max_bias_data = UP_ROUND(bias_count, C4NUM);
  // calloc instead of malloc: only bias_count values are copied below, so the
  // rounding padding would otherwise hold indeterminate values.
  bias_ptr_ = reinterpret_cast<int *>(calloc(max_bias_data, sizeof(int)));
  if (bias_ptr_ == nullptr) {
    MS_LOG(ERROR) << "Memory allocation failed";
    FreeTmpBuffer();
    return RET_MEMORY_FAILED;
  }
  memcpy(bias_ptr_, bias_tensor->data_c(), bias_count * sizeof(int));
  return RET_OK;
 }
 // One-time setup: allocate the quantization parameters, fill them in, then
 // load the bias. If either fallible step fails, the quantization parameters
 // are freed and that step's error code is returned.
 int MatmulBaseInt8CPUKernel::Init() {
  int status = MallocQuantParam();
  if (status == RET_OK) {
    InitQuantParam();
    status = InitBias();
  }
  if (status != RET_OK) {
    FreeQuantParam();
  }
  return status;
 }
 // Rebuilds all shape-dependent state: drops the old scratch buffers, recomputes
 // the padded sizes / thread split, and reallocates. When the right-hand matrix
 // is constant it is packed here once instead of on every Run().
 int MatmulBaseInt8CPUKernel::ReSize() {
  FreeTmpBuffer();
  ResizeParameter();

  const int status = InitTmpBuffer();
  if (status != RET_OK) {
    FreeQuantParam();
    return status;
  }

  if (param_->b_const_) {
    TransferB();
  }
  return RET_OK;
 }
 int MatmulBaseInt8CPUKernel::Run() {
  if (param_->b_const_ == false) {
    TransferB();
  }
  int8_t *a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
  int8_t *c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
  int32_t tmp_weight_zp = filter_per_channel_ ? 1 : quant_.filter_zp_[0];
  for (int i = 0; i < param_->batch; i++) {
    auto current_src_a = a_ptr + i * param_->row_ * param_->deep_;
    if (param_->a_transpose_) {
      RowMajor2Col16x4MajorInt8(current_src_a, param_->deep_, param_->row_, pack_a_ptr_);
      CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, ColMajor);
    } else {
      RowMajor2Row16x4MajorInt8(current_src_a, pack_a_ptr_, param_->row_, param_->deep_);
      CalcInputSums(current_src_a, param_->row_, param_->deep_, tmp_weight_zp, input_sums_, RowMajor);
    }
    batch_b_ptr_ = pack_b_ptr_ + i * param_->col_align_ * param_->deep_16_;
    batch_sums_ = weight_bias_sums_ + i * param_->col_align_;
    batch_c_ptr_ = c_ptr + i * param_->row_ * param_->col_;
    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulBaseInt8Run, this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]";
      return ret;
    }
  }
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_base_int8.h
@@ -0,0 +1,82 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
 #include <vector>
 #include "include/errorcode.h"
 #include "include/context.h"
 #include "src/lite_kernel.h"
 #include "nnacl/matmul_parameter.h"
 #include "nnacl/common_func.h"
 #include "nnacl/int8/quantize.h"
 #include "nnacl/int8/common_func_int8.h"
 #include "nnacl/int8/matmul_int8.h"
 namespace mindspore::kernel {
 // Shared base for int8 matmul-style CPU kernels: owns the quantization
 // parameters, the packed operand buffers and the thread split.
 class MatmulBaseInt8CPUKernel : public LiteKernel {
  public:
   // Forwards everything to LiteKernel and views op_parameter_ as a MatMulParameter.
   MatmulBaseInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                           const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                           const mindspore::lite::PrimitiveC *primitive)
       : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
     param_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
   }
   ~MatmulBaseInt8CPUKernel() override;

   // LiteKernel life-cycle hooks.
   int Init() override;
   int ReSize() override;
   int Run() override;

   // Per-task worker body; takes the task index.
   int RunImpl(int task_id);

  protected:
   // Sets a_const_ / b_const_ from whether the input tensors already hold data.
   void InitParameter();

  private:
   // Shape-dependent sizing and bias loading.
   void ResizeParameter();
   int InitBias();

   // Scratch-buffer management and operand packing.
   int InitTmpBuffer();
   void FreeTmpBuffer();
   void TransferA();
   void TransferB();

   // Quantization-parameter life cycle.
   int MallocQuantParam();
   void FreeQuantParam();
   void InitQuantParam();

  protected:
   MatMulParameter *param_ = nullptr;  // op_parameter_ viewed as MatMulParameter
   MatmulQuantParameter quant_;
   int thread_count_ = 1;   // number of parallel tasks launched per batch
   int thread_stride_ = 0;  // C4NUM-wide column blocks handled by each task
   int8_t *pack_a_ptr_ = nullptr;     // packed left-hand matrix (one batch)
   int8_t *pack_b_ptr_ = nullptr;     // packed right-hand matrix (all batches)
   int *input_sums_ = nullptr;        // output of CalcInputSums (one batch)
   int *weight_bias_sums_ = nullptr;  // output of CalcWeightBiasSums (all batches)
   int *bias_ptr_ = nullptr;          // private copy of the optional bias input
   bool filter_per_channel_ = true;
   // Pointers into the buffers above for the batch currently being processed.
   int8_t *batch_b_ptr_ = nullptr;
   int8_t *batch_c_ptr_ = nullptr;
   int *batch_sums_ = nullptr;
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_BASE_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
@@ -22,46 +22,27 @@
 #include "src/kernel_registry.h"
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;
 namespace mindspore::kernel {
 MatmulInt8CPUKernel::~MatmulInt8CPUKernel() { FreeTmpBuffer(); }
 int MatmulInt8CPUKernel::Init() {
  InitParameter();
 void MatmulInt8CPUKernel::FreeTmpBuffer() {
  if (a_r4x16_ptr_ != nullptr) {
    context_->allocator->Free(a_r4x16_ptr_);
    a_r4x16_ptr_ = nullptr;
  }
  if (input_sums_ != nullptr) {
    context_->allocator->Free(input_sums_);
    input_sums_ = nullptr;
  }
  if (b_c16x4_batch_ != nullptr) {
    context_->allocator->Free(b_c16x4_batch_);
    b_c16x4_batch_ = nullptr;
  }
  if (weight_bias_sums_batch_ != nullptr) {
    context_->allocator->Free(weight_bias_sums_batch_);
    weight_bias_sums_batch_ = nullptr;
  }
  if (bias_ptr_ != nullptr) {
    context_->allocator->Free(bias_ptr_);
    bias_ptr_ = nullptr;
  auto ret = MatmulBaseInt8CPUKernel::Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "ParallelLaunch failed";
    return ret;
  }
  return;
 }
 // Returns RET_OK without doing any work while InferShapeDone() is false;
 // otherwise performs the shape-dependent setup and returns the result of ReSize().
 int MatmulInt8CPUKernel::Init() {
  if (!InferShapeDone()) {
    return RET_OK;
  }
  return ReSize();
 }
 int MatmulInt8CPUKernel::ReSize() {
  FreeTmpBuffer();
  int batch = 1;
  auto x_shape = in_tensors_.at(0)->shape();
  auto o_shape = out_tensors_.at(0)->shape();
@@ -69,159 +50,19 @@ int MatmulInt8CPUKernel::ReSize() {
  for (size_t i = 0; i < x_shape.size() - 2; ++i) {
    batch *= x_shape[i];
  }
  params_->batch = batch;
  param_->batch = batch;
  MS_ASSERT(o_shape.size() >= 2);
  params_->row_ = o_shape[o_shape.size() - 2];
  params_->col_ = o_shape[o_shape.size() - 1];
  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
  params_->row_4_ = UP_ROUND(params_->row_, 4);
  params_->col_4_ = UP_ROUND(params_->col_, 4);
  params_->deep_16_ = UP_ROUND(params_->deep_, 16);
  a_r4x16_ptr_ =
    reinterpret_cast<int8_t *>(context_->allocator->Malloc(params_->row_4_ * params_->deep_16_ * sizeof(int8_t)));
  if (!a_r4x16_ptr_) return RET_MEMORY_FAILED;
  memset(a_r4x16_ptr_, 0, params_->row_4_ * params_->deep_16_ * sizeof(int8_t));
  input_sums_ = reinterpret_cast<int *>(context_->allocator->Malloc(params_->row_4_ * sizeof(int)));
  if (!input_sums_) return RET_MEMORY_FAILED;
  memset(input_sums_, 0, params_->row_4_ * sizeof(int));
  b_c16x4_batch_ = reinterpret_cast<int8_t *>(
    context_->allocator->Malloc(params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t)));
  if (!b_c16x4_batch_) return RET_MEMORY_FAILED;
  memset(b_c16x4_batch_, 0, params_->batch * params_->col_4_ * params_->deep_16_ * sizeof(int8_t));
  weight_bias_sums_batch_ =
    reinterpret_cast<int *>(context_->allocator->Malloc(params_->batch * params_->col_4_ * sizeof(int)));
  if (!weight_bias_sums_batch_) return RET_MEMORY_FAILED;
  memset(weight_bias_sums_batch_, 0, params_->batch * params_->col_4_ * sizeof(int));
  if (in_tensors_.size() == 3) {
    auto bias_size = params_->col_4_ * sizeof(int);
    bias_ptr_ = reinterpret_cast<int *>(context_->allocator->Malloc(bias_size));
    if (!bias_ptr_) return RET_MEMORY_FAILED;
    memcpy(bias_ptr_, in_tensors_[2]->data_c(), bias_size);
  } else {
    bias_ptr_ = NULL;
  }
  thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(params_->col_4_, 4));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_4_, 4), thread_count_);
  auto input_tensor = in_tensors_.at(0);
  auto params = input_tensor->quant_params();
  MS_ASSERT(params.size() == 1);
  quant_params_.input.zp_ = params.front().zeroPoint;
  quant_params_.input.scale_ = params.front().scale;
  auto weight_tensor = in_tensors_.at(1);
  params = weight_tensor->quant_params();
  MS_ASSERT(params.size() == 1);
  quant_params_.weight.zp_ = params.front().zeroPoint;
  quant_params_.weight.scale_ = params.front().scale;
  auto output_tensor = out_tensors_.at(0);
  params = output_tensor->quant_params();
  MS_ASSERT(params.size() == 1);
  quant_params_.output.zp_ = params.front().zeroPoint;
  quant_params_.output.scale_ = params.front().scale;
  params_->b_const_ = (in_tensors_.at(1)->data_c() != nullptr);
  if (params_->b_const_) {
    auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
    for (int i = 0; i < params_->batch; ++i) {
      auto cur_b = b_ptr + i * params_->deep_ * params_->col_;
      auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
      auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
      if (params_->b_transpose_) {
        RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
                           bias_ptr_, cur_sums, ColMajor, false);
      } else {
        RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
                           bias_ptr_, cur_sums, RowMajor, false);
      }
    }
  }
  double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
  QuantizeRoundParameterWithDoublePrecision(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
                                            &quant_params_.right_shift);
  return RET_OK;
 }
 // Computes one task's slice of output columns for the current batch.
 // task_id selects a run of thread_stride_ four-column blocks; a task whose
 // slice starts at or beyond the last block returns RET_OK without working.
 int MatmulInt8CPUKernel::RunImpl(int task_id) {
  // Four-column blocks remaining for this task (<= 0 means nothing to do).
  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_4_, 4) - task_id * thread_stride_);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  // Column count for this task measured against the unpadded col_.
  int cur_oc_res = MSMIN(thread_stride_ * C4NUM, params_->col_ - task_id * thread_stride_ * C4NUM);
  // Advance packed weights, bias sums and output to this task's first column.
  auto cur_b = b_c16x4_ptr_ + task_id * thread_stride_ * 4 * params_->deep_16_;
  auto cur_bias = weight_bias_sums_ + task_id * thread_stride_ * 4;
  auto cur_c = c_ptr_ + task_id * thread_stride_ * 4;
  auto &p = quant_params_;
  // Both variants pass false as the final argument and clamp to [INT8_MIN, INT8_MAX].
 #ifdef ENABLE_ARM64
  MatmulInt8Neon64(a_r4x16_ptr_, cur_b, cur_c, params_->row_4_, cur_oc * C4NUM, params_->deep_16_, input_sums_,
                   cur_bias, INT8_MIN, INT8_MAX, p.output.zp_, &p.quant_multiplier, &p.left_shift, &p.right_shift,
                   params_->row_, cur_oc_res, params_->col_ * sizeof(int8_t), false);
 #else
  MatMulInt8_16x4_r(a_r4x16_ptr_, cur_b, cur_c, params_->row_, cur_oc_res, params_->deep_16_, params_->col_,
                    input_sums_, cur_bias, &p.left_shift, &p.right_shift, &p.quant_multiplier, p.output.zp_, INT8_MIN,
                    INT8_MAX, false);
 #endif
  return RET_OK;
 }
  param_->row_ = o_shape[o_shape.size() - 2];
  param_->col_ = o_shape[o_shape.size() - 1];
  param_->deep_ = param_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
 int MatmulInt8Run(void *cdata, int task_id) {
  auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
  auto ret = op->RunImpl(task_id);
  auto ret = MatmulBaseInt8CPUKernel::ReSize();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
    MS_LOG(ERROR) << "MatmulBaseInt8CPUKernel failed";
    return ret;
  }
  return RET_OK;
 }
 // Runs the batched int8 matmul: optionally re-packs the right-hand matrix,
 // then for every batch packs the left-hand matrix, computes its input sums
 // and launches MatmulInt8Run on thread_count_ tasks.
 // Returns the first non-OK ParallelLaunch result, otherwise RET_OK.
 int MatmulInt8CPUKernel::Run() {
  auto a_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(0)->data_c());
  auto c_ptr = reinterpret_cast<int8_t *>(out_tensors_.at(0)->data_c());
  // Element counts of one batch of A, B and C.
  auto a_stride = params_->row_ * params_->deep_;
  auto b_stride = params_->deep_ * params_->col_;
  auto c_stride = params_->row_ * params_->col_;
  // A non-constant B is packed (and its weight/bias sums computed) on every run.
  if (!params_->b_const_) {
    auto b_ptr = reinterpret_cast<int8_t *>(in_tensors_.at(1)->data_c());
    for (int i = 0; i < params_->batch; ++i) {
      auto cur_b = b_ptr + i * b_stride;
      auto cur_b_pack = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
      auto cur_sums = weight_bias_sums_batch_ + i * params_->col_4_;
      if (params_->b_transpose_) {
        RowMajor2Row16x4MajorInt8(cur_b, cur_b_pack, params_->col_, params_->deep_);
        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
                           bias_ptr_, cur_sums, ColMajor, false);
      } else {
        RowMajor2Col16x4MajorInt8(cur_b, params_->deep_, params_->col_, cur_b_pack);
        CalcWeightBiasSums(cur_b, params_->deep_, params_->col_, quant_params_.input.zp_, &quant_params_.weight.zp_,
                           bias_ptr_, cur_sums, RowMajor, false);
      }
    }
  }
  for (int i = 0; i < params_->batch; ++i) {
    auto cur_a_ptr = a_ptr + i * a_stride;
    // Pack this batch of A; the packing routine and layout tag depend on a_transpose_.
    if (params_->a_transpose_) {
      RowMajor2Col16x4MajorInt8(cur_a_ptr, params_->deep_, params_->row_, a_r4x16_ptr_);
      CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, ColMajor);
    } else {
      RowMajor2Row16x4MajorInt8(cur_a_ptr, a_r4x16_ptr_, params_->row_, params_->deep_);
      CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, quant_params_.weight.zp_, input_sums_, RowMajor);
    }
    // Publish the per-batch pointers that RunImpl reads.
    b_c16x4_ptr_ = b_c16x4_batch_ + i * params_->col_4_ * params_->deep_16_;
    weight_bias_sums_ = weight_bias_sums_batch_ + i * params_->col_4_;
    c_ptr_ = c_ptr + i * c_stride;
    auto ret = ParallelLaunch(this->context_->thread_pool_, MatmulInt8Run, this, thread_count_);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]";
      return ret;
    }
  }
  return RET_OK;
 }
 REG_KERNEL(kCPU, kNumberTypeInt8, PrimitiveType_MatMul, LiteKernelCreator<MatmulInt8CPUKernel>)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
@@ -22,39 +22,18 @@
 #include "nnacl/matmul_parameter.h"
 #include "mindspore/lite/nnacl/int8/quantize.h"
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/int8/matmul_base_int8.h"
 using mindspore::lite::InnerContext;
 namespace mindspore::kernel {
 class MatmulInt8CPUKernel : public LiteKernel {
 class MatmulInt8CPUKernel : public MatmulBaseInt8CPUKernel {
 public:
  MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,
                      const std::vector<lite::Tensor *> &outputs, const InnerContext *ctx,
                      const std::vector<lite::Tensor *> &outputs, const lite::InnerContext *ctx,
                      const mindspore::lite::PrimitiveC *primitive)
      : LiteKernel(parameter, inputs, outputs, ctx, primitive) {
    params_ = reinterpret_cast<MatMulParameter *>(op_parameter_);
  }
  ~MatmulInt8CPUKernel() override;
      : MatmulBaseInt8CPUKernel(parameter, inputs, outputs, ctx, primitive) {}
  ~MatmulInt8CPUKernel() override = default;
  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);
 private:
  void FreeTmpBuffer();
 private:
  MatMulParameter *params_ = nullptr;
  MatmulQuantArg quant_params_;
  int8_t *a_r4x16_ptr_ = nullptr;
  int8_t *b_c16x4_ptr_ = nullptr;
  int8_t *c_ptr_ = nullptr;
  int8_t *b_c16x4_batch_ = nullptr;
  int *bias_ptr_ = nullptr;
  int *input_sums_ = nullptr;
  int *weight_bias_sums_ = nullptr;
  int *weight_bias_sums_batch_ = nullptr;
  int thread_stride_ = 0;
  int thread_count_ = 0;
 };
 }  // namespace mindspore::kernel
--- a/mindspore/lite/test/run_benchmark_nets.sh
+++ b/mindspore/lite/test/run_benchmark_nets.sh
@@ -599,9 +599,9 @@ function Run_x86() {
        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile='${ms_models_path}'/'${model_name}'.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/'${model_name}'.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/'${model_name}'.ms.out --accuracyThreshold=${accuracy_limit}' >> "${run_x86_log_file}"
        export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./benchmark/benchmark --modelFile=${ms_models_path}/${model_name}_weightquant.ms --inDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/input/${model_name}.ms.bin --benchmarkDataFile=/home/workspace/mindspore_dataset/mslite/models/hiai/input_output/output/${model_name}.ms.out --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}"
        if [ $? = 0 ]; then
            run_result='x86: '${model_name}'[weight quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
            run_result='x86: '${model_name}'[weight_quant] pass'; echo ${run_result} >> ${run_benchmark_result_file}
        else
            run_result='x86: '${model_name}'[weight quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
            run_result='x86: '${model_name}'[weight_quant] failed'; echo ${run_result} >> ${run_benchmark_result_file}; return 1
        fi
    done < ${models_mindspore_weightquant_config}
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
@@ -79,58 +79,6 @@ void MMInt8TestInit(std::vector<lite::Tensor *> *inputs, std::vector<lite::Tenso
  delete[] weight_data;
 }
 // Exercises the low-level int8 matmul routine directly on a ROW x DEPTH by
 // DEPTH x COL problem: both operands are packed by hand, input sums and bias
 // are all zero, the output zero point is 0, and the result is compared with
 // the precomputed `correct` table.
 TEST_F(TestMatmulInt8, simple) {
 #define ROW 10
 #define COL 15
 #define DEPTH 10
 #define ROW4 UP_ROUND(ROW, 4)
 #define COL4 UP_ROUND(COL, 4)
 #define DEPTH16 UP_ROUND(DEPTH, 16)
  int8_t a[ROW * DEPTH] = {-3, -3, 0, -2, -4, -2, 1,  0,  -1, 0,  5,  1,  3,  4,  4,  -3, -5, 2,  -2, 4,
                           4,  5,  1, -1, 5,  5,  2,  -1, 0,  4,  -4, 2,  5,  -2, 5,  3,  -1, 2,  -4, 5,
                           -5, 4,  5, 3,  5,  4,  -2, 5,  5,  -5, -5, -5, 2,  -4, -3, 3,  -3, -5, 5,  0,
                           2,  -4, 4, 2,  -5, 3,  -1, 3,  -3, 2,  -5, -4, 0,  -5, 2,  4,  0,  -5, -1, 4,
                           3,  5,  5, 2,  -5, -5, -4, -5, 3,  3,  3,  0,  -2, 0,  -2, -3, -2, 3,  5,  -5};
  int8_t b[DEPTH * COL] = {1,  2,  -2, -5, -4, 2,  3,  2,  -5, 4,  -5, 4,  1, -2, 1,  5,  5,  5,  2,  5,  -3, -3,
                           -1, -3, -1, 0,  -4, 0,  1,  -2, -2, -3, -5, 1,  1, 0,  4,  5,  -3, -1, 4,  3,  5,  4,
                           2,  4,  -3, -4, 1,  4,  -4, 5,  -1, -2, 3,  5,  5, 2,  1,  -4, 1,  2,  -3, 0,  -2, 4,
                           -3, -3, 1,  3,  4,  -1, 3,  1,  -5, -1, 2,  0,  0, 5,  -1, -5, 5,  -5, 0,  3,  -3, 4,
                           3,  1,  -3, -3, 2,  -2, -3, -3, 3,  4,  2,  -1, 2, 0,  -2, 4,  5,  3,  -1, -3, -2, -1,
                           4,  3,  -5, 1,  0,  0,  -1, -4, -3, -2, 5,  3,  2, 1,  -4, 1,  4,  5,  -1, 2,  -2, 2,
                           1,  -2, 5,  2,  -4, -4, 1,  1,  2,  -1, -5, -4, 4, 1,  -3, 4,  -1, -4};
  int8_t correct[ROW * COL] = {
    -36, -33, 11,  4,   -12, -7,  11,  0,   37,  -30, -13, -2, -30, -3,  29,  46,  -13, -84, -8,  6,   39,  26,
    -67, -48, 57,  12,  32,  44,  -24, -85, 22,  32,  -8,  -8, 20,  10,  -45, 12,  -69, 36,  22,  -37, 58,  27,
    -24, -11, -22, -50, 26,  50,  28,  -56, -42, -23, -1,  70, -58, 54,  35,  -61, 54,  40,  -11, 35,  43,  3,
    7,   30,  -7,  -13, 73,  -3,  26,  26,  -11, -37, 0,   19, 34,  -4,  0,   -22, 71,  8,   -25, -6,  -5,  31,
    8,   63,  -25, -55, -62, -17, 23,  1,   36,  12,  -38, 2,  11,  27,  18,  5,   4,   -59, -17, 1,   25,  9,
    13,  -77, 13,  9,   -11, 26,  -52, 42,  28,  6,   44,  4,  2,   26,  19,  -31, 46,  23,  -57, 15,  -31, 39,
    40,  -9,  8,   38,  40,  27,  -19, -47, 14,  50,  14,  18, 0,   -59, 39,  -48, -47, 35};
  int8_t output[ROW * COL] = {0};
  // Packed buffers are sized with the padded dimensions and zeroed before packing.
  int8_t *a_r4x16 = new int8_t[ROW4 * DEPTH16];
  memset(a_r4x16, 0, ROW4 * DEPTH16);
  int8_t *b_c16x4 = new int8_t[COL4 * DEPTH16];
  memset(b_c16x4, 0, COL4 * DEPTH16);
  RowMajor2Row16x4MajorInt8(a, a_r4x16, ROW, DEPTH);
  RowMajor2Col16x4MajorInt8(b, DEPTH, COL, b_c16x4);
  int a_sums[ROW4] = {0};
  int bias[COL4] = {0};
  int multiplier, ls, rs;
  // Quantized multiplier / shifts derived from a real multiplier of 1.0.
  QuantizeRoundParameterWithDoublePrecision(1.0f, &multiplier, &ls, &rs);
 #ifdef ENABLE_ARM64
  MatmulInt8Neon64(a_r4x16, b_c16x4, output, ROW4, COL4, DEPTH16, a_sums, bias, INT8_MIN, INT8_MAX, 0, &multiplier, &ls,
                   &rs, ROW, COL, COL, false);
 #else
  MatMulInt8_16x4_r(a_r4x16, b_c16x4, output, ROW, COL, DEPTH16, COL, a_sums, bias, &ls, &rs, &multiplier, 0, INT8_MIN,
                    INT8_MAX, false);
 #endif
  ASSERT_EQ(0, CompareOutputData(output, correct, ROW * COL, 0.1));
  delete[] a_r4x16;
  delete[] b_c16x4;
 }
 TEST_F(TestMatmulInt8, mmtest1) {
  float in[] = {6.583835634764597,   11.337275140963907,  -4.125256949459629, 10.994337291530833,
                19.086065139532636,  3.620842999158455,   13.167624585590346, -18.326739299407755,