| @@ -367,6 +367,26 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| } | |||
| } | |||
| void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, | |||
| MATMUL_OPT_R_FUNC matmul_func) { | |||
| if (matmul_func != NULL) { | |||
| matmul_func(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); | |||
| } else { | |||
| MatMulInt8_16x4_r(packed_input, packed_weight, dst, row, col, deep16, conv_param->output_channel_, input_sum, bias, | |||
| conv_param->conv_quant_arg_.left_shift_, conv_param->conv_quant_arg_.right_shift_, | |||
| conv_param->conv_quant_arg_.quant_multiplier_, | |||
| conv_param->conv_quant_arg_.output_quant_args_[0].zp_, | |||
| conv_param->conv_quant_arg_.out_act_min_[0], conv_param->conv_quant_arg_.out_act_max_[0], | |||
| (conv_param->conv_quant_arg_.filter_arg_num_ > 1)); | |||
| } | |||
| return; | |||
| } | |||
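| /* A minimal sketch of the per-output requantization the reference path applies, assuming the | |||
|  * gemmlowp-style convention (saturating rounding doubling high multiply, then a rounding right | |||
|  * shift); the real nnacl MultiplyByQuantizedMultiplier may differ in rounding details, shift sign | |||
|  * conventions, and saturation handling. | |||
| static int32_t RequantizeSketch(int32_t acc, int32_t multiplier, int32_t left_shift, int32_t right_shift) { | |||
|   int64_t prod = (int64_t)(acc * (1 << left_shift)) * multiplier;  // pre-shift, then fixed-point multiply | |||
|   int32_t high = (int32_t)((prod + (1ll << 30)) >> 31);            // rounding doubling high half | |||
|   int32_t mask = (1 << right_shift) - 1;                           // rounding divide by 2^right_shift | |||
|   int32_t rem = high & mask; | |||
|   return (high >> right_shift) + (rem > (mask >> 1) ? 1 : 0); | |||
| } | |||
|  */ | |||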
| // int8 convolution 3x3 | |||
| void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, | |||
| int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, | |||
| @@ -25,6 +25,8 @@ | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "nnacl/winograd_utils.h" | |||
| #include "nnacl/quantization/quantize.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| typedef void (*GEMM_FUNC)(int8_t *dst, const int8_t *src, const int8_t *weight, const int32_t *bias, size_t ksize, | |||
| size_t ic4, size_t output_channel, size_t offset, const int32_t *input_sum, size_t act_min, | |||
| @@ -51,6 +53,11 @@ void ConvInt8Opt(int8_t *input_data, int8_t *packed_input, int8_t *packed_weight | |||
| int32_t *tmp_dst, int8_t *tmp_out, int8_t *output_data, int32_t *input_sum, int task_id, | |||
| ConvParameter *conv_param, GEMM_FUNC gemm_func); | |||
| // int8 convolution 1x1 | |||
| void Conv1x1Int8(const int8_t *packed_input, const int8_t *packed_weight, int8_t *dst, const int32_t *input_sum, | |||
| const int32_t *bias, int row, int col, int deep16, ConvParameter *conv_param, | |||
| MATMUL_OPT_R_FUNC matmul_func); | |||
| // int8 convolution 3x3 | |||
| void Conv3x3Int8(int16_t *input_data, int16_t *transed_weight, const int32_t *bias_data, int8_t *output_data, | |||
| int16_t *tile_buffer, int16_t *block_unit_buffer, int32_t *tmp_dst_buffer, int8_t *tmp_out, | |||
| @@ -172,73 +172,7 @@ void DeConvPackWeightSum(int8_t *weight, int32_t *weight_sum, int32_t input_zp, | |||
| void DeConvPackInputSum(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16, | |||
| bool suppport_opt) { | |||
| /* optimize normal -> same layout */ | |||
| #ifdef ENABLE_ARM64 | |||
| asm volatile( | |||
| "mov x10, %[src] \n" | |||
| "mov x11, %[dst] \n" | |||
| "dup v15.4s, %w[filter_zp] \n" | |||
| "mov x0, #0 \n" | |||
| "1: \n" | |||
| "cmp x0, %[row4] \n" | |||
| "beq 4f \n" | |||
| "add x0, x0, #4\n" | |||
| "dup v10.4s, wzr \n" | |||
| "mov x2, #0 \n" | |||
| "2: \n" | |||
| "cmp x2, %[col16] \n" | |||
| "beq 3f \n" | |||
| "add x2, x2, #16\n" | |||
| "ld1 {v0.16b}, [x10], #16\n" | |||
| "ld1 {v1.16b}, [x10], #16\n" | |||
| "ld1 {v2.16b}, [x10], #16\n" | |||
| "ld1 {v3.16b}, [x10], #16\n" | |||
| "saddlp v4.8h, v0.16b \n" | |||
| "saddlp v5.8h, v1.16b \n" | |||
| "saddlp v6.8h, v2.16b \n" | |||
| "saddlp v7.8h, v3.16b \n" | |||
| "saddlp v0.4S, v4.8h \n" | |||
| "saddlp v1.4S, v5.8h \n" | |||
| "saddlp v2.4S, v6.8h \n" | |||
| "saddlp v3.4S, v7.8h \n" | |||
| "addv s4, v0.4S \n" | |||
| "addv s5, v1.4S \n" | |||
| "addv s6, v2.4S \n" | |||
| "addv s7, v3.4S \n" | |||
| "mov v0.s[0], v4.s[0] \n" | |||
| "mov v0.s[1], v5.s[0] \n" | |||
| "mov v0.s[2], v6.s[0] \n" | |||
| "mov v0.s[3], v7.s[0] \n" | |||
| "add v10.4s, v10.4s, v0.4s \n" | |||
| "b 2b\n" | |||
| "3: \n" | |||
| "mul v10.4s, v10.4s, v15.4s \n" | |||
| "st1 {v10.4s}, [x11], #16 \n" | |||
| "beq 1b \n" | |||
| "4: \n" | |||
| : | |||
| : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) | |||
| : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); | |||
| #else | |||
| for (int r = 0; r < row4; r++) { | |||
| int32_t tmp_value = 0; | |||
| for (int c = 0; c < col16; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; | |||
| int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; | |||
| tmp_value += src[src_index]; | |||
| } | |||
| dst[r] = tmp_value * filter_zp; | |||
| } | |||
| #endif | |||
| PackInputSum16x4PerLater(src, dst, filter_zp, row4, col16); | |||
| return; | |||
| } | |||
| @@ -28,6 +28,19 @@ void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) | |||
| } | |||
| } | |||
| void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) { | |||
| int col16 = UP_ROUND(col, C16NUM); | |||
| for (int r = 0; r < row; r++) { | |||
| int rd4 = r / C4NUM; | |||
| int rm4 = r % C4NUM; | |||
| for (int c = 0; c < col; c++) { | |||
| int cd16 = c / C16NUM; | |||
| int cm16 = c % C16NUM; | |||
| dst_ptr[rd4 * col16 * C4NUM + cd16 * C16NUM * C4NUM + rm4 * C16NUM + cm16] = src_ptr[r * col + c]; | |||
| } | |||
| } | |||
| } | |||
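| /* Layout sketch for RowMajor2Row4x16MajorInt8: rows are tiled in blocks of C4NUM = 4 and columns | |||
|  * in blocks of C16NUM = 16, so source element (r, c) lands at | |||
|  *   dst[(r / 4) * col16 * 4 + (c / 16) * 64 + (r % 4) * 16 + (c % 16)]. | |||
|  * Worked example with col = 20 (so col16 = 32): (r, c) = (5, 17) maps to | |||
|  *   1 * 32 * 4 + 1 * 64 + 1 * 16 + 1 = 209. | |||
|  * Columns in [col, col16) are never written, which is why callers memset the buffer first. */ | |||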
| void MatrixPack4x16UnitInt8(int8_t *src, int8_t *dst, int row, int col, int stride) { | |||
| for (int r = 0; r < row; r++) { | |||
| int8_t *src_r = src + r * stride; | |||
| @@ -145,7 +158,38 @@ void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int | |||
| return; | |||
| } | |||
| #ifdef ENABLE_ARM64 | |||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| bool per_channel) { | |||
| /* row4x16-major * row16x4-major => (int8)row-major : per-tensor or per-channel requantization */ | |||
| for (int r = 0; r < row; r++) { | |||
| for (int c = 0; c < col; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM; | |||
| int c4div = c / C4NUM, c4mod = c % C4NUM; | |||
| size_t ci = r * stride + c; | |||
| int32_t value = 0; | |||
| for (int d = 0; d < deep_16; d++) { | |||
| int d16div = d / C16NUM, d16mod = d % C16NUM; | |||
| size_t ai = r4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + r4mod * C16NUM + d16mod; | |||
| size_t bi = c4div * deep_16 * C4NUM + d16div * C4NUM * C16NUM + c4mod * C16NUM + d16mod; | |||
| value = value + a[ai] * b[bi]; | |||
| } | |||
| int32_t cur_input_sum = per_channel ? input_sum[c4div * C4NUM * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod] : input_sum[r]; | |||
| value -= cur_input_sum; | |||
| value += bias[c]; | |||
| int32_t cur_left_shift = per_channel ? left_shift[c] : left_shift[0]; | |||
| int32_t cur_right_shift = per_channel ? right_shift[c] : right_shift[0]; | |||
| int32_t cur_multiplier = per_channel ? multiplier[c] : multiplier[0]; | |||
| value = MultiplyByQuantizedMultiplier(value, cur_multiplier, cur_left_shift, cur_right_shift) + output_zp; | |||
| value = MSMIN(maxi, value); | |||
| value = MSMAX(mini, value); | |||
| dst[ci] = (int8_t)value; | |||
| } | |||
| } | |||
| return; | |||
| } | |||
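| /* Operand layout notes (as used by Conv1x1Int8): `a` is the activation packed by | |||
|  * RowMajor2Row16x4MajorInt8, `b` is the weight packed by RowMajor2Row4x16MajorInt8, dst is plain | |||
|  * row-major with leading dimension `stride`, and `input_sum` must use the layout written by | |||
|  * PackInputSum16x4Int8: input_sum[r] in the per-tensor case, or the oc4-blocked index | |||
|  * c4div * C4NUM * UP_ROUND(row, C4NUM) + r * C4NUM + c4mod in the per-channel case. */ | |||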
| void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16) { | |||
| int stride = sizeof(int8_t) * 16 * 4; | |||
| for (int r = 0; r < row; ++r) { | |||
| @@ -201,4 +245,3 @@ void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow) | |||
| } | |||
| } | |||
| } | |||
| #endif | |||
| @@ -28,17 +28,22 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int *c, const int row8, const | |||
| const int a_zp, const int b_zp); | |||
| void MatMulInt8_16x4(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | |||
| const int *input_sum, const int *bias); | |||
| void MatMulInt8_16x4_r(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, int32_t maxi, | |||
| bool per_channel); | |||
| void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Row4x16MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col); | |||
| void RowMajor2Row16x4MajorInt8(void *src_ptr, void *dst_ptr, int row, int col); | |||
| #ifdef ENABLE_ARM64 | |||
| void RowMajor2Row4x16Major(int8_t *src, int row, int col, int8_t *dst, int col_16); | |||
| void RowMajor2Col16x4Major(int8_t *src, int row, int col, int8_t *dst, int row_16); | |||
| void RowMajor2Asums(int8_t *a, int row, int col, int b_zp, int *dst); | |||
| void RowMajor2Bbias(int8_t *b, int row, int col, int a_zp, int b_zp, int *bias, int *dst); | |||
| void Row4x4Major2RowMajor(int8_t *src, int row4, int8_t *dst, int row, int cow); | |||
| #ifdef ENABLE_ARM64 | |||
| // bias = bias + depth * a_zp * b_zp - a_zp * b_sums | |||
| void MatmulInt8Neon64(const int8_t *a, const int8_t *b, int8_t *dst, int row4, int col4, int deep16, const int *a_sums, | |||
| const int *bias, int act_min, int act_max, int out_zp, int multiplier, int left_shift, | |||
| @@ -22,6 +22,11 @@ | |||
| typedef void (*MATMUL_OPT_R4_FUNC)(const int8_t *a, const int8_t *b, int *dst, int row_4, int col_4, int deep_16, | |||
| const int *input_sum, const int *bias); | |||
| typedef void (*MATMUL_OPT_R_FUNC)(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, | |||
| int32_t maxi, bool per_channel); | |||
| typedef void (*MAT_TRANS_FUNC)(void *dst, void *a, int row, int col); | |||
| typedef enum ActType { ActType_No, ActType_Relu, ActType_Relu6 } ActType; | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include <stdlib.h> | |||
| #include <stdbool.h> | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -45,4 +46,11 @@ void MatMulR4Int8_optimize_handler(const int8_t *a, const int8_t *b, int *dst, i | |||
| const int *input_sum, const int *bias) { | |||
| return MatMulOptR4Int8Neon64(a, b, dst, row4, col4, deep16, input_sum, bias); | |||
| } | |||
| void MatMulRInt8_optimize_handler(const int8_t *a, const int8_t *b, int8_t *dst, size_t row, size_t col, size_t deep_16, | |||
| size_t stride, const int32_t *input_sum, const int32_t *bias, int32_t *left_shift, | |||
| int32_t *right_shift, int32_t *multiplier, int32_t output_zp, int32_t mini, | |||
| int32_t maxi, bool per_channel) { | |||
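|   /* The optimized assembly kernel is not implemented yet: this symbol is an empty stub, which is | |||
|    * why Convolution1x1Int8CPUKernel::CheckSupportOptimize still forces the portable C kernel | |||
|    * MatMulInt8_16x4_r. */ | |||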
| return; | |||
| } | |||
| #endif | |||
| @@ -153,22 +153,24 @@ void PackWeightInt8Opt(int8_t *weight_data, ConvParameter *conv_param, int8_t *p | |||
| } // kernel plane loop | |||
| } | |||
| void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param) { | |||
| void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size) { | |||
| /* support nhwc */ | |||
| char *src = (char *)src_ptr; | |||
| char *dst = (char *)dst_ptr; | |||
| for (int dst_h = 0; dst_h < conv_param->output_h_; dst_h++) { | |||
| int src_h = dst_h * conv_param->stride_h_ - conv_param->pad_h_; | |||
| if (src_h < 0 || src_h >= conv_param->input_h_) { | |||
| continue; | |||
| } | |||
| const float *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_; | |||
| float *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_; | |||
| const char *src_h_ptr = src + src_h * conv_param->input_w_ * conv_param->input_channel_ * data_size; | |||
| char *dst_h_ptr = dst + dst_h * conv_param->output_w_ * conv_param->input_channel_ * data_size; | |||
| for (int dst_w = 0; dst_w < conv_param->output_w_; dst_w++) { | |||
| int src_w = dst_w * conv_param->stride_w_ - conv_param->pad_w_; | |||
| if (src_w < 0 || src_w >= conv_param->input_w_) { | |||
| continue; | |||
| } | |||
| memcpy(dst_h_ptr + dst_w * conv_param->input_channel_, src_h_ptr + src_w * conv_param->input_channel_, | |||
| conv_param->input_channel_ * sizeof(float)); | |||
| memcpy(dst_h_ptr + dst_w * conv_param->input_channel_ * data_size, | |||
| src_h_ptr + src_w * conv_param->input_channel_ * data_size, conv_param->input_channel_ * data_size); | |||
| } | |||
| } | |||
| return; | |||
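| /* Conv1x1InputPack is element-type agnostic: it moves input_channel_ * data_size bytes per output | |||
|  * pixel, so the one routine serves both fp32 and int8 callers, e.g. | |||
|  *   Conv1x1InputPack(float_src, float_dst, conv_param, sizeof(float)); | |||
|  *   Conv1x1InputPack(int8_src, int8_dst, conv_param, sizeof(int8_t)); | |||
|  * Pixels whose source coordinate falls in the padding are skipped rather than written, so dst | |||
|  * must be zero-filled beforehand (the unit tests initialize their out[] buffers to 0). */ | |||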
| @@ -188,6 +190,105 @@ void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParam | |||
| return; | |||
| } | |||
| void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16) { | |||
| /* optimize normal -> same layout */ | |||
| #ifdef ENABLE_ARM64 | |||
| asm volatile( | |||
| "mov x10, %[src] \n" | |||
| "mov x11, %[dst] \n" | |||
| "dup v15.4s, %w[filter_zp] \n" | |||
| "mov x0, #0 \n" | |||
| "1: \n" | |||
| "cmp x0, %[row4] \n" | |||
| "beq 4f \n" | |||
| "add x0, x0, #4\n" | |||
| "dup v10.4s, wzr \n" | |||
| "mov x2, #0 \n" | |||
| "2: \n" | |||
| "cmp x2, %[col16] \n" | |||
| "beq 3f \n" | |||
| "add x2, x2, #16\n" | |||
| "ld1 {v0.16b}, [x10], #16\n" | |||
| "ld1 {v1.16b}, [x10], #16\n" | |||
| "ld1 {v2.16b}, [x10], #16\n" | |||
| "ld1 {v3.16b}, [x10], #16\n" | |||
| "saddlp v4.8h, v0.16b \n" | |||
| "saddlp v5.8h, v1.16b \n" | |||
| "saddlp v6.8h, v2.16b \n" | |||
| "saddlp v7.8h, v3.16b \n" | |||
| "saddlp v0.4S, v4.8h \n" | |||
| "saddlp v1.4S, v5.8h \n" | |||
| "saddlp v2.4S, v6.8h \n" | |||
| "saddlp v3.4S, v7.8h \n" | |||
| "addv s4, v0.4S \n" | |||
| "addv s5, v1.4S \n" | |||
| "addv s6, v2.4S \n" | |||
| "addv s7, v3.4S \n" | |||
| "mov v0.s[0], v4.s[0] \n" | |||
| "mov v0.s[1], v5.s[0] \n" | |||
| "mov v0.s[2], v6.s[0] \n" | |||
| "mov v0.s[3], v7.s[0] \n" | |||
| "add v10.4s, v10.4s, v0.4s \n" | |||
| "b 2b\n" | |||
| "3: \n" | |||
| "mul v10.4s, v10.4s, v15.4s \n" | |||
| "st1 {v10.4s}, [x11], #16 \n" | |||
| "beq 1b \n" | |||
| "4: \n" | |||
| : | |||
| : [ dst ] "r"(dst), [ src ] "r"(src), [ row4 ] "r"(row4), [ col16 ] "r"(col16), [ filter_zp ] "r"(filter_zp) | |||
| : "x0", "x1", "x2", "x3", "x10", "x11", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v10", "v15"); | |||
| #else | |||
| for (int r = 0; r < row4; r++) { | |||
| int32_t tmp_value = 0; | |||
| for (int c = 0; c < col16; c++) { | |||
| int r4div = r / C4NUM, r4mod = r % C4NUM, c16div = c / C16NUM, c16mod = c % C16NUM; | |||
| int src_index = r4div * C4NUM * col16 + c16div * C16NUM * C4NUM + r4mod * C16NUM + c16mod; | |||
| tmp_value += src[src_index]; | |||
| } | |||
| dst[r] = tmp_value * filter_zp; | |||
| } | |||
| #endif | |||
| return; | |||
| } | |||
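| /* Why the row sums are scaled by filter_zp: expanding the quantized dot product gives | |||
|  *   sum_d (a_d - za) * (w_d - zw) = sum_d a_d * w_d - zw * sum_d a_d - za * sum_d w_d + D * za * zw, | |||
|  * and PackInputSum16x4PerLater precomputes the zw * sum_d a_d term (filter_zp times each row's | |||
|  * activation sum). The two za terms are folded into the bias by | |||
|  * Convolution1x1Int8CPUKernel::InitWeightBias. */ | |||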
| void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, | |||
| size_t plane_size, ConvParameter *conv_param) { | |||
| size_t hw4 = UP_ROUND(plane_size, C4NUM); | |||
| size_t ic16 = UP_ROUND(input_channel, C16NUM); | |||
| if (conv_param->conv_quant_arg_.filter_arg_num_ == 1) { | |||
| PackInputSum16x4PerLater(input_value, input_sum, conv_param->conv_quant_arg_.filter_quant_args_[0].zp_, hw4, ic16); | |||
| } else { | |||
| for (int ri = 0; ri < plane_size; ri++) { | |||
| int ri4div = ri / C4NUM, ri4mod = ri % C4NUM; | |||
| for (int ci = 0; ci < output_channel; ci++) { | |||
| int32_t tmp_sum_value = 0; | |||
| int ci4div = ci / C4NUM, ci4mod = ci % C4NUM; | |||
| int32_t filter_zp = conv_param->conv_quant_arg_.filter_quant_args_[ci].zp_; | |||
| for (int di = 0; di < input_channel; di++) { | |||
| size_t di16div = di / C16NUM, di16mod = di % C16NUM; | |||
| int src_index = ri4div * C4NUM * ic16 + di16div * C16NUM * C4NUM + ri4mod * C16NUM + di16mod; | |||
| tmp_sum_value += input_value[src_index]; | |||
| } | |||
| int dst_index = ci4div * C4NUM * hw4 + ri * C4NUM + ci4mod; | |||
| input_sum[dst_index] = tmp_sum_value * filter_zp; | |||
| } | |||
| } | |||
| } | |||
| return; | |||
| } | |||
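| /* Index sketch for the per-channel branch: with plane_size = 6 (hw4 = 8) and output_channel = 5, | |||
|  * the sum for pixel ri = 2, channel ci = 4 (ci4div = 1, ci4mod = 0) is stored at | |||
|  *   1 * 4 * 8 + 2 * 4 + 0 = 40, | |||
|  * i.e. oc4-block-major, then pixel, then channel-within-block, which is exactly the index | |||
|  * MatMulInt8_16x4_r reads back in its per_channel path. */ | |||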
| void Im2ColPackUnitFp32(const float *input_data, ConvParameter *conv_param, float *packed_input, int real_cal_num, | |||
| int block_index) { | |||
| // input format : nhwc | |||
| @@ -35,10 +35,15 @@ void Im2ColPackUnitInt8(const int8_t *input_data, int8_t *packed_input, int real | |||
| void Im2ColPackUnitInt8Opt(const int8_t *input_data, int8_t *packed_input, int real_cal_num, int block_index, | |||
| int32_t *input_sum, ConvParameter *conv_param); | |||
| void Conv1x1InputPackFp32(const float *src, float *dst, ConvParameter *conv_param); | |||
| void PackInputSum16x4PerLater(const int8_t *src, int32_t *dst, int32_t filter_zp, size_t row4, size_t col16); | |||
| void Conv1x1InputPack(const void *src_ptr, void *dst_ptr, ConvParameter *conv_param, int data_size); | |||
| void Pack1x1WeightFp32(const float *weight_data, float *packed_weight, ConvParameter *conv_param); | |||
| void PackInputSum16x4Int8(int8_t *input_value, int32_t *input_sum, size_t input_channel, size_t output_channel, | |||
| size_t plane_size, ConvParameter *conv_param); | |||
| void MatrixPack(const float *src, float *dst, int row, int ic4, int stride); | |||
| void PackInputToC8Int8(const int8_t *input_data, int16_t *packed_input, ConvParameter *conv_param); | |||
| @@ -118,10 +118,13 @@ int ConvolutionBaseCPUKernel::CheckLayout(lite::tensor::Tensor *input_tensor) { | |||
| } | |||
| int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| uint8_t per_channel = 0b0; | |||
| if (conv_quant_arg_->input_arg_num_ != kPerTensor) { | |||
| int in_channel = conv_param_->input_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->input_arg_num_) != in_channel) { | |||
| if (static_cast<int>(conv_quant_arg_->input_arg_num_) != input_channel) { | |||
| MS_LOG(ERROR) << "input per channel quant param length is not equal to input channel."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -129,8 +132,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| } | |||
| if (conv_quant_arg_->filter_arg_num_ != kPerTensor) { | |||
| int filter_num = conv_param_->output_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != filter_num) { | |||
| if (static_cast<int>(conv_quant_arg_->filter_arg_num_) != output_channel) { | |||
| MS_LOG(ERROR) << "weight per channel quant param length is not equal to filter num."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -138,8 +140,7 @@ int ConvolutionBaseCPUKernel::SetIfPerChannel() { | |||
| } | |||
| if (conv_quant_arg_->output_arg_num_ != kPerTensor) { | |||
| int out_channel = conv_param_->output_channel_; | |||
| if (static_cast<int>(conv_quant_arg_->output_arg_num_) != out_channel) { | |||
| if (static_cast<int>(conv_quant_arg_->output_arg_num_) != output_channel) { | |||
| MS_LOG(ERROR) << "output per channel quant param length is not equal to output channel."; | |||
| return RET_ERROR; | |||
| } | |||
| @@ -113,7 +113,7 @@ void Convolution1x1CPUKernel::Pre1x1Trans(float *src_input, float *src_output) { | |||
| output_ptr_ = src_output; | |||
| if (pre_trans_input_) { | |||
| Conv1x1InputPackFp32(src_input, input_ptr_, conv_param_); | |||
| Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(float)); | |||
| } else { | |||
| input_ptr_ = src_input; | |||
| } | |||
| @@ -0,0 +1,270 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| Convolution1x1Int8CPUKernel::~Convolution1x1Int8CPUKernel() { | |||
| if (matmul_param_ != nullptr) { | |||
| delete matmul_param_; | |||
| matmul_param_ = nullptr; | |||
| } | |||
| if (packed_weight_ != nullptr) { | |||
| delete packed_weight_; | |||
| packed_weight_ = nullptr; | |||
| } | |||
| FreeResizeBuf(); | |||
| FreeQuantParam(); | |||
| } | |||
| void Convolution1x1Int8CPUKernel::FreeResizeBuf() { | |||
| if (packed_input_ != nullptr) { | |||
| free(packed_input_); | |||
| packed_input_ = nullptr; | |||
| } | |||
| if (input_sum_ != nullptr) { | |||
| free(input_sum_); | |||
| input_sum_ = nullptr; | |||
| } | |||
| return; | |||
| } | |||
| void Convolution1x1Int8CPUKernel::CheckSupportOptimize() { | |||
| support_optimize_ = false; | |||
| matmul_func_ = MatMulInt8_16x4_r; | |||
| #ifdef ENABLE_ARM64 | |||
| void *optimize_op_handler = OptimizeModule::GetInstance()->optimized_op_handler_; | |||
| if (optimize_op_handler != nullptr) { | |||
| dlerror(); | |||
| *(reinterpret_cast<void **>(&matmul_func_)) = dlsym(optimize_op_handler, "MatMulRInt8_optimize_handler"); | |||
| auto dlopen_error = dlerror(); | |||
| if (dlopen_error != nullptr) { | |||
| MS_LOG(ERROR) << "load matmul func failed! " << dlopen_error << "."; | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } else { | |||
| support_optimize_ = true; | |||
| } | |||
| } else { | |||
| support_optimize_ = false; | |||
| matmul_func_ = nullptr; | |||
| } | |||
| #endif | |||
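|   /* MatMulRInt8_optimize_handler is currently an empty stub in opt_op_handler.c, so force the | |||
|    * portable C kernel until the assembly implementation lands. */ | |||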
| matmul_func_ = MatMulInt8_16x4_r; | |||
| return; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::InitWeightBias() { | |||
| auto filter_tensor = in_tensors_.at(kWeightIndex); | |||
| auto input_channel = filter_tensor->Channel(); | |||
| auto output_channel = filter_tensor->Batch(); | |||
| /* weight */ | |||
| size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t); | |||
| packed_weight_ = reinterpret_cast<int8_t *>(malloc(size)); | |||
| if (packed_weight_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc weight error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_weight_, 0, size); | |||
| RowMajor2Row4x16MajorInt8(reinterpret_cast<int8_t *>(filter_tensor->Data()), packed_weight_, output_channel, | |||
| input_channel); | |||
| /* bias = bias - v2 x zp1 + zp1 x zp2 */ | |||
| int col4 = UP_ROUND(output_channel, C4NUM); | |||
| bias_data_ = malloc(col4 * sizeof(int32_t)); | |||
| if (bias_data_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc bias_ptr_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(bias_data_, 0, col4 * sizeof(int32_t)); | |||
| if (in_tensors_.size() == 3) { | |||
| memcpy(bias_data_, in_tensors_[kBiasIndex]->Data(), output_channel * sizeof(int32_t)); | |||
| } | |||
| int32_t *bias_data = reinterpret_cast<int32_t *>(bias_data_); | |||
| int8_t *weight = reinterpret_cast<int8_t *>(filter_tensor->Data()); | |||
| int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_; | |||
| for (int oc = 0; oc < output_channel; oc++) { | |||
| int32_t weight_sum_value = 0; | |||
| int32_t filter_zp = (conv_param_->conv_quant_arg_.filter_arg_num_ == 1) | |||
| ? conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ | |||
| : conv_param_->conv_quant_arg_.filter_quant_args_[oc].zp_; | |||
| for (int ic = 0; ic < input_channel; ic++) { | |||
| weight_sum_value += weight[oc * input_channel + ic]; | |||
| } | |||
| bias_data[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp; | |||
| } | |||
| return RET_OK; | |||
| } | |||
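| /* Worked check of InitWeightBias's bias fold with hypothetical numbers: D = 2, za = 3, zw = 1, | |||
|  * a = {5, 7}, w = {2, 4}. True accumulation: (5-3)(2-1) + (7-3)(4-1) = 14. The kernel computes | |||
|  * the raw sum 5*2 + 7*4 = 38, subtracts input_sum = zw * (5+7) = 12, and adds the folded term | |||
|  * zw*za*D - za*(2+4) = 6 - 18 = -12, giving 38 - 12 - 12 = 14, as required. */ | |||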
| int Convolution1x1Int8CPUKernel::Init() { | |||
| if (!InferShapeDone()) { | |||
| return RET_OK; | |||
| } | |||
| matmul_param_ = new (std::nothrow) MatMulParameter(); | |||
| if (matmul_param_ == nullptr) { | |||
| MS_LOG(ERROR) << "Init matmul_param_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| CheckSupportOptimize(); | |||
| auto ret = SetQuantParam(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Set quant param failed."; | |||
| return ret; | |||
| } | |||
| ret = InitWeightBias(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Init weight bias failed."; | |||
| return ret; | |||
| } | |||
| return ReSize(); | |||
| } | |||
| int Convolution1x1Int8CPUKernel::InitParam() { | |||
| pre_trans_input_ = (conv_param_->pad_h_ != 0 || conv_param_->pad_w_ != 0 || conv_param_->stride_h_ != 1 || | |||
| conv_param_->stride_w_ != 1); | |||
| matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_; | |||
| matmul_param_->deep_ = conv_param_->input_channel_; | |||
| matmul_param_->col_ = conv_param_->output_channel_; | |||
| thread_count_ = MSMIN(op_parameter_->thread_num_, UP_DIV(matmul_param_->col_, C4NUM)); | |||
| thread_stride_ = UP_DIV(UP_DIV(matmul_param_->col_, C4NUM), thread_count_); | |||
| size_t size = UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM); | |||
| packed_input_ = reinterpret_cast<int8_t *>(malloc(size * sizeof(int8_t))); | |||
| if (packed_input_ == nullptr) { | |||
| MS_LOG(ERROR) << "conv1x1 int8 Malloc packed_input_ error!"; | |||
| return RET_ERROR; | |||
| } | |||
| memset(packed_input_, 0, size * sizeof(int8_t)); | |||
| if (conv_quant_arg_->per_channel_ & FILTER_PER_CHANNEL) { | |||
| size = UP_ROUND(conv_param_->output_channel_, C4NUM) * UP_ROUND(matmul_param_->row_, C4NUM); | |||
| } else { | |||
| size = UP_ROUND(matmul_param_->row_, C4NUM); | |||
| } | |||
| input_sum_ = reinterpret_cast<int32_t *>(malloc(size * sizeof(int32_t))); | |||
| if (input_sum_ == nullptr) { | |||
| MS_LOG(ERROR) << "malloc input_sum_ failed."; | |||
| return RET_ERROR; | |||
| } | |||
| memset(input_sum_, 0, size * sizeof(int32_t)); | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::ReSize() { | |||
| FreeResizeBuf(); | |||
| ConvolutionBaseCPUKernel::Init(); | |||
| int error_code = InitParam(); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "Convolution base init failed."; | |||
| return error_code; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void Convolution1x1Int8CPUKernel::Pre1x1Trans(int8_t *src_input, int8_t *src_output) { | |||
| output_ptr_ = src_output; | |||
| if (pre_trans_input_) { | |||
| Conv1x1InputPack(src_input, input_ptr_, conv_param_, sizeof(int8_t)); | |||
| } else { | |||
| input_ptr_ = src_input; | |||
| } | |||
| RowMajor2Row16x4MajorInt8(input_ptr_, packed_input_, matmul_param_->row_, matmul_param_->deep_); | |||
| return; | |||
| } | |||
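| /* Input preparation is two-stage: Conv1x1InputPack first gathers the strided/padded NHWC pixels | |||
|  * into a dense row-major row_ x deep_ buffer (skipped when stride is 1 and padding is 0), then | |||
|  * RowMajor2Row16x4MajorInt8 re-tiles that buffer into the 16x4-blocked layout the matmul expects. */ | |||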
| int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { | |||
| int cur_oc = MSMIN(thread_stride_ * C4NUM, matmul_param_->col_ - task_id * thread_stride_ * C4NUM); | |||
| if (cur_oc <= 0) { | |||
| return RET_OK; | |||
| } | |||
| int32_t *bias = reinterpret_cast<int32_t *>(bias_data_) + thread_stride_ * C4NUM * task_id; | |||
| Conv1x1Int8(packed_input_, packed_weight_ + task_id * thread_stride_ * C4NUM * UP_ROUND(matmul_param_->deep_, C16NUM), | |||
| output_ptr_ + task_id * thread_stride_ * C4NUM, input_sum_, bias, | |||
| matmul_param_->row_, cur_oc, UP_ROUND(matmul_param_->deep_, C16NUM), conv_param_, matmul_func_); | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { | |||
| auto conv = reinterpret_cast<Convolution1x1Int8CPUKernel *>(cdata); | |||
| auto error_code = conv->RunImpl(task_id); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "conv1x1 Int8 Run error task_id[" << task_id << "] error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int Convolution1x1Int8CPUKernel::Run() { | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return RET_ERROR; | |||
| } | |||
| if (pre_trans_input_) { | |||
| input_ptr_ = | |||
| reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t))); | |||
| if (input_ptr_ == nullptr) { | |||
| MS_LOG(ERROR) << "Conv1x1 int8 Malloc input_ptr_ error!"; | |||
| return RET_MEMORY_FAILED; | |||
| } | |||
| } | |||
| int8_t *src_in = reinterpret_cast<int8_t *>(in_tensors_[0]->Data()); | |||
| int8_t *src_out = reinterpret_cast<int8_t *>(out_tensors_[0]->Data()); | |||
| for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { | |||
| Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, | |||
| src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); | |||
| PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_, | |||
| conv_param_); | |||
| int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| if (pre_trans_input_ && input_ptr_ != nullptr) { | |||
| ctx_->allocator->Free(input_ptr_); | |||
| input_ptr_ = nullptr; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,68 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "include/errorcode.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "nnacl/int8/conv_int8.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "nnacl/optimized_kernel.h" | |||
| namespace mindspore::kernel { | |||
| class Convolution1x1Int8CPUKernel : public ConvolutionBaseCPUKernel { | |||
| public: | |||
| Convolution1x1Int8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : ConvolutionBaseCPUKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~Convolution1x1Int8CPUKernel() override; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| public: | |||
| int RunImpl(int task_id); | |||
| private: | |||
| void FreeResizeBuf(); | |||
| int InitParam(); | |||
| int InitWeightBias(); | |||
| void Pre1x1Trans(int8_t *src_input, int8_t *src_output); | |||
| void CheckSupportOptimize(); | |||
| private: | |||
| int32_t *input_sum_ = nullptr; /* per-channel: oc4 format */ | |||
| int8_t *packed_weight_ = nullptr; | |||
| int8_t *packed_input_ = nullptr; | |||
| int8_t *input_ptr_ = nullptr; | |||
| int8_t *output_ptr_ = nullptr; | |||
| size_t thread_count_ = 1; | |||
| size_t thread_stride_ = 0; | |||
| bool pre_trans_input_ = false; | |||
| MatMulParameter *matmul_param_ = nullptr; | |||
| MATMUL_OPT_R_FUNC matmul_func_ = nullptr; | |||
| bool support_optimize_ = false; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONVOLUTION_1x1_INT8_H_ | |||
| @@ -16,6 +16,7 @@ | |||
| #include "src/runtime/kernel/arm/int8/convolution_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_3x3_int8.h" | |||
| #include "src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| #include "nnacl/int8/conv_int8.h" | |||
| #include "src/runtime/kernel/arm/base/layout_transform.h" | |||
| #include "schema/model_generated.h" | |||
| @@ -400,6 +401,9 @@ kernel::LiteKernel *CpuConvInt8KernelCreator(const std::vector<lite::tensor::Ten | |||
| kernel::LiteKernel *kernel; | |||
| if (kernel_h == 3 && kernel_w == 3 && stride_h == 1 && stride_w == 1 && dilation_h == 1 && dilation_w == 1) { | |||
| kernel = new (std::nothrow) kernel::Convolution3x3Int8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else if (kernel_h == 1 && kernel_w == 1) { | |||
| /* Convolution1x1Int8CPUKernel is not enabled yet; fall back to the generic int8 kernel for 1x1 */ | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } else { | |||
| kernel = new (std::nothrow) kernel::ConvolutionInt8CPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| } | |||
| @@ -54,7 +54,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack1) { | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||
| float out[20] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 20)); | |||
| delete conv_param; | |||
| } | |||
| @@ -95,7 +95,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack2) { | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| float out[28] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| CompareOutputData(out, correct, 28, 0.0001); | |||
| delete conv_param; | |||
| } | |||
| @@ -114,7 +114,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack3) { | |||
| float correct[] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 17.025112, | |||
| -5.052577, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 18)); | |||
| delete conv_param; | |||
| } | |||
| @@ -136,7 +136,7 @@ TEST_F(TestConv1x1Fp32, Input1x1PrePack4) { | |||
| -1.770, 41.903, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, | |||
| 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; | |||
| float out[54] = {0}; | |||
| Conv1x1InputPackFp32(in, out, conv_param); | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(float)); | |||
| EXPECT_EQ(0, lite::CompareOutputData(out, correct, 54)); | |||
| delete conv_param; | |||
| } | |||
| @@ -0,0 +1,281 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "mindspore/lite/src/lite_kernel.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "nnacl/quantization/quantize.h" | |||
| #include "nnacl/common_func.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.h" | |||
| namespace mindspore { | |||
| using lite::tensor::Tensor; | |||
| class TestConv1x1Int8 : public mindspore::CommonTest { | |||
| public: | |||
| TestConv1x1Int8() {} | |||
| }; | |||
| TEST_F(TestConv1x1Int8, Input1x1PrePack1) { | |||
| auto conv_param = new ConvParameter(); | |||
| conv_param->input_channel_ = 6; | |||
| conv_param->input_h_ = conv_param->input_w_ = 3; | |||
| conv_param->output_h_ = conv_param->output_w_ = 3; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 2; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 1; | |||
| int8_t in[] = {4, 13, -3, 16, 19, 8, 19, -6, -2, -9, 9, 18, 23, 8, 47, -14, 15, 4, | |||
| -0, 37, -0, 6, 0, -1, 37, 13, 11, 1, -1, 41, 9, 14, 3, 0, 8, 9, | |||
| 14, -14, -8, -8, -8, 7, 19, 17, 13, 3, 9, 18, -1, -0, 18, 0, 4, -2}; | |||
| int8_t correct[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 37, 13, 11, | |||
| 1, -1, 41, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |||
| int8_t out[54] = {0}; | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | |||
| CompareOutputData(out, correct, 54, 0); | |||
| delete conv_param; | |||
| } | |||
| TEST_F(TestConv1x1Int8, Input1x1PrePack2) { | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t in[] = {-0, -0, -7, -0, -6, 4, 9, 9, 12, -0, 6, 2, 13, 15, 16, -7, 9, 1, 10, 13, 17, 17, 4, 13, | |||
| -6, 5, 7, -7, 15, 0, 1, -5, -7, 18, 15, 19, -7, 13, 7, -0, 16, -5, 16, -7, 6, 10, -5, 10, | |||
| 9, 12, -9, -8, -4, 18, -5, 0, 7, 12, 13, 16, -9, -4, 18, -0, 8, 6, 2, 10, 16, 1, -1, 2, | |||
| 9, 8, 9, 13, 7, -0, 15, -7, 0, -0, 17, 19, 9, 17, -6, -2, 7, -0, 10, -6, -6, 18, -0, 9, | |||
| 9, 6, 3, -1, -8, 10, 17, -9, 17, 6, -3, 7, -2, -0, -9, 1, -3, 15, 13, 4, 18}; | |||
| int8_t correct[] = {0, 0, 0, 0, 0, 0, 15, -7, -7, 0, 0, 0, 9, 7, 0, 0, 0, 0, 0, 0}; | |||
| conv_param->input_h_ = 9; | |||
| conv_param->input_w_ = 13; | |||
| conv_param->input_channel_ = 1; | |||
| conv_param->output_h_ = 4; | |||
| conv_param->output_w_ = 5; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 4; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 2; | |||
| int8_t out[20] = {0}; | |||
| Conv1x1InputPack(in, out, conv_param, sizeof(int8_t)); | |||
| CompareOutputData(out, correct, 20, 0); | |||
| delete conv_param; | |||
| } | |||
| int Conv1x1Int8TestInit1_perchannel(std::vector<lite::tensor::Tensor *> *inputs_, | |||
| std::vector<lite::tensor::Tensor *> *outputs_, ConvParameter *conv_param, | |||
| int8_t **correct) { | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| int8_t in[] = {62, -14, 88, 2, -35, 43, 83, -111, 75, 26, 14, -121, | |||
| -78, 56, 37, -31, 15, -75, -10, -115, -71, 74, -65, -15}; | |||
| memcpy(in_t->Data(), in, in_t->ElementsNum() * sizeof(int8_t)); | |||
| inputs_->push_back(in_t); | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| weight_t->MallocData(); | |||
| auto weight_quant_arg1 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg1->zeroPoint = 66, weight_quant_arg1->scale = 0.96439215686275; | |||
| auto weight_quant_arg2 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg2->zeroPoint = 33, weight_quant_arg2->scale = 0.76439215686275; | |||
| auto weight_quant_arg3 = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg3->zeroPoint = -20, weight_quant_arg3->scale = 0.99117647; | |||
| weight_t->AddQuantParam(*weight_quant_arg1); | |||
| weight_t->AddQuantParam(*weight_quant_arg2); | |||
| weight_t->AddQuantParam(*weight_quant_arg3); | |||
| int8_t weight[] = {65, 67, 65, 65, 32, 33, 34, 33, -19, -20, -19, -20}; | |||
| memcpy(weight_t->Data(), weight, weight_t->ElementsNum() * sizeof(int8_t)); | |||
| inputs_->push_back(weight_t); | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.294321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| int8_t nchw_co[] = {-83, 34, 100, 10, 113, 55, 3, 16, 63, 6, 93, 20, 5, 6, 42, 35, 28, -24}; | |||
| memcpy(*correct, nchw_co, out_t->ElementsNum() * sizeof(int8_t)); | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1TestPerChannel) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit1_perchannel(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 70); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| int Conv1x1Int8TestInit1(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, | |||
| ConvParameter *conv_param, int8_t **correct) { | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| float in[] = {12.216284, 3.3466918, 15.327419, 5.234958, 0.804376, 9.952188, 14.727955, -8.080715, | |||
| 13.71383, 8.055829, 6.5845337, -9.25232, -4.24519, 11.550042, 9.262012, 1.2780352, | |||
| 6.7263746, -3.9301445, 3.764492, -8.602078, -3.3558068, 13.619035, -2.6694393, 3.2008505}; | |||
| Quantize(in, in_t->ElementsNum(), in_quant_arg->scale, in_quant_arg->zeroPoint, | |||
| reinterpret_cast<int8_t *>(in_t->Data())); | |||
| inputs_->push_back(in_t); | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; | |||
| weight_t->AddQuantParam(*weight_quant_arg); | |||
| weight_t->MallocData(); | |||
| float weight[] = {-0.7308652, 0.5257509, -0.87825793, -1.123181, -1.2206168, 0.562695, | |||
| 1.5382664, -0.5020635, 0.8591602, -0.26410004, 1.1262615, 0.073132955}; | |||
| Quantize(weight, weight_t->ElementsNum(), weight_quant_arg->scale, weight_quant_arg->zeroPoint, | |||
| reinterpret_cast<int8_t *>(weight_t->Data())); | |||
| inputs_->push_back(weight_t); | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| float nchw_co[] = {-26.51016327, 7.92113757, 27.25741343, 0.785643655, 31.3307619, 14.05927672, | |||
| -1.178490666, 2.5676252, 16.39408946, -0.394793726, 25.2866881, 3.827249175, | |||
| -0.626854507, -0.3122176, 10.42769169, 8.362184085, 6.04617807, -9.252362384}; | |||
| Quantize(nchw_co, out_t->ElementsNum(), output_quant_arg->scale, output_quant_arg->zeroPoint, *correct); | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1Int8Test1) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit1(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| int Conv1x1Int8TestInit2(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_, | |||
| ConvParameter *conv_param, int8_t **correct) { | |||
| size_t buffer_size; | |||
| Tensor *in_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto in_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| in_quant_arg->zeroPoint = -42, in_quant_arg->scale = 0.117647; | |||
| in_t->AddQuantParam(*in_quant_arg); | |||
| in_t->MallocData(); | |||
| std::string input_path = "./input"; | |||
| auto input = mindspore::lite::ReadFile(input_path.c_str(), &buffer_size); | |||
| memcpy(in_t->Data(), input, buffer_size); | |||
| inputs_->push_back(in_t); | |||
| delete[] input; | |||
| Tensor *weight_t = new Tensor(kNumberTypeInt8, {3, 1, 1, 4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| auto weight_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| weight_quant_arg->zeroPoint = 66, weight_quant_arg->scale = 0.036439215686275; | |||
| weight_t->AddQuantParam(*weight_quant_arg); | |||
| weight_t->MallocData(); | |||
| std::string weight_path = "./weight"; | |||
| auto weight = mindspore::lite::ReadFile(weight_path.c_str(), &buffer_size); | |||
| memcpy(weight_t->Data(), weight, buffer_size); | |||
| inputs_->push_back(weight_t); | |||
| delete[] weight; | |||
| Tensor *bias_t = new Tensor(kNumberTypeInt32, {4}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| bias_t->MallocData(); | |||
| std::string bias_path = "./bias"; | |||
| auto bias = mindspore::lite::ReadFile(bias_path.c_str(), &buffer_size); | |||
| memcpy(bias_t->Data(), bias, buffer_size); | |||
| inputs_->push_back(bias_t); | |||
| delete[] bias; | |||
| Tensor *out_t = new Tensor(kNumberTypeInt8, {1, 2, 3, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1)); | |||
| out_t->MallocData(); | |||
| auto output_quant_arg = new mindspore::lite::tensor::QuantArg(); | |||
| output_quant_arg->zeroPoint = 7, output_quant_arg->scale = 0.234321233; | |||
| out_t->AddQuantParam(*output_quant_arg); | |||
| outputs_->push_back(out_t); | |||
| *correct = reinterpret_cast<int8_t *>(malloc(out_t->ElementsNum() * sizeof(int8_t))); | |||
| std::string output_path = "./output"; | |||
| auto output = mindspore::lite::ReadFile(output_path.c_str(), &buffer_size); | |||
| memcpy(*correct, output, buffer_size); | |||
| delete[] output; | |||
| conv_param->kernel_h_ = conv_param->kernel_w_ = 1; | |||
| conv_param->stride_h_ = conv_param->stride_w_ = 1; | |||
| conv_param->dilation_h_ = conv_param->dilation_w_ = 1; | |||
| conv_param->pad_h_ = conv_param->pad_w_ = 0; | |||
| conv_param->is_relu_ = conv_param->is_relu6_ = false; | |||
| return out_t->ElementsNum(); | |||
| } | |||
| TEST_F(TestConv1x1Int8, Conv1x1Int8Test2) { | |||
| std::vector<lite::tensor::Tensor *> inputs_; | |||
| std::vector<lite::tensor::Tensor *> outputs_; | |||
| auto conv_param = new ConvParameter(); | |||
| int8_t *correct; | |||
| auto ctx = new lite::Context; | |||
| ctx->thread_num_ = 1; | |||
| int total_size = Conv1x1Int8TestInit2(&inputs_, &outputs_, conv_param, &correct); | |||
| kernel::Convolution1x1Int8CPUKernel *conv1x1 = new kernel::Convolution1x1Int8CPUKernel( | |||
| reinterpret_cast<OpParameter *>(conv_param), inputs_, outputs_, ctx, nullptr); | |||
| conv1x1->Init(); | |||
| conv1x1->Run(); | |||
| CompareOutputData(reinterpret_cast<int8_t *>(outputs_[0]->Data()), correct, total_size, 2); | |||
| delete conv1x1; | |||
| for (auto t : inputs_) delete t; | |||
| for (auto t : outputs_) delete t; | |||
| free(correct); | |||
| } | |||
| } // namespace mindspore | |||