From: @zhujingxuan Reviewed-by: Signed-off-by: tags/v1.2.0-rc1
| @@ -81,6 +81,7 @@ set(CODER_OPCODERS_SRC | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/concat_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/fullconnection_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/matmul_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_3x3_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/conv2d_int8_coder.cc | |||
| ${MICRO_DIR}/coder/opcoders/nnacl/int8/pooling_int8_coder.cc | |||
| @@ -126,13 +127,10 @@ set(LITE_KERNEL_SRC | |||
| ${LITE_DIR}/nnacl/int8/fixed_point.c | |||
| ${LITE_DIR}/nnacl/fp32/matmul_fp32.c | |||
| ${LITE_DIR}/nnacl/int8/conv3x3_int8.c | |||
| ) | |||
| set(MICRO_ADAPTER_SRC | |||
| ${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c | |||
| ${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c | |||
| ${MICRO_DIR}/wrapper/int8/conv_init_int8.c | |||
| ${LITE_DIR}/nnacl/int8/conv1x1_int8.c | |||
| ${LITE_DIR}/nnacl/base/conv1x1_base.c | |||
| ) | |||
| list(APPEND FILE_SET ${CODER_SRC} ${CODER_UTILS_SRC} ${CODER_OPCODERS_SRC} ${CODER_GENERATOR_SRC} | |||
| ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC} ${MICRO_ADAPTER_SRC}) | |||
| ${CODER_ALLOCATOR_SRC} ${LITE_SRC} ${LITE_KERNEL_SRC}) | |||
| @@ -0,0 +1,12 @@ | |||
# The wrapper sources use the lite thread pool, which needs pthreads.
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
# Runtime wrapper sources compiled into the codegen target alongside the
# generated code (thread pool + int8/fp32 conv and matmul helpers).
set(MICRO_WRAPPER_SRC
${LITE_DIR}/src/runtime/thread_pool.c
${MICRO_DIR}/wrapper/fp32/matmul_fp32_wrapper.c
${MICRO_DIR}/wrapper/int8/matmul_int8_wrapper.c
${MICRO_DIR}/wrapper/int8/conv_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_init_int8.c
${MICRO_DIR}/wrapper/int8/conv1x1_run_int8.c
)
list(APPEND FILE_SET ${MICRO_WRAPPER_SRC})
| @@ -19,6 +19,7 @@ include_directories(${TOP_DIR}/mindspore/core/) | |||
| #include coder | |||
| include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../) | |||
| include(${MICRO_DIR}/cmake/file_list.cmake) | |||
| include(${MICRO_DIR}/cmake/wrapper.cmake) | |||
| add_executable(codegen main.cc ${FILE_SET}) | |||
| add_dependencies(codegen fbs_src) | |||
| add_dependencies(codegen fbs_inner_src) | |||
| @@ -0,0 +1,193 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "micro/coder/opcoders/nnacl/int8/conv2d_1x1_int8_coder.h" | |||
| #include <string> | |||
| #include <vector> | |||
| #include "securec/include/securec.h" | |||
| #include "src/runtime/kernel/arm/base/convolution_base.h" | |||
| #include "micro/coder/opcoders/file_collector.h" | |||
| #include "micro/coder/log.h" | |||
| #include "micro/coder/opcoders/serializers/nnacl_serializer/nnacl_int8_serializer.h" | |||
| namespace mindspore::lite::micro::nnacl { | |||
// Sets up quantization parameters and all offline buffers/tables needed to
// generate code for an int8 1x1 convolution.
int Conv2D1x1Int8Coder::Prepare(CoderContext *const context) {
  // Owned GEMM descriptor for the conv-as-matmul view; allocated here —
  // NOTE(review): confirm it is released in the destructor (the `= default`
  // destructor would leak it).
  matmul_param_ = new (std::nothrow) MatMulParameter();
  MS_CHECK_PTR(matmul_param_);
  MS_CHECK_RET_CODE(Conv2DBaseCoder::Init(), "Init failed");
  MS_CHECK_RET_CODE(Conv2DBaseCoder::SetQuantParam(), "SetQuantParam failed");
  // More than one filter quant param means per-output-channel quantization.
  filter_peroc_ = (conv_param_->conv_quant_arg_.filter_arg_num_ != kPerTensor);
  if (filter_peroc_) {
    // Expand zp/shift/multiplier into per-oc tables used by the kernels.
    MS_CHECK_RET_CODE(InitFilterPeroc(), "InitFilterPeroc failed.");
  }
  CheckSupportOptimize();
  MS_CHECK_RET_CODE(InitWeightBias(context), "InitWeightBias failed");
  MS_CHECK_RET_CODE(InitParam(), "InitParam failed");
  MS_CHECK_RET_CODE(InitRunBuf(), "InitRunBuf failed");
  return RET_OK;
}
// Emits the per-inference code: parameter structs, the Conv1x1Args aggregate,
// and the Conv1x1Run call executed by the generated model.
int Conv2D1x1Int8Coder::DoCode(CoderContext *const context) {
  // Register the headers/sources the generated target must compile against.
  Collect(context,
          {"nnacl/int8/conv1x1_int8.h", "nnacl/common_func.h", "wrapper/int8/conv1x1_init_int8.h",
           "wrapper/int8/conv1x1_run_int8.h"},
          {"common_func.c", "pack.c", "conv1x1_int8.c", "matmul_int8.c", "fixed_point.c", "conv1x1_init_int8.c",
           "conv1x1_run_int8.c"});
  nnacl::NNaclInt8Serializer code;
  code.CodeStruct("conv_param", *conv_param_);
  code.CodeStruct("matmul_param", *matmul_param_);
  // Field order must match Conv1x1Args in wrapper/int8/conv1x1_run_int8.h.
  // The two nullptr slots are input_ptr_/output_ptr_ and the two 0s the thread
  // strides, all filled at run time — NOTE(review): when pre_trans_input_ is
  // true the generated Pre1x1Trans packs into args->input_ptr_, yet nullptr is
  // emitted here even though InitParam allocates an input_ptr_ workspace;
  // confirm the generated code receives a valid buffer.
  code.CodeBaseStruct("Conv1x1Args", "args", input_sum_, filter_zp_ptr_, left_shift_, right_shift_, multiplier_,
                      packed_weight_, bias_data_, packed_input_, nullptr, nullptr, 0, 0, "conv_param", "matmul_param",
                      matmul_func_, pre_trans_input_, support_optimize_, filter_peroc_);
  // Multi-threaded entry point over the default generated thread pool.
  code.CodeFunction("Conv1x1Run", input_tensor_, "args", "THREAD_POOL_DEFAULT", thread_num_s_, output_tensor_);
  context->AppendCode(code.str());
  return RET_OK;
}
| void Conv2D1x1Int8Coder::CheckSupportOptimize() { | |||
| support_optimize_ = false; | |||
| matmul_func_ = "MatMulInt8_4x16_r"; | |||
| if (target_ == kARM64) { | |||
| matmul_func_ = "MatMulDpInt8_optimize_handler"; | |||
| } | |||
| } | |||
// Emits init-time code that packs weights and bias via Conv1x1Init.
// Packing happens in the generated program ("online"), so only pointer slots
// are reserved in the coder's allocator here.
int Conv2D1x1Int8Coder::InitWeightBias(CoderContext *const context) {
  int32_t input_channel = filter_tensor_->Channel();
  int32_t output_channel = filter_tensor_->Batch();
  int32_t input_zp = conv_param_->conv_quant_arg_.input_quant_args_[0].zp_;
  nnacl::NNaclInt8Serializer code;
  packed_weight_ = static_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(packed_weight_);
  bias_data_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, kOnlineSize, kOnlinePackWeight));
  MS_CHECK_PTR(bias_data_);
  // Conv1x1Init returns the packed buffers through out-parameters, hence the
  // address-of spelling in the emitted call.
  std::string packed_weight_str = "(int8_t **)&" + allocator_->GetRuntimeAddr(packed_weight_);
  std::string bias_data_str = "(int32_t **)&" + allocator_->GetRuntimeAddr(bias_data_);
  std::string filter_zp_str = "";
  if (filter_peroc_) {
    // Per-channel: zero points were materialized in InitFilterPeroc.
    filter_zp_str = allocator_->GetRuntimeAddr(filter_zp_ptr_);
  } else {
    MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
    // Per-tensor: emit a 1-element array holding the single zero point.
    filter_zp_str = "filter_zp";
    code << "int32_t filter_zp[1] = {" << conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_ << "};\n";
  }
  if (target_ == kARM64) {
    // ARM64 defers the optimized-layout decision to run time.
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, "GetSupportOptFlag()", filter_peroc_, packed_weight_str,
                               bias_data_str);
  } else {
    code.CodeFunctionWithCheck("Conv1x1Init", filter_tensor_, bias_tensor_, filter_zp_str, input_channel,
                               output_channel, input_zp, support_optimize_, filter_peroc_, packed_weight_str,
                               bias_data_str);
  }
  context->AppendInitCode(code.str());
  return RET_OK;
}
// Materializes per-output-channel quantization tables (filter zp, left/right
// shift, multiplier) as offline-packed buffers for the generated kernels.
int Conv2D1x1Int8Coder::InitFilterPeroc() {
  int32_t output_channel = filter_tensor_->Batch();
  // Shift/multiplier tables are rounded up to the kernel's oc tile so tiled
  // kernels can read past output_channel; the tail is zero-filled below.
  int round_oc;
  if (target_ == kARM32A) {
    round_oc = UP_ROUND(output_channel, C2NUM);
  } else {
    round_oc = MSMAX(UP_ROUND(output_channel, C16NUM), UP_ROUND(output_channel, C4NUM));
  }
  MS_CHECK_TRUE(conv_quant_arg_->filter_arg_num_ == static_cast<size_t>(output_channel),
                "weight per channel quant param length is not equal to filter num, filter is not PerChannel");
  size_t output_size = output_channel * sizeof(int32_t);
  size_t oc_size = round_oc * sizeof(int32_t);
  /* filter zp: exactly output_channel entries (not rounded) */
  filter_zp_ptr_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, output_size, kOfflinePackWeight));
  MS_CHECK_PTR(filter_zp_ptr_);
  MS_CHECK_PTR(conv_param_->conv_quant_arg_.filter_quant_args_);
  for (int fi = 0; fi < output_channel; fi++) {
    filter_zp_ptr_[fi] = conv_param_->conv_quant_arg_.filter_quant_args_[fi].zp_;
  }
  /* left shift: rounded table, zeroed then copied (tail stays 0) */
  left_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(left_shift_);
  MS_CHECK_RET_CODE(memset_s(left_shift_, oc_size, 0, oc_size), "memset left_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(left_shift_, oc_size, conv_param_->conv_quant_arg_.left_shift_, output_size),
                    "memcpy_s left_shift_ failed");
  /* right shift: same layout as left shift */
  right_shift_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(right_shift_);
  MS_CHECK_RET_CODE(memset_s(right_shift_, oc_size, 0, oc_size), "memset right_shift_ failed");
  MS_CHECK_RET_CODE(memcpy_s(right_shift_, oc_size, conv_param_->conv_quant_arg_.right_shift_, output_size),
                    "memcpy_s right_shift_ failed");
  /* multiplier: same layout as the shifts */
  multiplier_ = static_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, oc_size, kOfflinePackWeight));
  MS_CHECK_PTR(multiplier_);
  MS_CHECK_RET_CODE(memset_s(multiplier_, oc_size, 0, oc_size), "memset multiplier_ failed");
  MS_CHECK_RET_CODE(memcpy_s(multiplier_, oc_size, conv_param_->conv_quant_arg_.quant_multiplier_, output_size),
                    "memcpy_s multiplier_ failed");
  return RET_OK;
}
// Derives the GEMM view of the 1x1 conv and allocates the optional
// input-repack workspace.
int Conv2D1x1Int8Coder::InitParam() {
  // Any padding or stride != 1 means the input must be repacked before the GEMM.
  pre_trans_input_ = (conv_param_->pad_u_ != 0 || conv_param_->pad_l_ != 0 || conv_param_->stride_h_ != 1 ||
                     conv_param_->stride_w_ != 1);
  // GEMM shape: row = output spatial plane, deep = IC, col = OC.
  matmul_param_->row_ = conv_param_->output_h_ * conv_param_->output_w_;
  matmul_param_->deep_ = conv_param_->input_channel_;
  matmul_param_->col_ = conv_param_->output_channel_;
  matmul_param_->row_4_ = UP_ROUND(matmul_param_->row_, C4NUM);
  matmul_param_->deep_4_ = UP_ROUND(matmul_param_->deep_, C4NUM);
  matmul_param_->deep_16_ = UP_ROUND(matmul_param_->deep_, C16NUM);
  int row_pack_count = C4NUM;
  /* init input sum size: one int32 sum per (rounded) output row */
  input_sum_size_ = UP_ROUND(matmul_param_->row_, row_pack_count);
  if (pre_trans_input_) {
    // Workspace for the repacked input plane — NOTE(review): DoCode currently
    // emits nullptr for the args input slot instead of this buffer; verify.
    input_ptr_ = reinterpret_cast<int8_t *>(
      allocator_->Malloc(kNumberTypeInt8, matmul_param_->row_ * matmul_param_->deep_ * sizeof(int8_t), kWorkspace));
    MS_CHECK_PTR(input_ptr_);
  }
  return RET_OK;
}
// Allocates the run-time scratch buffers: per-row input sums and the packed
// input matrix.
int Conv2D1x1Int8Coder::InitRunBuf() {
  input_sum_ =
    reinterpret_cast<int32_t *>(allocator_->Malloc(kNumberTypeInt32, input_sum_size_ * sizeof(int32_t), kWorkspace));
  MS_CHECK_PTR(input_sum_);
  // Packed-input size covers both possible layouts (8x4 and 4x16 roundings),
  // so the buffer works whichever kernel the generated code selects.
  size_t size = MSMAX(UP_ROUND(matmul_param_->row_, C8NUM) * UP_ROUND(matmul_param_->deep_, C4NUM),
                      UP_ROUND(matmul_param_->row_, C4NUM) * UP_ROUND(matmul_param_->deep_, C16NUM));
  packed_input_ = reinterpret_cast<int8_t *>(allocator_->Malloc(kNumberTypeInt8, size * sizeof(int8_t), kWorkspace));
  MS_CHECK_PTR(packed_input_);
  return RET_OK;
}
| } // namespace mindspore::lite::micro::nnacl | |||
| @@ -0,0 +1,67 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| #define MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| #include "micro/coder/opcoders/base/conv2d_base_coder.h" | |||
| #include <memory> | |||
| #include <string> | |||
| #include <vector> | |||
| #include "nnacl/conv_parameter.h" | |||
| namespace mindspore::lite::micro::nnacl { | |||
| class Conv2D1x1Int8Coder final : public Conv2DBaseCoder { | |||
| public: | |||
| Conv2D1x1Int8Coder(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| const Model::Node *node, size_t node_index, Target target) | |||
| : Conv2DBaseCoder(in_tensors, out_tensors, node, node_index, target) {} | |||
| int Prepare(CoderContext *const context) override; | |||
| int DoCode(CoderContext *const context) override; | |||
| ~Conv2D1x1Int8Coder() override = default; | |||
| private: | |||
| void CheckSupportOptimize(); | |||
| int InitWeightBias(CoderContext *const context); | |||
| int InitFilterPeroc(); | |||
| int InitParam(); | |||
| int InitRunBuf(); | |||
| int32_t *input_sum_{nullptr}; /* per-oc */ | |||
| int32_t *filter_zp_ptr_{nullptr}; /* per-oc up round */ | |||
| int32_t *left_shift_{nullptr}; /* per-oc up round */ | |||
| int32_t *right_shift_{nullptr}; /* per-oc up round */ | |||
| int32_t *multiplier_{nullptr}; /* per-oc up round */ | |||
| int8_t *packed_weight_{nullptr}; | |||
| int32_t *bias_data_{nullptr}; | |||
| int8_t *packed_input_{nullptr}; | |||
| int8_t *input_ptr_{nullptr}; | |||
| int8_t *output_ptr_{nullptr}; | |||
| size_t input_sum_size_{0}; | |||
| MatMulParameter *matmul_param_{nullptr}; | |||
| std::string matmul_func_; | |||
| bool pre_trans_input_{false}; | |||
| bool support_optimize_{false}; | |||
| bool filter_peroc_{false}; | |||
| }; | |||
| } // namespace mindspore::lite::micro::nnacl | |||
| #endif // MINDSPORE_LITE_MICRO_CODER_OPCODERS_Conv2D_1X1_INT8_CODER_H_ | |||
| @@ -60,6 +60,16 @@ void NNaclInt8Serializer::CodeStruct(const std::string &name, const ConvParamete | |||
| conv_parameter.input_unit_, conv_parameter.output_unit_, conv_parameter.pad_mode_, conv_parameter.act_type_); | |||
| } | |||
// Emits a C initializer for a MatMulParameter into the generated code.
// The field order here must match the MatMulParameter declaration in
// nnacl/matmul_parameter.h — keep the two in sync.
void NNaclInt8Serializer::CodeStruct(const std::string &name, const MatMulParameter &matmul_parameter) {
  CodeBaseStruct("MatMulParameter", name, matmul_parameter.op_parameter_, matmul_parameter.has_bias_,
                 matmul_parameter.row_, matmul_parameter.col_, matmul_parameter.row_4_, matmul_parameter.row_6_,
                 matmul_parameter.row_12_, matmul_parameter.row_16_, matmul_parameter.row_align_,
                 matmul_parameter.col_4_, matmul_parameter.col_8_, matmul_parameter.col_align_, matmul_parameter.deep_,
                 matmul_parameter.deep_4_, matmul_parameter.deep_16_, matmul_parameter.batch,
                 matmul_parameter.a_transpose_, matmul_parameter.b_transpose_, matmul_parameter.a_const_,
                 matmul_parameter.b_const_, matmul_parameter.act_type_);
}
| void NNaclInt8Serializer::CodeStruct(const std::string &name, const ArithmeticParameter &arithmetic_parameter) { | |||
| CodeBaseStruct("ArithmeticParameter", name, arithmetic_parameter.op_parameter_, arithmetic_parameter.broadcasting_, | |||
| arithmetic_parameter.ndim_, arithmetic_parameter.activation_type_, | |||
| @@ -0,0 +1,90 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "wrapper/int8/conv1x1_init_int8.h" | |||
| #include <memory.h> | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
// Packs 1x1-conv weights/bias into the int8 GEMM layout and folds the
// quantization zero-point terms into the bias.
//
// src_weight: OC x IC row-major int8 weights; src_bias: per-oc bias (may be NULL).
// filter_zps: per-oc zero points when filter_peroc, else a 1-element array.
// packed_weight / bias_data: out-params receiving malloc'd buffers; the caller
// owns them and must free them.
// Returns NNACL_OK, or NNACL_ERR on NULL out-params or allocation failure.
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
                int8_t **packed_weight, int32_t **bias_data) {
  if (packed_weight == NULL || bias_data == NULL) {
    return NNACL_ERR;
  }
#ifdef ENABLE_ARM32
  /* InitWeightBiasArm32 */
  /* weight: 2x16 tile layout, zero-padded to the tile boundaries */
  size_t size = UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C2NUM) * sizeof(int8_t);
  int8_t *packed_weight_ = (int8_t *)(malloc(size));
  if (packed_weight_ == NULL) {
    return NNACL_ERR;
  }
  memset(packed_weight_, 0, size);
  RowMajor2Row2x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  /* bias: zero-padded to the oc tile */
  size = UP_ROUND(output_channel, C2NUM);
  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
  if (bias_data_ == NULL) {
    free(packed_weight_);
    return NNACL_ERR;
  }
  memset(bias_data_, 0, size * sizeof(int32_t));
  if (src_bias != NULL) {
    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
  }
#else
  /* InitWeightBias */
  /* weight: 4x16 layout for the optimized kernel, otherwise 16x4 */
  size_t size = support_optimize ? UP_ROUND(input_channel, C4NUM) * UP_ROUND(output_channel, C16NUM) * sizeof(int8_t)
                                 : UP_ROUND(input_channel, C16NUM) * UP_ROUND(output_channel, C4NUM) * sizeof(int8_t);
  int8_t *packed_weight_ = (int8_t *)(malloc(size));
  if (packed_weight_ == NULL) {
    return NNACL_ERR;
  }
  memset(packed_weight_, 0, size);
  if (support_optimize) {
    RowMajor2Row4x16MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  } else {
    RowMajor2Row16x4MajorInt8(src_weight, packed_weight_, output_channel, input_channel);
  }
  /* bias: zero-padded to the oc tile matching the weight layout */
  size = support_optimize ? UP_ROUND(output_channel, C16NUM) : UP_ROUND(output_channel, C4NUM);
  int32_t *bias_data_ = (int32_t *)malloc(size * sizeof(int32_t));
  if (bias_data_ == NULL) {
    free(packed_weight_);
    return NNACL_ERR;
  }
  memset(bias_data_, 0, size * sizeof(int32_t));
  if (src_bias != NULL) {
    memcpy(bias_data_, src_bias, output_channel * sizeof(int32_t));
  }
#endif
  /* InitBiasByzp */
  /* bias = bias - v2 x zp1 + zp1 x zp2: fold the constant terms of
     (x - input_zp) * (w - filter_zp) into the bias, one oc at a time */
  for (int oc = 0; oc < output_channel; oc++) {
    int32_t weight_sum_value = 0;
    int32_t filter_zp = (filter_peroc) ? filter_zps[oc] : filter_zps[0];
    for (int ic = 0; ic < input_channel; ic++) {
      weight_sum_value += src_weight[oc * input_channel + ic];
    }
    bias_data_[oc] += filter_zp * input_zp * input_channel - weight_sum_value * input_zp;
  }
  *packed_weight = packed_weight_;
  *bias_data = bias_data_;
  return NNACL_OK;
}
| @@ -0,0 +1,28 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| #define MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| #include <stdint.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/conv_parameter.h" | |||
// Packs 1x1-conv weight/bias into the int8 GEMM layout and folds quantization
// zero points into the bias. *packed_weight / *bias_data receive malloc'd
// buffers owned by the caller. Returns NNACL_OK or NNACL_ERR.
int Conv1x1Init(int8_t *src_weight, int32_t *src_bias, int32_t *filter_zps, int32_t input_channel,
                int32_t output_channel, int32_t input_zp, bool support_optimize, bool filter_peroc,
                int8_t **packed_weight, int32_t **bias_data);
| #endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_INIT_INT8_H_ | |||
| @@ -0,0 +1,224 @@ | |||
| /* | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "wrapper/int8/conv1x1_run_int8.h" | |||
| #include "nnacl/base/conv1x1_base.h" | |||
| #include "nnacl/int8/matmul_int8.h" | |||
| #include "nnacl/int8/pack_int8.h" | |||
| #include "nnacl/int8/conv1x1_int8.h" | |||
| #include "nnacl/errorcode.h" | |||
| void Pre1x1Trans(Conv1x1Args *args, int8_t *src_input, int8_t *src_output) { | |||
| args->output_ptr_ = src_output; | |||
| if (args->pre_trans_input_) { | |||
| Conv1x1InputPack(src_input, args->input_ptr_, args->conv_param_, sizeof(int8_t)); | |||
| } else { | |||
| args->input_ptr_ = src_input; | |||
| } | |||
| } | |||
// Pre-pass worker for the oc-parallel optimized path: packs this task's
// C4NUM-row slice of the input into 4x4 layout and accumulates the per-row
// input sums consumed by the GEMM.
int OcOptPre(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  if (args->filter_peroc_) {
    // Per-oc quant: sums are scaled by 1 here; the per-channel filter zp is
    // presumably applied downstream — TODO confirm against the kernel.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }
  return NNACL_OK;
}
// Oc-parallel matmul worker for the optimized kernel: each task computes a
// C16NUM-wide stripe of output channels over the whole packed input.
int RunArm64OptOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int stride = args->thread_stride_oc_ * C16NUM;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    // More tasks than oc stripes: nothing left for this task.
    return NNACL_OK;
  }
  bool filter_peroc = args->filter_peroc_;
  // Per-oc quant reads the pre-expanded tables at this stripe's offset;
  // per-tensor quant reuses the single conv-level tables.
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
  Conv1x1Int8Opt(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_4_,
                 args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
                 args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_4_, cur_left_shift, cur_right_shift,
                 cur_multiplier, args->conv_param_, args->matmul_func_, cur_zp);
  return NNACL_OK;
}
// Oc-parallel matmul worker for the baseline kernel: stripe width is the
// platform oc tile (C2NUM on ARM32, C4NUM elsewhere).
int RunArmOc(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
#ifdef ENABLE_ARM32
  int col_tile = C2NUM;
#else
  int col_tile = C4NUM;
#endif
  int stride = args->thread_stride_oc_ * col_tile;
  int cur_stride = task_id * stride;
  int res_stride = args->matmul_param_->col_ - cur_stride;
  int cur_oc = MSMIN(stride, res_stride);
  if (cur_oc <= 0) {
    // More tasks than oc stripes: nothing left for this task.
    return NNACL_OK;
  }
  bool filter_peroc = args->filter_peroc_;
  // Per-oc quant reads the pre-expanded tables at this stripe's offset;
  // per-tensor quant reuses the single conv-level tables.
  int32_t *cur_left_shift =
    filter_peroc ? args->left_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.left_shift_;
  int32_t *cur_right_shift =
    filter_peroc ? args->right_shift_ + cur_stride : args->conv_param_->conv_quant_arg_.right_shift_;
  int32_t *cur_multiplier =
    filter_peroc ? args->multiplier_ + cur_stride : args->conv_param_->conv_quant_arg_.quant_multiplier_;
  int32_t *cur_zp = filter_peroc ? args->filter_zp_ptr_ + cur_stride : args->filter_zp_ptr_;
  Conv1x1Int8(args->packed_input_, args->packed_weight_ + cur_stride * args->matmul_param_->deep_16_,
              args->output_ptr_ + cur_stride, args->input_sum_, args->bias_data_ + cur_stride,
              args->matmul_param_->row_, cur_oc, args->matmul_param_->deep_16_, cur_left_shift, cur_right_shift,
              cur_multiplier, args->conv_param_, cur_zp);
  return NNACL_OK;
}
// Hw-parallel worker for the optimized kernel: each task packs its own
// C4NUM-row input slice, computes its input sums, then runs the matmul over
// all output channels for those rows.
int RunArm64OptHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in = args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_4_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  if (args->filter_peroc_) {
    // Per-oc quant: sums scaled by 1; channel zp applied via filter_zp_ptr_.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw, 1);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInput4x4AndInputSumPert(hw_in, hw_packed_in, hw_input_sum, args->matmul_param_->deep_, cur_hw,
                                args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_);
  }
  Conv1x1Int8Opt(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
                 args->matmul_param_->col_, args->matmul_param_->deep_4_, args->left_shift_, args->right_shift_,
                 args->multiplier_, args->conv_param_, args->matmul_func_, args->filter_zp_ptr_);
  return NNACL_OK;
}
// Hw-parallel worker for the baseline kernel: packs this task's row slice
// into 16x4 layout, computes the per-layer input sums, then runs the matmul
// over all output channels for those rows.
int RunArmHw(void *cdata, int task_id) {
  Conv1x1Args *args = (Conv1x1Args *)(cdata);
  int cur_stride = args->thread_stride_hw_ * C4NUM;
  int res_stride = args->matmul_param_->row_ - task_id * args->thread_stride_hw_ * C4NUM;
  int cur_hw = MSMIN(cur_stride, res_stride);
  if (cur_hw <= 0) {
    // More tasks than row tiles: nothing left for this task.
    return NNACL_OK;
  }
  int8_t *hw_in = args->input_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->input_channel_;
  int8_t *hw_out = args->output_ptr_ + task_id * args->thread_stride_hw_ * C4NUM * args->conv_param_->output_channel_;
  int8_t *hw_packed_in =
    args->packed_input_ + task_id * args->thread_stride_hw_ * C4NUM * args->matmul_param_->deep_16_;
  int32_t *hw_input_sum = args->input_sum_ + task_id * args->thread_stride_hw_ * C4NUM;
  RowMajor2Row16x4MajorInt8(hw_in, hw_packed_in, cur_hw, args->matmul_param_->deep_);
  if (args->filter_peroc_) {
    // Per-oc quant: sums scaled by 1; channel zp applied via filter_zp_ptr_.
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, 1, UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  } else {
    // Per-tensor quant: fold the single filter zp into the sums directly.
    PackInputSum16x4PerLayer(hw_packed_in, hw_input_sum, args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                             UP_ROUND(cur_hw, C4NUM), args->matmul_param_->deep_16_);
  }
  Conv1x1Int8(hw_packed_in, args->packed_weight_, hw_out, hw_input_sum, args->bias_data_, cur_hw,
              args->matmul_param_->col_, args->matmul_param_->deep_16_, args->left_shift_, args->right_shift_,
              args->multiplier_, args->conv_param_, args->filter_zp_ptr_);
  return NNACL_OK;
}
// Generated-code entry point: runs the int8 1x1 convolution batch by batch,
// choosing between oc-parallel and hw-parallel scheduling on the thread pool.
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out) {
  int row_pack_count = C4NUM;
  int col_pack_count;
#ifdef ENABLE_ARM32
  col_pack_count = C2NUM;
#else
  if (args->support_optimize_) {
    col_pack_count = C16NUM;
  } else {
    col_pack_count = C4NUM;
  }
#endif
  // Tile counts along rows (hw) and output channels (oc), and the per-task
  // strides the workers read back from args.
  int hw_thread_count = UP_DIV(args->matmul_param_->row_, row_pack_count);
  int oc_thread_count = UP_DIV(args->matmul_param_->col_, col_pack_count);
  size_t thread_count_hw = MSMIN(thread_num, hw_thread_count);
  args->thread_stride_hw_ = UP_DIV(hw_thread_count, thread_count_hw);
  size_t thread_count_oc = MSMIN(thread_num, oc_thread_count);
  args->thread_stride_oc_ = UP_DIV(oc_thread_count, thread_count_oc);
  // Split by oc only when there are more oc tiles than threads to fill.
  bool parallel_by_oc = oc_thread_count > thread_num;
  for (int batch_index = 0; batch_index < args->conv_param_->input_batch_; batch_index++) {
    // Rebase input/output pointers for this batch; repack input if padded/strided.
    Pre1x1Trans(args,
                src_in + batch_index * args->conv_param_->input_h_ * args->conv_param_->input_w_ *
                           args->conv_param_->input_channel_,
                src_out + batch_index * args->matmul_param_->row_ * args->matmul_param_->col_);
    if (parallel_by_oc) {
      /* input transpose and input sum: done once for all oc workers */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, OcOptPre, args, thread_count_hw);
      } else {
        RowMajor2Row16x4MajorInt8(args->input_ptr_, args->packed_input_, args->matmul_param_->row_,
                                  args->matmul_param_->deep_);
        if (args->filter_peroc_) {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_, 1, args->matmul_param_->row_4_,
                                   args->matmul_param_->deep_16_);
        } else {
          PackInputSum16x4PerLayer(args->packed_input_, args->input_sum_,
                                   args->conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_,
                                   args->matmul_param_->row_4_, args->matmul_param_->deep_16_);
        }
      }
      /* matmul parallel by oc */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptOc, args, thread_count_oc);
      } else {
        ParallelLaunch(thread_pool, RunArmOc, args, thread_count_oc);
      }
    } else {
      /* matmul parallel by hw: each worker packs and multiplies its own rows */
      if (args->support_optimize_) {
        ParallelLaunch(thread_pool, RunArm64OptHw, args, thread_count_hw);
      } else {
        ParallelLaunch(thread_pool, RunArmHw, args, thread_count_hw);
      }
    }
  }
}
| @@ -0,0 +1,49 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||
| #define MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||
| #include <stdint.h> | |||
| #include <stdbool.h> | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "nnacl/matmul_parameter.h" | |||
| #include "src/runtime/thread_pool.h" | |||
// Argument bundle emitted by the micro code generator and consumed by
// Conv1x1Run and its ParallelLaunch workers. Pointer members reference
// buffers owned elsewhere (generated init code / workspace allocator).
typedef struct {
  int32_t *input_sum_;      /* per-row input sums, per-oc */
  int32_t *filter_zp_ptr_;  /* per-oc up round */
  int32_t *left_shift_;     /* per-oc up round */
  int32_t *right_shift_;    /* per-oc up round */
  int32_t *multiplier_;     /* per-oc up round */
  int8_t *packed_weight_;   /* weights packed by Conv1x1Init */
  int32_t *bias_data_;      /* bias with zp terms folded in by Conv1x1Init */
  int8_t *packed_input_;    /* packed-input scratch */
  int8_t *input_ptr_;       /* per-batch input (set by Pre1x1Trans) */
  int8_t *output_ptr_;      /* per-batch output (set by Pre1x1Trans) */
  size_t thread_stride_hw_; /* row tiles per task; set by Conv1x1Run */
  size_t thread_stride_oc_; /* col tiles per task; set by Conv1x1Run */
  ConvParameter *conv_param_;
  MatMulParameter *matmul_param_;
  MATMUL_OPT_DP_FUNC matmul_func_; /* optimized matmul entry; used when support_optimize_ */
  bool pre_trans_input_;           /* true when padding/stride forces input repack */
  bool support_optimize_;
  bool filter_peroc_;              /* per-output-channel quantization */
} Conv1x1Args;
void Conv1x1Run(int8_t *src_in, Conv1x1Args *args, struct ThreadPool *thread_pool, int thread_num, int8_t *src_out);
| #endif // MINDSPORE_LITE_MICRO_INT8_CONV1X1_RUN_H_ | |||