!3889 Add fp32 & int8 ops of Matmul(Batchmatmul)

Merge pull request !3889 from zhanyuan/master
5 years ago · 4ea5686a97
--- a/mindspore/lite/src/ops/matmul.cc
+++ b/mindspore/lite/src/ops/matmul.cc
@@ -33,29 +33,30 @@ int MatMul::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor
  auto output = outputs_.front();
  MS_ASSERT(output != nullptr);

  std::vector<int> x_shape = input0->shape();
  std::vector<int> w_shape = input1->shape();
  if (x_shape.size() < 2 || w_shape.size() < 2) {
  std::vector<int> a_shape = input0->shape();
  std::vector<int> b_shape = input1->shape();
  if (a_shape.size() < 3 || b_shape.size() < 3) {
    MS_LOG(ERROR) << "inputs shape is invalid";
    return RET_INPUT_TENSOR_ERROR;
  }

  for (int i = 0; i < a_shape.size() - 2; ++i) {
    if (a_shape[i] != b_shape[i]) {
      MS_LOG(ERROR) << "Op MatMul's dimensions must be equal";
      return RET_INPUT_TENSOR_ERROR;
    }
  }

  auto matmul_prim = this->primitive->value_as_MatMul();
  if (matmul_prim->transposeA()) {
    int tmp = x_shape.back();
    x_shape[x_shape.size() - 1] = x_shape[x_shape.size() - 2];
    x_shape[x_shape.size() - 2] = tmp;
    std::swap(a_shape[a_shape.size() - 1], a_shape[a_shape.size() - 2]);
  }
  if (matmul_prim->transposeB()) {
    int tmp = w_shape.back();
    w_shape[w_shape.size() - 1] = w_shape[w_shape.size() - 2];
    w_shape[w_shape.size() - 2] = tmp;
    std::swap(b_shape[b_shape.size() - 1], b_shape[b_shape.size() - 2]);
  }
  auto y_shape_size = std::max(x_shape.size(), w_shape.size());
  std::vector<int> y_shape(y_shape_size);
  y_shape = x_shape;
  y_shape[y_shape_size - 1] = w_shape[w_shape.size() - 1];
  output->set_shape(y_shape);
  std::vector<int> c_shape(a_shape);
  c_shape[c_shape.size() - 1] = b_shape[b_shape.size() - 1];
  output->set_shape(c_shape);
  output->set_data_type(input0->data_type());
  output->SetFormat(input0->GetFormat());

--- a/mindspore/lite/src/ops/ops.cc
+++ b/mindspore/lite/src/ops/ops.cc
@@ -139,6 +139,8 @@ Primitive *Primitive::CreatePrimitive(schema::Primitive *primitive) {
      return new lite::SpaceToBatch(const_cast<schema::Primitive *>(primitive));
    case schema::PrimitiveType_QuantDTypeCast:
      return new lite::QuantDTypeCast(const_cast<schema::Primitive *>(primitive));
    case schema::PrimitiveType_MatMul:
      return new lite::MatMul(const_cast<schema::Primitive *>(primitive));
    default:
      break;
  }
--- a/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.cc
@@ -0,0 +1,72 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "src/runtime/kernel/arm/base/matmul_base.h"
 #include "src/runtime/kernel/arm/fp32/matmul.h"
 #include "src/runtime/kernel/arm/int8/matmul_int8.h"
 #include "src/kernel_factory.h"
 #include "include/errorcode.h"
 #include "include/context.h"

 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;

 namespace mindspore::kernel {
 // Creates and initializes the CPU MatMul kernel that matches the data type of the first input tensor:
 // int8/uint8 inputs get MatmulInt8CPUKernel, float32 inputs get MatmulCPUKernel.
 // Returns the initialized kernel, or nullptr when the context is missing, the data type is not handled,
 // the allocation fails or the kernel's Init() does not return RET_OK.
 kernel::LiteKernel *CpuMatmulKernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                            const std::vector<lite::tensor::Tensor *> &outputs, OpParameter *opParameter,
                                            const lite::Context *ctx, const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
  // The kernel constructors read ctx->threadNum, so a null context must be rejected up front.
  if (ctx == nullptr) {
    MS_LOG(ERROR) << "ctx is nullptr.";
    return nullptr;
  }
  auto input_tensor = inputs.at(kInputIndex);
  auto data_type = input_tensor->data_type();
  kernel::LiteKernel *kernel = nullptr;
  switch (data_type) {
    case kNumberTypeInt8:
    case kNumberTypeUInt8:
      kernel = new (std::nothrow) MatmulInt8CPUKernel(opParameter, inputs, outputs, ctx);
      break;

    case kNumberTypeFloat32:
      kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs, ctx);
      break;

    default:
      // Without this early return `kernel` would stay null and be dereferenced by kernel->Init() below.
      MS_LOG(ERROR) << "Unsupported data type for MatMul: " << static_cast<int>(data_type);
      return nullptr;
  }
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "kernel is nullptr.";
    return nullptr;
  }

  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  }
  return kernel;
 }

 // Registers CpuMatmulKernelCreator for MatMul on CPU under the kNumberTypeFloat32 key.
 // NOTE(review): only the float32 key is registered here, so it is unclear from this file how the
 // int8/uint8 branches of the creator are reached through the registry — confirm.
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h
+++ b/mindspore/lite/src/runtime/kernel/arm/base/matmul_base.h
@@ -0,0 +1,49 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "include/context.h"
 #include "src/runtime/kernel/arm/opclib/matmul.h"

 using mindspore::lite::Context;

 namespace mindspore::kernel {
 class MatmulBaseCPUKernel : public LiteKernel {
 public:
  MatmulBaseCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : LiteKernel(parameter, inputs, outputs), ctx_(ctx), thread_count_(ctx->threadNum) {
    params_ = reinterpret_cast<MatMulParameter *>(opParameter);
  }
  ~MatmulBaseCPUKernel() = default;

  int Init() override { return 0; }
  int ReSize() override { return 0; }
  int Run() override { return 0; }

 protected:
  MatMulParameter *params_;
  int thread_count_;
  int thread_stride_;
  const Context *ctx_;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_BASE_MATMUL_BASE_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc
@@ -15,44 +15,102 @@
 */

 #include "src/runtime/kernel/arm/fp32/matmul.h"
 #include <vector>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/fp32/matmul.h"
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_MatMul;

 namespace mindspore::kernel {
 // Returns the three scratch buffers obtained in Init() to the context allocator.
 MatmulCPUKernel::~MatmulCPUKernel() {
  const auto &allocator = ctx_->allocator;
  allocator->Free(a_c8_ptr_);
  allocator->Free(b_r8_ptr_);
  allocator->Free(c_r8x8_ptr_);
 }

 // Nothing is recomputed on resize; always reports RET_OK.
 int MatmulCPUKernel::ReSize() {
  return RET_OK;
 }

 int MatmulCPUKernel::Run() { return RET_OK; }
 int MatmulCPUKernel::Init() {
  int batch = 1;
  auto x_shape = inputs_[0]->shape();
  auto o_shape = outputs_[0]->shape();
  for (int i = 0; i < x_shape.size() - 2; ++i) {
    batch *= x_shape[i];
  }
  params_->batch = batch;
  params_->row_ = o_shape[o_shape.size() - 2];
  params_->col_ = o_shape[o_shape.size() - 1];
  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
  params_->row_8_ = UP_ROUND(params_->row_, 8);
  params_->col_8_ = UP_ROUND(params_->col_, 8);
  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);

 int MatmulCPUKernel::Init() { return RET_OK; }
  a_c8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(float)));
  if (!a_c8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(float));
  b_r8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(float)));
  if (!b_r8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(float));
  c_r8x8_ptr_ = reinterpret_cast<float *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(float)));
  if (!c_r8x8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(float));
  return RET_OK;
 }

 kernel::LiteKernel *CpuMatmulFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                               const std::vector<lite::tensor::Tensor *> &outputs,
                                               OpParameter *opParameter, const lite::Context *ctx,
                                               const kernel::KernelKey &desc) {
  MS_ASSERT(desc.type == schema::PrimitiveType_MatMul);
  auto *kernel = new (std::nothrow) MatmulCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new MatmulCPUKernel fail!";
    return nullptr;
 int MatmulCPUKernel::RunImpl(int task_id) {
  int cur_oc = MSMIN(thread_stride_, UP_DIV(params_->col_8_, 8) - task_id * thread_stride_);
  if (cur_oc <= 0) {
    return RET_OK;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    delete kernel;
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    return nullptr;
  auto cur_b = b_r8_ptr_ + task_id * thread_stride_ * C8NUM * params_->deep_;
  auto cur_c = c_r8x8_ptr_ + task_id * thread_stride_ * C8NUM * params_->row_8_;
  MatMul(a_c8_ptr_, cur_b, cur_c, NULL, ActType_No, params_->deep_, params_->row_8_, cur_oc * 8);
  return RET_OK;
 }

 int MatmulFloatRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto op = reinterpret_cast<MatmulCPUKernel *>(cdata);
  auto error_code = op->RunImpl(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "MatmulFp32Run error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return kernel;
  return RET_OK;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MatMul, CpuMatmulFp32KernelCreator)
 int MatmulCPUKernel::Run() {
  auto a_ptr = reinterpret_cast<float *>(inputs_[0]->Data());
  auto b_ptr = reinterpret_cast<float *>(inputs_[1]->Data());
  auto c_ptr = reinterpret_cast<float *>(outputs_[0]->Data());
  auto a_stride = params_->row_ * params_->deep_;
  auto b_stride = params_->deep_ * params_->col_;
  auto c_stride = params_->row_ * params_->col_;
  for (int i = 0; i < params_->batch; ++i) {
    auto cur_a_ptr = a_ptr + i * a_stride;
    auto cur_b_ptr = b_ptr + i * b_stride;
    auto cur_c_ptr = c_ptr + i * c_stride;
    if (params_->a_transpose_) {
      RowMajor2Row8Major(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
    } else {
      RowMajor2Col8Major(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
    }
    if (params_->b_transpose_) {
      RowMajor2Col8Major(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
    } else {
      RowMajor2Row8Major(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
    }
    LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_);
    Row8x8Major2RowMajor(c_r8x8_ptr_, cur_c_ptr, params_->row_, params_->col_);
  }
  return RET_OK;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h
@@ -19,27 +19,26 @@

 #include <vector>
 #include "src/lite_kernel.h"

 #include "src/runtime/kernel/arm/opclib/matmul.h"
 #include "src/runtime/kernel/arm/base/matmul_base.h"

 namespace mindspore::kernel {
 class MatmulCPUKernel : public LiteKernel {
 class MatmulCPUKernel : public MatmulBaseCPUKernel {
 public:
  explicit MatmulCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                           const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {
    matmul_param_ = reinterpret_cast<MatMulParameter *>(parameter);
  }
  ~MatmulCPUKernel() override = default;

                           const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~MatmulCPUKernel() override;
  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);

 private:
  MatMulParameter *matmul_param_;
  float *a_c8_ptr_;
  float *b_r8_ptr_;
  float *c_r8x8_ptr_;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_MATMUL_H_

--- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h
@@ -42,7 +42,7 @@ class FullconnectionInt8CPUKernel : public FullconnectionBaseCPUKernel {
  int RunImpl(int task_id);

 private:
  FcQuantArg quant_params_;
  MatmulQuantArg quant_params_;
  int8_t *a_c8_ptr_;
  int8_t *b_r8_ptr_;
  int *c_r8x8_ptr_;
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc
@@ -0,0 +1,142 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/int8/matmul_int8.h"
 #include "src/runtime/kernel/arm/opclib/int8/matmul.h"
 #include "src/runtime/kernel/arm/opclib/common_func.h"
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"

 using mindspore::lite::RET_MEMORY_FAILED;
 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {
 // Returns the three scratch buffers obtained in Init() to the context allocator.
 MatmulInt8CPUKernel::~MatmulInt8CPUKernel() {
  const auto &allocator = ctx_->allocator;
  allocator->Free(a_c8_ptr_);
  allocator->Free(b_r8_ptr_);
  allocator->Free(c_r8x8_ptr_);
 }

 int MatmulInt8CPUKernel::Init() {
  int batch = 1;
  auto x_shape = inputs_[0]->shape();
  auto o_shape = outputs_[0]->shape();
  for (int i = 0; i < x_shape.size() - 2; ++i) {
    batch *= x_shape[i];
  }
  params_->batch = batch;
  params_->row_ = o_shape[o_shape.size() - 2];
  params_->col_ = o_shape[o_shape.size() - 1];
  params_->deep_ = params_->a_transpose_ ? x_shape[x_shape.size() - 2] : x_shape[x_shape.size() - 1];
  params_->row_8_ = UP_ROUND(params_->row_, 8);
  params_->col_8_ = UP_ROUND(params_->col_, 8);
  thread_count_ = MSMIN(thread_count_, UP_DIV(params_->col_8_, 8));
  thread_stride_ = UP_DIV(UP_DIV(params_->col_8_, 8), thread_count_);

  a_c8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->row_8_ * params_->deep_ * sizeof(int8_t)));
  if (!a_c8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(a_c8_ptr_, 0, params_->row_8_ * params_->deep_ * sizeof(int8_t));
  b_r8_ptr_ = reinterpret_cast<int8_t *>(ctx_->allocator->Malloc(params_->col_8_ * params_->deep_ * sizeof(int8_t)));
  if (!b_r8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(b_r8_ptr_, 0, params_->col_8_ * params_->deep_ * sizeof(int8_t));
  c_r8x8_ptr_ = reinterpret_cast<int *>(ctx_->allocator->Malloc(params_->row_8_ * params_->col_8_ * sizeof(int)));
  if (!c_r8x8_ptr_) {
    return RET_MEMORY_FAILED;
  }
  memset(c_r8x8_ptr_, 0, params_->row_8_ * params_->col_8_ * sizeof(int));

  auto input_tensor = inputs_[0];
  auto params = input_tensor->GetQuantParams();
  MS_ASSERT(params.size() == 1);
  quant_params_.input.zp_ = params.front().zeroPoint;
  quant_params_.input.scale_ = params.front().scale;
  auto weight_tensor = inputs_[1];
  params = weight_tensor->GetQuantParams();
  MS_ASSERT(params.size() == 1);
  quant_params_.weight.zp_ = params.front().zeroPoint;
  quant_params_.weight.scale_ = params.front().scale;
  auto output_tensor = outputs_[0];
  params = output_tensor->GetQuantParams();
  MS_ASSERT(params.size() == 1);
  quant_params_.output.zp_ = params.front().zeroPoint;
  quant_params_.output.scale_ = params.front().scale;

  double real_multiplier = quant_params_.input.scale_ * quant_params_.weight.scale_ / quant_params_.output.scale_;
  QuantizeRoundParameter(real_multiplier, &quant_params_.quant_multiplier, &quant_params_.left_shift,
                         &quant_params_.right_shift);
  return RET_OK;
 }

 // Nothing is recomputed on resize; always reports RET_OK.
 int MatmulInt8CPUKernel::ReSize() {
  return RET_OK;
 }

 // Computes the slice of output columns owned by `task_id`. Each task covers up to thread_stride_
 // groups of 8 columns; tasks whose slice starts past the last group return RET_OK without doing work.
 int MatmulInt8CPUKernel::RunImpl(int task_id) {
  const int first_group = task_id * thread_stride_;
  const int groups_left = UP_DIV(params_->col_8_, 8) - first_group;
  const int cur_oc = MSMIN(thread_stride_, groups_left);
  if (cur_oc > 0) {
    // Skip the packed B columns and accumulator columns that belong to earlier tasks.
    auto task_b = b_r8_ptr_ + first_group * C8NUM * params_->deep_;
    auto task_c = c_r8x8_ptr_ + first_group * C8NUM * params_->row_8_;
    MatMulInt8(a_c8_ptr_, task_b, task_c, params_->row_8_, cur_oc * 8, params_->deep_, quant_params_.input.zp_,
               quant_params_.weight.zp_);
  }
  return RET_OK;
 }

 int MatmulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto op = reinterpret_cast<MatmulInt8CPUKernel *>(cdata);
  auto ret = op->RunImpl(task_id);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "MatmulInt8Run error task_id[" << task_id << "] error_code[" << ret << "]";
    return ret;
  }
  return RET_OK;
 }

 int MatmulInt8CPUKernel::Run() {
  auto a_ptr = reinterpret_cast<int8_t *>(inputs_[0]->Data());
  auto b_ptr = reinterpret_cast<int8_t *>(inputs_[1]->Data());
  auto c_ptr = reinterpret_cast<int8_t *>(outputs_[0]->Data());
  auto a_stride = params_->row_ * params_->deep_;
  auto b_stride = params_->deep_ * params_->col_;
  auto c_stride = params_->row_ * params_->col_;

  for (int i = 0; i < params_->batch; ++i) {
    auto cur_a_ptr = a_ptr + i * a_stride;
    auto cur_b_ptr = b_ptr + i * b_stride;
    auto cur_c_ptr = c_ptr + i * c_stride;
    if (params_->a_transpose_) {
      RowMajor2Row8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->deep_, params_->row_);
    } else {
      RowMajor2Col8MajorInt8(cur_a_ptr, a_c8_ptr_, params_->row_, params_->deep_);
    }
    if (params_->b_transpose_) {
      RowMajor2Col8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->col_, params_->deep_);
    } else {
      RowMajor2Row8MajorInt8(cur_b_ptr, b_r8_ptr_, params_->deep_, params_->col_);
    }
    LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_);
    auto &q = quant_params_;
    SimplePostFuncInt8(c_r8x8_ptr_, cur_c_ptr, params_->col_, params_->row_, params_->row_8_, q.quant_multiplier,
                       q.left_shift, q.right_shift, q.output.zp_);
  }

  return RET_OK;
 }

 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
+++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h
@@ -0,0 +1,47 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_

 #include <vector>
 #include "include/context.h"
 #include "src/runtime/kernel/arm/opclib/quantization/quantize.h"
 #include "src/runtime/kernel/arm/base/matmul_base.h"

 using mindspore::lite::Context;

 namespace mindspore::kernel {
 // int8 MatMul kernel for CPU. Init() sets up the scratch buffers and quantization parameters, Run() drives
 // the computation and RunImpl() handles the share of work belonging to one task id.
 class MatmulInt8CPUKernel : public MatmulBaseCPUKernel {
 public:
  MatmulInt8CPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs, const Context *ctx)
      : MatmulBaseCPUKernel(parameter, inputs, outputs, ctx) {}
  ~MatmulInt8CPUKernel() override;
  int Init() override;
  int ReSize() override;
  int Run() override;
  int RunImpl(int task_id);

 private:
  MatmulQuantArg quant_params_ = {};
  // The destructor frees these unconditionally, so they must hold a defined value even when Init() was
  // never called or returned early on an allocation failure.
  int8_t *a_c8_ptr_ = nullptr;
  int8_t *b_r8_ptr_ = nullptr;
  int *c_r8x8_ptr_ = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MATMUL_INT8_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.cc
@@ -236,3 +236,20 @@ void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane
  }
  return;
 }

 // Requantizes int32 accumulators stored in row8x8-major order into a row-major int8 matrix.
 //   in         : accumulators; element (r, c) lives at (c / 8) * plane8 * 8 + r * 8 + (c % 8)
 //   out        : plane x oc row-major int8 result
 //   oc         : number of output columns
 //   plane      : number of output rows
 //   plane8     : row stride of `in`, used in the source index
 //   multiplier, left_shift, right_shift : arguments forwarded to MultiplyByQuantizedMultiplier
 //   zp         : output zero point added after scaling
 void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                         int32_t left_shift, int32_t right_shift, int32_t zp) {
  /*  (int32_t)row8x8-major * multiplier => (int8_t)row-major  */
  for (int r = 0; r < plane; r++) {
    for (int c = 0; c < oc; c++) {
      int c8div = c / 8, c8mod = c % 8;
      int src_index = c8div * plane8 * 8 + r * 8 + c8mod;
      int dst_index = r * oc + c;
      int32_t value = in[src_index];
      value = MultiplyByQuantizedMultiplier(value, multiplier, left_shift, right_shift) + zp;
      // Saturate with the signed-char limits: CHAR_MIN/CHAR_MAX are 0/255 on targets where plain char is
      // unsigned, which would let out-of-range values wrap when narrowed to int8_t.
      value = MSMIN(SCHAR_MAX, value);
      value = MSMAX(SCHAR_MIN, value);
      out[dst_index] = static_cast<int8_t>(value);
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h
@@ -33,6 +33,8 @@ void ReluFp32(float *data, int ele_num);
 void Relu6Fp32(float *data, int ele_num);
 void PostFuncInt8(const int *in, const int *bias, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                  int32_t left_shift, int32_t right_shift, int32_t zp, int8_t mini, int8_t maxi);
 void SimplePostFuncInt8(const int *in, int8_t *out, int oc, int plane, int plane8, int32_t multiplier,
                        int32_t left_shift, int32_t right_shift, int32_t zp);
 void IndirectGemmFp32_8x8(float *output, const float *input, const float *weight, const float *bias, size_t step,
                          size_t ic4, size_t output_channel, size_t offset, size_t mode, size_t writeC4, size_t relu,
                          size_t relu6);
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/matmul.cc
@@ -65,9 +65,7 @@ void MatMul8x8(const float *a, const float *b, float *c, const float *bias, ActT
        size_t bi = c8div * deep * 8 + d * 8 + c8mod;
        value = value + a[ai] * b[bi];
      }
      if (bias != nullptr) {
        value += bias[col];
      }
      if (bias != nullptr) value += bias[col];
      if (act_type == ActType_Relu6) value = MSMIN(6.0f, value);
      if (act_type != ActType_No) value = MSMAX(0.0f, value);
      c[ci] = value;
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.cc
@@ -18,6 +18,17 @@
 #include <limits.h>
 #include "src/runtime/kernel/arm/opclib/quantization/fixed_point.h"

 // Repacks a row-major `row` x `col` int8 matrix into groups of 8 columns: element (r, c) is written to
 // dst_ptr[(c / 8) * 8 * row + r * 8 + (c % 8)]. Destination slots of a partial last group are not touched.
 void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  const int8_t *cursor = src_ptr;  // walks the source linearly, one element per inner iteration
  for (int r = 0; r < row; ++r) {
    for (int c = 0; c < col; ++c, ++cursor) {
      const int group = c / 8;
      const int lane = c % 8;
      dst_ptr[group * 8 * row + r * 8 + lane] = *cursor;
    }
  }
 }

 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col) {
  for (int r = 0; r < row; r++) {
    int rd8 = r / 8;
@@ -26,7 +37,6 @@ void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col)
      dst_ptr[rd8 * col * 8 + c * 8 + rm8] = src_ptr[r * col + c];
    }
  }
  return;
 }

 void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
@@ -46,5 +56,4 @@ void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, co
      c[ci] = value;
    }
  }
  return;
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h
@@ -22,7 +22,7 @@

 void MatMulInt8(const int8_t *a, const int8_t *b, int32_t *c, const int row8, const int col8, const int deep,
                const int32_t a_zp, const int32_t b_zp);
 void RowMajor2Row8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);
 void RowMajor2Col8MajorInt8(int8_t *src_ptr, int8_t *dst_ptr, int row, int col);

 #endif  // MINDSPORE_LITE_SRC_BACKEND_ARM_OPCLIB_INT8_MATMUL_H_

--- a/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/matmul.h
@@ -29,6 +29,7 @@ struct MatMulParameter {
  int col_8_;
  int deep_;
  bool has_bias_;
  int batch;
  bool a_transpose_; /* false :  row-major  */
  bool b_transpose_; /* true  :  col-major  */
  ActType act_type_;
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h
@@ -22,6 +22,7 @@
 #include <stdlib.h>
 #include <limits.h>
 #include <limits>
 #include "src/runtime/kernel/arm/opclib/op_base.h"

 struct QuantArg {
  double scale_;
@@ -49,7 +50,7 @@ struct ConcatQuantArg {
  QuantArg out_quant_args_;
 };

 struct FcQuantArg {
 struct MatmulQuantArg {
  QuantArg input;
  QuantArg weight;
  QuantArg output;
@@ -137,4 +138,22 @@ inline void CalculateActivationRangeQuantized(bool is_relu, bool is_relu6, int32
  *mini = min;
  *maxi = max;
 }

 // Quantizes `length` floats to int8: q = round(x / scale + zero_point), saturated to the int8_t range.
 //   input_data  : source floats
 //   scale       : quantization step; must be non-zero
 //   zero_point  : offset added before rounding
 //   output_data : receives `length` int8 values
 inline void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
  // Use the int8_t limits rather than CHAR_MIN/CHAR_MAX, which describe plain char and are 0/255 on
  // targets where char is unsigned.
  const int q_min = std::numeric_limits<int8_t>::min();
  const int q_max = std::numeric_limits<int8_t>::max();
  for (int i = 0; i < length; ++i) {
    int r = static_cast<int>(round(input_data[i] / scale + zero_point));
    // Clamp both bounds while still in int: narrowing first would wrap values below -128 (e.g. -200 -> 56)
    // before the lower bound is ever checked.
    if (r > q_max) {
      r = q_max;
    } else if (r < q_min) {
      r = q_min;
    }
    output_data[i] = static_cast<int8_t>(r);
  }
 }

 // Converts `length` int8 values back to float: x = scale * (q - zero_point).
 inline void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
  const int8_t *src = input_data;
  float *dst = output_data;
  for (int remaining = length; remaining > 0; --remaining) {
    const int centered = *src++ - zero_point;  // int8 promotes to int before the subtraction
    *dst++ = scale * centered;
  }
 }

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_QUANTIZATION_QUANTIZE_H_
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/matmul_fp32_tests.cc
@@ -0,0 +1,169 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <iostream>
 #include "mindspore/core/utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/matmul.h"
 #include "src/kernel_registry.h"
 #include "src/lite_kernel.h"

 namespace mindspore {
 // Fixture for the fp32 MatMul kernel tests; adds nothing on top of mindspore::Common.
 class TestMatMulFp32 : public mindspore::Common {
 public:
  TestMatMulFp32() = default;
 };

 // Builds the tensors for one MatMul test case: two input tensors filled from a_ptr / b_ptr and one output
 // tensor with allocated storage. The tensors are appended to *inputs_ / *outputs_ (the caller deletes
 // them). Returns the element count of the output tensor.
 int MMTestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                float *a_ptr, float *b_ptr, std::vector<int> a_shape, std::vector<int> b_shape,
                std::vector<int> c_shape) {
  // Creates an NHWC float tensor of the given shape and allocates its data buffer.
  auto make_tensor = [](const std::vector<int> &shape) {
    auto tensor =
      new lite::tensor::Tensor(kNumberTypeFloat, shape, schema::Format_NHWC, static_cast<schema::NodeType>(1));
    tensor->MallocData();
    return tensor;
  };

  auto lhs = make_tensor(a_shape);
  memcpy(lhs->Data(), a_ptr, sizeof(float) * lhs->ElementsNum());
  inputs_->push_back(lhs);

  auto rhs = make_tensor(b_shape);
  memcpy(rhs->Data(), b_ptr, sizeof(float) * rhs->ElementsNum());
  inputs_->push_back(rhs);

  auto result = make_tensor(c_shape);
  outputs_->push_back(result);

  return result->ElementsNum();
 }

 // Multiplies a {1, 2, 8} matrix by a {1, 8, 3} matrix (no transposes, no bias) with two threads and
 // compares the {1, 2, 3} result against precomputed values.
 TEST_F(TestMatMulFp32, simple) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = false;
  matmul_param->has_bias_ = false;
  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793,  -6.1471,  -1.7680453, -6.5721383,
               17.87506,   -5.1192183, 10.742863,  1.4536934, 19.693445, 19.45783, 5.063163,   0.5234792};
  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671,  -0.012470592,   -0.0053063435,
               0.006050155,   0.008656233,  0.012911413,  -0.0028635843, -0.00034080597, -0.0010622552,
               -0.012254699,  -0.01312836,  0.0025241964, -0.004706142,  0.002451482,    -0.009558459,
               0.004481974,   0.0033251503, -0.011705584, -0.001720293,  -0.0039410214,  -0.0073637343};
  std::vector<int> a_shape = {1, 2, 8};
  std::vector<int> b_shape = {1, 8, 3};
  std::vector<int> c_shape = {1, 2, 3};
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {-0.1256939023733139, -0.07744802534580231,  0.07410638779401779,
                     -0.3049793541431427, -0.027687929570674896, -0.18109679222106934};
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // The kernel destructor frees its buffers through ctx->allocator, so the context goes after the kernel.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
 }

 // Same inputs as the plain case but with B supplied as {1, 3, 8} and b_transpose_ set; runs with two
 // threads and compares the {1, 2, 3} result against precomputed values.
 TEST_F(TestMatMulFp32, simple_transb) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = true;
  matmul_param->has_bias_ = false;
  float a[] = {-3.2366564, -4.7733846, -7.8329225, 16.146885, 5.060793,  -6.1471,  -1.7680453, -6.5721383,
               17.87506,   -5.1192183, 10.742863,  1.4536934, 19.693445, 19.45783, 5.063163,   0.5234792};
  float b[] = {-0.0024438887, 0.0006738146, -0.008169129, 0.0021510671,  -0.012470592,   -0.0053063435,
               0.006050155,   0.008656233,  0.012911413,  -0.0028635843, -0.00034080597, -0.0010622552,
               -0.012254699,  -0.01312836,  0.0025241964, -0.004706142,  0.002451482,    -0.009558459,
               0.004481974,   0.0033251503, -0.011705584, -0.001720293,  -0.0039410214,  -0.0073637343};
  std::vector<int> a_shape = {1, 2, 8};
  std::vector<int> b_shape = {1, 3, 8};
  std::vector<int> c_shape = {1, 2, 3};
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {0.00533547, 0.002545945, 0.062974121, -0.445441471, -0.246223617, -0.142070031};
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // The kernel destructor frees its buffers through ctx->allocator, so the context goes after the kernel.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
 }

 // Batched fp32 MatMul with only B flagged as transposed: A has shape {3, 2, 8}, B is supplied as {3, 3, 8}, and
 // the kernel output (declared shape {3, 2, 3}) is compared with precomputed reference values. Runs with a
 // single thread.
 TEST_F(TestMatMulFp32, batch) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  matmul_param->a_transpose_ = false;
  matmul_param->b_transpose_ = true;
  matmul_param->has_bias_ = false;
  float a[] = {-4.946672525326248,  11.154420027909701,  -7.831129637356922,  17.309845099949953,  -10.46177877610444,
               2.5412751480833897,  2.700113860276929,   -12.616715572097341, -15.513316568881574, -9.513294738065516,
               17.931148376418896,  -10.83801964632579,  -14.023733862948017, -14.50805001403956,  0.7952221556310306,
               6.619720423569035,   -19.277904230909357, -13.450479287024839, 19.914652156692625,  16.542571697048878,
               -2.9715041389268926, 4.949555349889412,   -1.9408110276290103, -15.062828261031868, 0.20012569643335,
               8.260383531209776,   3.1092344458607357,  16.742272486091487,  17.31277252415167,   -16.60303202099434,
               -8.980314693173042,  -11.735087989358268, -14.918976184088514, -11.347592686892733, 11.808756029220604,
               -18.76179414554809,  7.579758962360987,   3.13240880962163,    6.528181981442103,   -16.802624652419794,
               -14.323146919914901, -16.197579076296144, 9.738053920125779,   -12.245780062949866, 8.817905278096319,
               0.5261391331275007,  -18.26152522535471,  -2.400461208771226};
  float b[] = {
    -0.895183867395529,    -0.8146900207660068,   -0.27931593219652817,  0.783554361201179,     -0.05080215007779798,
    -0.9879631271568501,   0.07710949009001333,   -0.9562579726211344,   0.29505553318356825,   -0.26651960351085124,
    -0.12755456259718279,  -0.8221417897250098,   -0.5094334041431876,   -0.9117373380256013,   0.991501784215064,
    0.20131976450979394,   0.07889260559412059,   -0.8138407752750305,   -0.047622075866657454, -0.2778043115153188,
    -0.6269973420163957,   -0.44345812666611617,  -0.8571568605933642,   0.020192166011526735,  0.4860054298402434,
    0.41525925469513614,   -0.40270506445219967,  -0.8716538067535347,   0.5276448387223114,    0.6064500154192936,
    -0.9553204135772526,   0.3253219646257437,    -0.7237956595774822,   0.3271284879679077,    -0.534543967339336,
    -0.4076498484281894,   0.01574797075171963,   -0.37322004720586244,  0.16425071396119928,   -0.5328652244800547,
    0.7389336170615435,    -0.6552069958923377,   -0.042305872596973604, -0.6714941466767734,   -0.9281411415119043,
    -0.7748558258281224,   -0.6209799945964443,   0.02526428593887675,   -0.44984776800225856,  0.6281401952319337,
    0.9907258228680276,    0.6288646615999687,    -0.82076880150175,     0.3065944740797497,    -0.29201038744043584,
    -0.025685501802048982, -0.07273175145419652,  0.9370449239208709,    -0.8233807408078093,   -0.4195634619023012,
    0.9799555630257346,    -0.23461882935715228,  -0.8884793313829993,   -0.4760267734754635,   -0.2874539543614072,
    -0.8795685985480997,   -0.08099698251915255,  -0.1626521023321741,   -0.9337167240793414,   0.40924842916829207,
    -0.7375713045221615,   -0.0065659291539015285};
  std::vector<int> a_shape = {3, 2, 8};
  std::vector<int> b_shape = {3, 3, 8};
  std::vector<int> c_shape = {3, 2, 3};
  // The value returned by MMTestInit is used below as the number of output elements to compare.
  int total_size = MMTestInit(&inputs_, &outputs_, a, b, a_shape, b_shape, c_shape);
  auto ctx = new lite::Context;
  ctx->threadNum = 1;
  auto mm = new kernel::MatmulCPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);
  mm->Init();
  mm->Run();
  float correct[] = {21.38518524169922,  -14.514888763427734, -11.040614128112793, 16.91403579711914,
                     27.07421112060547,  23.35394287109375,   -39.006141662597656, -2.021998405456543,
                     -17.63555145263672, -8.490625381469727,  5.317771911621094,   -14.561882019042969,
                     -7.251564025878906, -2.508212089538574,  5.86458683013916,    -3.466249465942383,
                     8.869029998779297,  25.034008026123047};

  // The leftover debug loop that printed 18 hard-coded output values was dropped; the comparison below already
  // covers the output.
  CompareOutputData(reinterpret_cast<float *>(outputs_[0]->Data()), correct, total_size, 0.0001);
  delete matmul_param;
  delete mm;
  // ctx is allocated by this test and was previously never released. It is freed only after the kernel that
  // received it is gone. NOTE(review): assumes the kernel does not take ownership of ctx - confirm.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
 }
 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/fullconnection_int8_tests.cc
@@ -13,13 +13,11 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <iostream>
 #include <memory>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/int8/matmul.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/lite_kernel.h"

@@ -30,21 +28,6 @@ class TestFcInt8 : public mindspore::Common {
  TestFcInt8() {}
 };

 // Quantizes `length` floats from input_data into int8 values written to output_data.
 // Each element is mapped to round(zero_point + value / scale) and then clamped to the int8 range.
 void Quantize(float *input_data, int length, float scale, int zero_point, int8_t *output_data) {
  const float lowest = std::numeric_limits<int8_t>::min();
  const float highest = std::numeric_limits<int8_t>::max();
  for (int idx = 0; idx < length; ++idx) {
    const float rounded = std::round(zero_point + (input_data[idx] / scale));
    // Saturate before narrowing so out-of-range values do not wrap around.
    const float saturated = std::max<float>(lowest, std::min<float>(highest, rounded));
    output_data[idx] = static_cast<int8_t>(saturated);
  }
 }

 // Converts `length` int8 values from input_data back to floats in output_data using
 // value = scale * (quantized - zero_point).
 void Dequantize(int8_t *input_data, int length, float scale, int zero_point, float *output_data) {
  int idx = 0;
  while (idx < length) {
    // The subtraction is done in int, so the int8 operand is widened first.
    const int centered = input_data[idx] - zero_point;
    output_data[idx] = scale * centered;
    ++idx;
  }
 }

 int FcInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                   MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
  float input_max = 20;
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/int8/matmul_int8_tests.cc
@@ -0,0 +1,126 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/quantization/quantize.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/common_func.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/lite_kernel.h"

 namespace mindspore {
 // Test fixture for the int8 MatMul kernel tests; it adds no state of its own on top of mindspore::Common.
 class TestMatmulInt8 : public mindspore::Common {
 public:
  TestMatmulInt8() = default;
 };

 // Builds the tensors and parameters used by the int8 MatMul test.
 //
 // Two int8 input tensors ({1, 2, 8} and {1, 3, 8}) are filled by quantizing hard-coded float data, and one
 // int8 output tensor ({1, 2, 3}) is allocated. Each tensor gets a quant param, and the tensors are appended to
 // inputs_ / outputs_ (the caller deletes them).
 //   matmal_param:      receives a_transpose_ = false, b_transpose_ = true, has_bias_ = false.
 //   correct:           receives a malloc'ed float buffer holding the expected outputs; the caller must free() it.
 //   scale, zeropoint:  receive the scale and zero point computed for the output tensor.
 // Returns the number of elements of the output tensor.
 int MMInt8TestInit(std::vector<lite::tensor::Tensor *> *inputs_, std::vector<lite::tensor::Tensor *> *outputs_,
                    MatMulParameter *matmal_param, float **correct, double *scale, int *zeropoint) {
  float input_max = 20;
  float input_min = -20;
  float weight_max = 1;
  float weight_min = -1;
  float output_max = 30;
  float output_min = -30;

  // scale = (max - min) / (int8 max - int8 min); the zero point is chosen so that `max` maps to int8 max.
  double input_scale =
    (input_max - input_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int input_zp = std::numeric_limits<int8_t>::max() - input_max / input_scale;
  double weight_scale =
    (weight_max - weight_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int weight_zp = std::numeric_limits<int8_t>::max() - weight_max / weight_scale;
  double output_scale =
    (output_max - output_min) / (std::numeric_limits<int8_t>::max() - std::numeric_limits<int8_t>::min());
  int output_zp = std::numeric_limits<int8_t>::max() - output_max / output_scale;
  *scale = output_scale;
  *zeropoint = output_zp;

  auto in_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  in_t->MallocData();
  float in[] = {6.583835634764597,   11.337275140963907,  -4.125256949459629, 10.994337291530833,
                19.086065139532636,  3.620842999158455,   13.167624585590346, -18.326739299407755,
                14.877693740734841,  -17.092677920571653, 19.24147072807235,  -15.14805323833401,
                -18.075654829688737, -0.9164404591894204, -3.836646280336332, -10.870298671273918};
  Quantize(in, in_t->ElementsNum(), input_scale, input_zp, reinterpret_cast<int8_t *>(in_t->Data()));
  // The quant args are plain locals: the original heap-allocated them, passed them by dereference and never
  // deleted them.
  mindspore::lite::tensor::QuantArg in_quant_arg{};
  in_quant_arg.zeroPoint = input_zp;
  in_quant_arg.scale = input_scale;
  in_t->AddQuantParam(in_quant_arg);
  inputs_->push_back(in_t);

  auto weight_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 3, 8}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  weight_t->MallocData();
  float weight[] = {0.3651070698591563,    -0.5856943921727129,  -0.7472032663840145,  0.9489992871641959,
                    -0.8179490270358738,   -0.873058811259344,   0.39876672713807215,  -0.1816769383004213,
                    -0.13584645926733696,  -0.7614673836659709,  -0.2535825872616164,  -0.05265760030895916,
                    0.28558728305658754,   0.15404213943520118,  -0.1634824450738006,  -0.5068199082730189,
                    -0.026961256849111326, -0.1508441942453307,  0.9375335677537737,   0.3304690744194263,
                    -0.5091563780251127,   0.029887336278646925, -0.39540496207319276, 0.46094065001445084};
  Quantize(weight, weight_t->ElementsNum(), weight_scale, weight_zp, reinterpret_cast<int8_t *>(weight_t->Data()));
  mindspore::lite::tensor::QuantArg weight_quant_arg{};
  weight_quant_arg.zeroPoint = weight_zp;
  weight_quant_arg.scale = weight_scale;
  weight_t->AddQuantParam(weight_quant_arg);
  inputs_->push_back(weight_t);

  auto out_t =
    new lite::tensor::Tensor(kNumberTypeInt8, {1, 2, 3}, schema::Format_NHWC, static_cast<schema::NodeType>(1));
  out_t->MallocData();
  mindspore::lite::tensor::QuantArg output_quant_arg{};
  output_quant_arg.zeroPoint = output_zp;
  output_quant_arg.scale = output_scale;
  out_t->AddQuantParam(output_quant_arg);
  outputs_->push_back(out_t);

  // Expected float results; copied into a heap buffer because they must outlive this function.
  *correct = reinterpret_cast<float *>(malloc(out_t->ElementsNum() * sizeof(float)));
  float expected[] = {-0.912632942, 4.08398056, -25.385608673, 2.720281124, 7.745952606, 20.893184662};
  memcpy(*correct, expected, out_t->ElementsNum() * sizeof(float));

  matmal_param->b_transpose_ = true;
  matmal_param->a_transpose_ = false;
  matmal_param->has_bias_ = false;
  return out_t->ElementsNum();
 }

 // Runs the int8 MatMul kernel on the tensors prepared by MMInt8TestInit, dequantizes the kernel output and
 // compares it with the float reference values using a tolerance of 0.3.
 TEST_F(TestMatmulInt8, mmint8) {
  std::vector<lite::tensor::Tensor *> inputs_;
  std::vector<lite::tensor::Tensor *> outputs_;
  auto matmul_param = new MatMulParameter();
  // Out-parameters filled by MMInt8TestInit; initialized so they never hold indeterminate values.
  float *correct = nullptr;
  double output_scale = 0;
  int output_zp = 0;
  int total_size = MMInt8TestInit(&inputs_, &outputs_, matmul_param, &correct, &output_scale, &output_zp);
  auto ctx = new lite::Context;
  ctx->threadNum = 2;
  kernel::MatmulInt8CPUKernel *mm =
    new kernel::MatmulInt8CPUKernel(reinterpret_cast<OpParameter *>(matmul_param), inputs_, outputs_, ctx);

  mm->Init();
  mm->Run();
  // Size the dequantized buffer and the comparison from the fixture's element count instead of a hard-coded 6.
  std::vector<float> fout(total_size, 0.0f);
  Dequantize(reinterpret_cast<int8_t *>(outputs_[0]->Data()), total_size, output_scale, output_zp, fout.data());
  CompareOutputData(fout.data(), correct, total_size, 0.3);
  delete matmul_param;
  delete mm;
  // ctx is allocated by this test and was previously never released. It is freed only after the kernel that
  // received it is gone. NOTE(review): assumes the kernel does not take ownership of ctx - confirm.
  delete ctx;
  for (auto t : inputs_) delete t;
  for (auto t : outputs_) delete t;
  free(correct);
 }

 }  // namespace mindspore