train on device

5 years ago · 943a25a47b
--- a/mindspore/lite/schema/model.fbs
+++ b/mindspore/lite/schema/model.fbs
@@ -174,6 +174,19 @@ union PrimitiveType {
    Where,
    OneHot,
    Lstm,
    Conv2DGradFilter,
    Conv2DGradInput,
    PoolingGrad,
    BNGradInput,
    OptMomentum,
    BiasGrad,
    SoftmaxCrossEntropy,
    AddGrad,
    SubGrad,
    MulGrad,
    DivGrad,
    PowerGrad,
    ActivationGrad,
    PriorBox
 }

--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@@ -55,7 +55,25 @@ enum ActivationType : byte {
    LINEAR = 15,
    UNKNOW = 16
 }

 // Selects which activation's gradient an ActivationGrad op computes (used by `ActivationGrad.type`).
 // NOTE(review): the tail values (LINEAR = 15, UNKNOW = 16) match ActivationType; the whole list
 // presumably mirrors ActivationType value-for-value — confirm against the full ActivationType enum.
 // The fp32 CPU kernel (activation_grad.cc) dispatches on RELU, RELU6, LEAKY_RELU, SIGMOID, TANH,
 // HSWISH and HSIGMOID and reports an error for every other value.
 enum ActivationGradType : byte {
    NO_ACTIVATION = 0,
    RELU = 1,
    SIGMOID = 2,
    RELU6 = 3,
    ELU = 4,
    LEAKY_RELU = 5,
    ABS = 6,
    RELU1 = 7,
    SOFTSIGN = 8,
    SOFTPLUS = 9,
    TANH = 10,
    SELU = 11,
    HSWISH = 12,
    HSIGMOID = 13,
    THRESHOLDRELU = 14,
    LINEAR = 15,
    UNKNOW = 16
 }
 enum ReduceType : byte {
    REDUCE_MAX = 0,
    REDUCE_MEAN = 1,
@@ -125,6 +143,10 @@ table SoftMax {
 table Activation {
    type: ActivationType = 0;
 }
 // Attributes of the ActivationGrad primitive.
 // `type` selects the activation whose gradient is computed; it defaults to 0 (NO_ACTIVATION).
 table ActivationGrad {
    type: ActivationGradType = 0;
 }


 table Conv2D {
    format: Format = 0;
@@ -146,7 +168,45 @@ table Conv2D {
    activationType: ActivationType = 0;
 }

 table FusedBatchNorm {
 // Attributes of the Conv2DGradFilter primitive.
 // NOTE(review): presumably the gradient of Conv2D with respect to its filter, carrying the same
 // convolution attributes as Conv2D — confirm against the Conv2D table and the kernel that reads it.
 table Conv2DGradFilter {
    format: Format = 0;
    group: int;
    channelIn: int;
    channelOut: int;
    kernelW: int;
    kernelH: int;
    strideW: int;
    strideH: int;
    padMode: PadMode;
    padUp: int;
    padDown: int;
    padLeft: int;
    padRight: int;
    dilateW: int;
    dilateH: int;
    hasBias: bool = false;
    activationType: ActivationType = 0;
 }

 table Conv2DGradInput {
    format: Format = 0;
    group: int;
    channelIn: int;
    channelOut: int;
    kernelW: int;
    kernelH: int;
    strideW: int;
    strideH: int;
    padMode: PadMode;
    padUp: int;
    padDown: int;
    padLeft: int;
    padRight: int;
    dilateW: int;
    dilateH: int;
    hasBias: bool = false;
    activationType: ActivationType = 0;
 }table FusedBatchNorm {
    epsilon: float = 0.00001;   // eg. epsilon=0.001
    momentum: float = 0.9;
    spatial: int = 1;
@@ -156,6 +216,31 @@ table CaffeBatchNorm {
    epsilon: float;   // eg. epsilon=0.001
 }

 // Attributes of the BiasGrad primitive.
 // `axis` is a vector of ints. NOTE(review): presumably the axes the gradient is reduced over — confirm.
 table BiasGrad {
    axis: [int];
 }


 // Attributes of the SoftmaxCrossEntropy primitive.
 // `axis` is a vector of ints. NOTE(review): presumably the axes softmax is applied along — confirm.
 table SoftmaxCrossEntropy {
    axis: [int];
 }


 // Attributes of the PoolingGrad primitive: pooling mode, window, stride, padding and rounding.
 // `format` defaults to 0 and `global` defaults to false; the other fields have no explicit default.
 // NOTE(review): presumably mirrors the forward Pooling table field-for-field — confirm.
 table PoolingGrad {
    format: Format = 0;
    poolingMode: PoolMode;
    global: bool = false;
    windowW: int;
    windowH: int;
    strideW: int;
    strideH: int;
    padMode: PadMode;
    padUp: int;
    padDown: int;
    padLeft: int;
    padRight: int;
    roundMode: RoundMode;
 }
 table Shape {
 }

@@ -286,7 +371,10 @@ table DeConv2D {
    hasBias: bool = false;
    activationType: ActivationType = 0;
 }

 // Attributes of the BNGradInput primitive.
 // NOTE(review): `eps` is presumably the batch-norm epsilon and `channels` the channel count — confirm.
 table BNGradInput {
    eps : float;
    channels: int;
 }
 table Scale {
    format: Format = 0;
 }
@@ -307,6 +395,17 @@ table Mul {
 table Div {
 }

 // Attribute-less primitive handled by the fp32 ArithmeticGrad CPU kernel (arithmetic_grad.cc).
 table AddGrad {
 }

 // Attribute-less primitive handled by the fp32 ArithmeticGrad CPU kernel (arithmetic_grad.cc).
 table SubGrad {
 }

 // Attribute-less primitive handled by the fp32 ArithmeticGrad CPU kernel (arithmetic_grad.cc).
 table MulGrad {
 }

 // Attribute-less primitive handled by the fp32 ArithmeticGrad CPU kernel (arithmetic_grad.cc).
 table DivGrad {
 }
 table RealDiv {
 }

@@ -389,7 +488,11 @@ table Power {
    scale: float;
    shift: float;
 }

 // Attributes of the PowerGrad primitive.
 // NOTE(review): presumably the same power/scale/shift parameters as the forward Power op — confirm.
 table PowerGrad {
    power: float;
    scale: float;
    shift: float;
 }
 table ArgMax {
    axis: int;
    outMaxValue: bool;
@@ -712,6 +815,10 @@ table SquaredDifference {
 table TupleGetItem {
 }

 // Attribute-less primitive. NOTE(review): presumably the momentum optimizer update step — confirm.
 table OptMomentum {
 }


 table Where{
 }

--- a/mindspore/lite/src/common/file_utils_ext.cc
+++ b/mindspore/lite/src/common/file_utils_ext.cc
@@ -0,0 +1,53 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <cmath>
 #include <cstddef>
 #include <iostream>
 #include "src/common/file_utils.h"
 #include "src/common/file_utils_ext.h"

 namespace mindspore {
 namespace lite {
 /**
  * Compares a produced buffer against reference data using the aggregate relative error
  * sum(|output - correct|) / sum(|correct|).
  *
  * @param output_data   values produced by the code under test
  * @param correct_data  reference values
  * @param data_size     number of floats in each buffer
  * @return 0 when the relative error is at most 1e-4, 1 otherwise. A NaN/Inf error (for example
  *         a NaN in either buffer) and a null buffer with data_size > 0 both count as failures.
  */
 static int CompareOutputRelativeData(float *output_data, float *correct_data, int data_size) {
  if (data_size > 0 && (output_data == nullptr || correct_data == nullptr)) {
    std::cout << "has accuracy error!\n" << "null data buffer" << "\n";
    return 1;
  }

  // relative error
  float diffSum = 0.0f;
  float sum = 0.0f;
  for (int i = 0; i < data_size; i++) {
    sum += std::abs(correct_data[i]);
    diffSum += std::abs(output_data[i] - correct_data[i]);
  }

  // 0 / 0 happens when both buffers are all zeros (or empty); that is an exact match, not an error.
  const float error = (sum == 0.0f && diffSum == 0.0f) ? 0.0f : diffSum / sum;

  // Negated "<=" on purpose: every comparison with NaN is false, so "error > 1e-4" would let a
  // NaN result slip through as a pass.
  if (!(error <= 1e-4)) {
    std::cout << "has accuracy error!\n" << error << "\n";
    return 1;
  }
  return 0;
 }

 /**
  * Reads reference floats from a file and compares them with output_data by relative error.
  *
  * @param output_data  values produced by the code under test; must hold at least as many floats
  *                     as the file contains
  * @param file_path    path of the binary file holding the reference floats
  * @return 0 when the data matches within tolerance; 1 on mismatch or when the file cannot be read.
  */
 int CompareRelativeOutput(float *output_data, std::string file_path) {
  size_t output_size = 0;  // stays 0 if ReadFile fails before writing it
  auto ground_truth = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size));
  if (ground_truth == nullptr) {
    std::cout << "read ground truth file failed : " << file_path << "\n";
    return 1;
  }
  size_t output_num = output_size / sizeof(float);
  std::cout << "output num : " << output_num << "\n";
  // NOTE(review): the buffer returned by ReadFile is never released here. Its ownership and
  // allocation method are not visible in this file — confirm and free it accordingly.
  return CompareOutputRelativeData(output_data, ground_truth, static_cast<int>(output_num));
 }
 }  // namespace lite
 }  // namespace mindspore
--- a/mindspore/lite/src/common/file_utils_ext.h
+++ b/mindspore/lite/src/common/file_utils_ext.h
@@ -0,0 +1,28 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
 #define MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
 #include <string>


 namespace mindspore {
 namespace lite {
 // Compares output_data with the float data stored in the file at file_path using an aggregate
 // relative-error check. Returns 0 when the data matches within tolerance and 1 otherwise.
 int CompareRelativeOutput(float *output_data, std::string file_path);

 }  // namespace lite
 }  // namespace mindspore
 #endif  // MINDSPORE_LITE_COMMON_FILE_UTILS_EXT_H_
--- a/mindspore/lite/src/lite_kernel.h
+++ b/mindspore/lite/src/lite_kernel.h
@@ -64,7 +64,7 @@ class LiteKernel {
  LiteKernel() = default;
  explicit LiteKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                      const std::vector<lite::tensor::Tensor *> &outputs)
      : opParameter(parameter), inputs_(inputs), outputs_(outputs) {
      : opParameter(parameter), inputs_(inputs), outputs_(outputs), train_mode(false) {
    this->in_kernel_.clear();
    this->out_kernel_.clear();
  }
@@ -77,7 +77,10 @@ class LiteKernel {
  virtual int Run() { return -1; }

  std::string Name() { return this->name; }

  virtual void train() { train_mode = true; }
  virtual bool is_train() { return train_mode == true; }
  virtual void eval() { train_mode = false; }
  virtual bool is_eval() { return train_mode == false; }
  void set_name(const std::string &name) { this->name = name; }

  schema::PrimitiveType type() { return (schema::PrimitiveType)this->opParameter->type_; }
@@ -117,6 +120,7 @@ class LiteKernel {
  std::vector<lite::tensor::Tensor *> outputs_;
  std::vector<LiteKernel *> in_kernel_;
  std::vector<LiteKernel *> out_kernel_;
  bool train_mode;
 };

 class SubGraphKernel : public LiteKernel {
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.cc
@@ -0,0 +1,110 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/activation_grad.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/runtime_api.h"
 #include "include/errorcode.h"

 using mindspore::lite::KernelRegistrar;
 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::ActivationGradType_HSWISH;
 using mindspore::schema::ActivationGradType_LEAKY_RELU;
 using mindspore::schema::ActivationGradType_RELU;
 using mindspore::schema::ActivationGradType_RELU6;
 using mindspore::schema::PrimitiveType_ActivationGrad;

 namespace mindspore::kernel {
 int ActivationGradCPUKernel::Init() {
    outputs_[0]->set_shape(inputs_[0]->shape());
    return RET_OK;
 }

 // Nothing is recomputed on resize; always reports success.
 int ActivationGradCPUKernel::ReSize() {
  return RET_OK;
 }

 /**
  * Runs the gradient routine selected by type_ over the whole tensor.
  *
  * Both inputs and the output are read as float buffers; the element count comes from the
  * first input. task_id is not used, so every call processes all elements.
  *
  * @return RET_OK on success; RET_ERROR for an unsupported type_ or when the routine fails.
  */
 int ActivationGradCPUKernel::DoActivation(int task_id) {
  auto *first_in = reinterpret_cast<float *>(inputs_.at(0)->Data());
  auto *second_in = reinterpret_cast<float *>(inputs_.at(1)->Data());
  auto *result = reinterpret_cast<float *>(outputs_.at(0)->Data());
  auto count = inputs_.at(0)->ElementsNum();

  auto status = RET_OK;
  switch (type_) {
    case schema::ActivationGradType_RELU:
      status = ReluGrad(first_in, second_in, count, result);
      break;
    case schema::ActivationGradType_RELU6:
      status = Relu6Grad(first_in, second_in, count, result);
      break;
    case schema::ActivationGradType_LEAKY_RELU:
      status = LReluGrad(first_in, second_in, count, result, alpha_);
      break;
    case schema::ActivationGradType_SIGMOID:
      status = SigmoidGrad(first_in, second_in, count, result);
      break;
    case schema::ActivationGradType_TANH:
      status = TanhGrad(first_in, second_in, count, result);
      break;
    case schema::ActivationGradType_HSWISH:
      status = HSwishGrad(first_in, second_in, count, result);
      break;
    case schema::ActivationGradType_HSIGMOID:
      status = HSigmoidGrad(first_in, second_in, count, result);
      break;
    default:
      MS_LOG(ERROR) << "Activation type error";
      return RET_ERROR;
  }
  // Any non-OK status from the routine is collapsed to RET_ERROR.
  return (status == RET_OK) ? RET_OK : RET_ERROR;
 }

 int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) {
  auto activationGrad_kernel = reinterpret_cast<ActivationGradCPUKernel *>(cdata);
  auto error_code = activationGrad_kernel->DoActivation(task_id);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "ActivationGradRun error task_id[" << task_id << "] error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
 }

 int ActivationGradCPUKernel::Run() {
  int error_code = LiteBackendParallelLaunch(ActivationGradRun, this, thread_count_);
  if (error_code != RET_OK) {
    MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]";
    return RET_ERROR;
  }
  return RET_OK;
 }

 /**
  * Factory registered for PrimitiveType_ActivationGrad on CPU / float32.
  *
  * @param inputs       input tensors handed to the kernel
  * @param outputs      output tensors handed to the kernel
  * @param opParameter  op parameter block; must not be null
  * @param ctx          unused here
  * @param desc         kernel key; expected to describe ActivationGrad
  * @return the initialised kernel, or nullptr when the parameter is null, allocation fails or
  *         Init() fails (the partially built kernel is deleted in that case).
  */
 kernel::LiteKernel *CpuActivationGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                       const std::vector<lite::tensor::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::Context *ctx,
                                                       const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_ActivationGrad);
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "opParameter is nullptr";
    return nullptr;
  }
  // nothrow new reports failure by returning nullptr, so it needs a real check, not only an assert.
  auto *kernel = new (std::nothrow) ActivationGradCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new ActivationGradCPUKernel failed";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "InferShape kernel failed, name: " << opParameter->name_
                  << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    // Same policy as the arithmetic-grad creator: never hand back a kernel whose Init() failed.
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_ActivationGrad, CpuActivationGradFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h
@@ -0,0 +1,50 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 #include "src/runtime/kernel/arm/opclib/activation_grad.h"

 namespace mindspore::kernel {
 // fp32 CPU kernel for the ActivationGrad primitive. The activation type and alpha are copied
 // from the ActivationGradParameter passed at construction.
 class ActivationGradCPUKernel : public LiteKernel {
 public:
  // param is reinterpreted as ActivationGradParameter; when it is null the defaults
  // (type 0, alpha 0) are kept.
  explicit ActivationGradCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
                                   const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(param, inputs, outputs), thread_count_(1), type_(0), alpha_(0.0f) {
    ActivationGradParameter *param_act_grad = reinterpret_cast<ActivationGradParameter *>(param);
    if (param_act_grad != nullptr) {
      type_ = param_act_grad->type_;
      alpha_ = param_act_grad->alpha_;
    }
  }
  ~ActivationGradCPUKernel() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
  // Runs the selected gradient routine; invoked once per launched task.
  int DoActivation(int task_id);

 private:
  // Number of tasks Run() launches. It was never assigned before; 1 is used because
  // DoActivation() does not split the work by task_id, so more tasks would only repeat it.
  int thread_count_;
  int type_;     // ActivationGradType value selecting the gradient routine
  float alpha_;  // passed to LReluGrad for LEAKY_RELU
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ACTIVATION_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.cc
@@ -0,0 +1,285 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/fp32/reduce_grad.h"
 #include "src/runtime/kernel/arm/fp32/arithmetic_grad.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {
 namespace {
 // Tensor counts InferShape() requires: inputs are read as dy, x1, x2 and outputs as dx1, dx2.
 constexpr int kArithGradOpInputNum = 3;
 constexpr int kArithGradOpOutputNum = 2;
 }  // namespace

 // Initialisation is entirely shape inference; its status is returned unchanged.
 int ArithmeticGradCPUKernel::Init() {
  return InferShape();
 }

 int ArithmeticGradCPUKernel::InferShape() {
  if (inputs_.size() != kArithGradOpInputNum) {
    MS_LOG(ERROR) << "The number of input must be " << kArithGradOpInputNum;
    return RET_ERROR;
  }
  if (outputs_.size() != kArithGradOpOutputNum) {
    MS_LOG(ERROR) << "The number of output must be " << kArithGradOpOutputNum;
    return RET_ERROR;
  }
  auto dy = inputs_[0];
  auto x1 = inputs_[1];
  auto x2 = inputs_[2];
  auto dx1 = outputs_[0];
  auto dx2 = outputs_[1];

  MS_ASSERT(dy != nullptr);
  MS_ASSERT(x1 != nullptr);
  MS_ASSERT(x2 != nullptr);
  MS_ASSERT(dx1 != nullptr);
  MS_ASSERT(dx2 != nullptr);

  auto inShape0 = x1->shape();
  auto inShape1 = x2->shape();
  auto outShape = dy->shape();

  if ((type() == PrimitiveType_AddGrad) || (type() == PrimitiveType_SubGrad)) {
    arithmeticParameter_->ndim_ = outShape.size();
    auto fillDimNum0 = outShape.size() - inShape0.size();
    auto fillDimNum1 = outShape.size() - inShape1.size();
    int j0 = 0;
    int j1 = 0;
    for (unsigned int i = 0; i < outShape.size(); i++) {
      arithmeticParameter_->in_shape0_[i] = (i < fillDimNum0) ? 1 : inShape0[j0++];
      arithmeticParameter_->in_shape1_[i] = (i < fillDimNum1) ? 1 : inShape1[j1++];
      arithmeticParameter_->out_shape_[i] = outShape[i];
    }
  } else {
    // if (inShape0.size() < inShape1.size())
    if (dx1->ElementsNum() < dx2->ElementsNum()) {
      arithmeticParameter_->ndim_ = inShape1.size();
      if (type() == PrimitiveType_MulGrad)
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul2L;
      else if (type() == PrimitiveType_DivGrad)
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv2L;

      auto fillDimNum = inShape1.size() - inShape0.size();  // This will not work for batch!
      int j = 0;
      for (unsigned int i = 0; i < inShape1.size(); i++) {
        if (i < fillDimNum) {
          arithmeticParameter_->in_shape1_[i] = 1;
        } else {
          arithmeticParameter_->in_shape1_[i] = inShape0[j++];
        }
        arithmeticParameter_->in_shape0_[i] = inShape1[i];
        arithmeticParameter_->out_shape_[i] = outShape[i];
      }
    } else if (dx2->ElementsNum() < dx1->ElementsNum()) {  // if (inShape0.size() > inShape1.size())
      arithmeticParameter_->ndim_ = inShape0.size();
      if (type() == PrimitiveType_MulGrad)
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul1L;
      else if (type() == PrimitiveType_DivGrad)
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv1L;
      arithmeticParameter_->broadcasting_ = true;
      arithmeticParameter_->ndim_ = inShape0.size();
      int j = 0;
      auto fillDimNum = inShape0.size() - inShape1.size();
      for (unsigned int i = 0; i < inShape0.size(); i++) {
        if (i < fillDimNum) {
          arithmeticParameter_->in_shape1_[i] = 1;
        } else {
          arithmeticParameter_->in_shape1_[i] = inShape1[j++];
        }
        arithmeticParameter_->in_shape0_[i] = inShape0[i];
        arithmeticParameter_->out_shape_[i] = outShape[i];
      }
    } else {
      arithmeticParameter_->broadcasting_ = false;
      for (unsigned int i = 0; i < inShape0.size(); i++) {
        arithmeticParameter_->in_shape1_[i] = inShape1[i];
        arithmeticParameter_->in_shape0_[i] = inShape0[i];
        arithmeticParameter_->out_shape_[i] = outShape[i];
      }
    }
    tile_data0 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
    MS_ASSERT(tile_data0 != nullptr);
    tile_data1 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
    MS_ASSERT(tile_data1 != nullptr);
    if (type() == PrimitiveType_DivGrad) {
      tile_data2 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()];
      MS_ASSERT(tile_data2 != nullptr);
    }
  }

  dx1->set_shape(x1->shape());
  dx2->set_shape(x2->shape());
  // outTensor->set_shape(out_shape);
  dx1->set_data_type(dy->data_type());
  dx2->set_data_type(dy->data_type());
  return RET_OK;
 }

 // Add gradient: each output receives dy. An output with the same element count as dy gets a
 // plain copy; otherwise dy is reduced with ReduceSumByAxes into that output's padded shape.
 void ArithmeticGradCPUKernel::ArithmeticGradAdd(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                int dx2_size) {
  if (dx1_size != dy_size) {
    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx1, arithmeticParameter_->in_shape0_,
                    arithmeticParameter_->ndim_);
  } else {
    memcpy(dx1, dy, dy_size * sizeof(float));
  }

  if (dx2_size != dy_size) {
    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx2, arithmeticParameter_->in_shape1_,
                    arithmeticParameter_->ndim_);
  } else {
    memcpy(dx2, dy, dy_size * sizeof(float));
  }
 }

 // Sub gradient: dx1 receives dy (copied or reduced, as for Add); dx2 receives the negated
 // dy, reduced first when its element count differs from dy's.
 void ArithmeticGradCPUKernel::ArithmeticGradSub(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                int dx2_size) {
  if (dx1_size != dy_size) {
    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx1, arithmeticParameter_->in_shape0_,
                    arithmeticParameter_->ndim_);
  } else {
    memcpy(dx1, dy, dy_size * sizeof(float));
  }

  if (dx2_size != dy_size) {
    // Reduce into dx2 first, then flip the sign in place.
    ReduceSumByAxes(dy, arithmeticParameter_->out_shape_, dx2, arithmeticParameter_->in_shape1_,
                    arithmeticParameter_->ndim_);
    for (int k = 0; k < dx2_size; ++k) {
      dx2[k] = -dx2[k];
    }
    return;
  }

  for (int k = 0; k < dx2_size; ++k) {
    dx2[k] = -dy[k];
  }
 }

 // Mul gradient without broadcasting: dx2 = dy * x1 and dx1 = dy * x2, element-wise over dy_size.
 void ArithmeticGradCPUKernel::ArithmeticGradMul(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                int dx2_size) {
  auto *lhs = reinterpret_cast<float *>(inputs_[1]->Data());
  auto *rhs = reinterpret_cast<float *>(inputs_[2]->Data());

  ElementMul(dy, lhs, dx2, dy_size);
  ElementMul(dy, rhs, dx1, dy_size);
 }

 // Mul gradient chosen by InferShape() when dx2 has fewer elements than dx1.
 // dx2: dy * x1 is formed in tile_data0 and then reduced from in_shape0_ to in_shape1_.
 // dx1: produced by BroadcastMul(dy, x2, ...) with the tile buffers as scratch.
 void ArithmeticGradCPUKernel::ArithmeticGradMul1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                  int dx2_size) {
  auto *lhs = reinterpret_cast<float *>(inputs_[1]->Data());
  auto *rhs = reinterpret_cast<float *>(inputs_[2]->Data());

  ElementMul(dy, lhs, tile_data0, dy_size);
  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx2, arithmeticParameter_->in_shape1_,
                  arithmeticParameter_->ndim_);

  // tile_data0 is free again here and is reused as scratch; the result goes straight to dx1.
  BroadcastMul(dy, rhs, tile_data0, tile_data1, dx1, dy_size, arithmeticParameter_);
 }

 // Mul gradient chosen by InferShape() when dx1 has fewer elements than dx2.
 // dx1: dy * x2 is formed in tile_data0 and then reduced from in_shape0_ to in_shape1_.
 // dx2: produced by BroadcastMul(dy, x1, ...) with the tile buffers as scratch.
 void ArithmeticGradCPUKernel::ArithmeticGradMul2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                  int dx2_size) {
  auto *lhs = reinterpret_cast<float *>(inputs_[1]->Data());
  auto *rhs = reinterpret_cast<float *>(inputs_[2]->Data());

  ElementMul(dy, rhs, tile_data0, dy_size);
  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx1, arithmeticParameter_->in_shape1_,
                  arithmeticParameter_->ndim_);

  // tile_data0 is free again here and is reused as scratch; the result goes straight to dx2.
  BroadcastMul(dy, lhs, tile_data0, tile_data1, dx2, dy_size, arithmeticParameter_);
 }

 // Div gradient without broadcasting: dx1 comes from ElementDiv(dy, x2) and dx2 from
 // ElementMulAndDivNegSquare(dy, x1, x2), both over dy_size elements.
 void ArithmeticGradCPUKernel::ArithmeticGradDiv(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                int dx2_size) {
  auto *numerator = reinterpret_cast<float *>(inputs_[1]->Data());
  auto *denominator = reinterpret_cast<float *>(inputs_[2]->Data());

  ElementDiv(dy, denominator, dx1, dy_size);
  ElementMulAndDivNegSquare(dy, numerator, denominator, dx2, dy_size);
 }

 // Div gradient chosen by InferShape() when dx2 has fewer elements than dx1.
 // Statement order matters: both output buffers are used as scratch before their final values
 // are written (dx2 is finalised by the reduce + negate, dx1 by the last BroadcastDiv).
 void ArithmeticGradCPUKernel::ArithmeticGradDiv1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                  int dx2_size) {
  auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data());
  auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data());

  ElementMul(x2_data, x2_data, dx2, dx2_size);  // dx2 temporarily holds x2 * x2
  ElementMul(x1_data, dy, dx1, dy_size);  // dx1 temporarily holds x1 * dy
  BroadcastDiv(dx1, dx2, tile_data0, tile_data1, tile_data2, dy_size,
               arithmeticParameter_);  // quotient of the two temporaries is written to tile_data2
  // Reduce the quotient into dx2 (overwriting the x2 * x2 temporary), then negate it in place.
  ReduceSumByAxes(tile_data2, arithmeticParameter_->in_shape0_, dx2, arithmeticParameter_->in_shape1_,
                  arithmeticParameter_->ndim_);
  for (int i = 0; i < dx2_size; i++) dx2[i] = -dx2[i];
  // ReduceNegSumPrefix(tile_data2, dy_size, dx2, dx2_size); //then reduce into dx2

  // broadcasting x2: dy / x2 overwrites the x1 * dy temporary with the final dx1
  BroadcastDiv(dy, x2_data, tile_data0, tile_data1, dx1, dy_size, arithmeticParameter_);  // broadcast directly to dx1
 }

 // Div gradient chosen by InferShape() when dx1 has fewer elements than dx2.
 void ArithmeticGradCPUKernel::ArithmeticGradDiv2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2,
                                                  int dx2_size) {
  auto *numerator = reinterpret_cast<float *>(inputs_[1]->Data());
  auto *denominator = reinterpret_cast<float *>(inputs_[2]->Data());

  // dx1 = dy/x2: divide into the temp buffer, then reduce it into dx1.
  ElementDiv(dy, denominator, tile_data0, dy_size);
  ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx1, arithmeticParameter_->in_shape1_,
                  arithmeticParameter_->ndim_);

  // dx2 = -dy*x1/(x2*x2): the broadcast numerator lands in tile_data2 first.
  BroadcastMul(dy, numerator, tile_data0, tile_data1, tile_data2, dy_size, arithmeticParameter_);
  ElementDivNegSquare(tile_data2, denominator, dx2, dy_size);
 }

 // Nothing is recomputed on resize; always reports success.
 int ArithmeticGradCPUKernel::ReSize() {
  return RET_OK;
 }

 // Reads dy from inputs_[0] and the two output buffers, then calls the gradient routine
 // currently stored in arithmetic_grad_. Always returns RET_OK.
 int ArithmeticGradCPUKernel::Run() {
  auto *grad_in = reinterpret_cast<float *>(inputs_[0]->Data());
  auto *grad_out1 = reinterpret_cast<float *>(outputs_[0]->Data());
  auto *grad_out2 = reinterpret_cast<float *>(outputs_[1]->Data());

  size_t grad_in_count = inputs_.at(0)->ElementsNum();
  size_t grad_out1_count = outputs_.at(0)->ElementsNum();
  size_t grad_out2_count = outputs_[1]->ElementsNum();

  // Dispatch through the pointer-to-member set by the constructor / InferShape().
  (this->*arithmetic_grad_)(grad_in, grad_in_count, grad_out1, grad_out1_count, grad_out2, grad_out2_count);
  return RET_OK;
 }

 /**
  * Factory registered for MulGrad / AddGrad / SubGrad / DivGrad on CPU / float32.
  *
  * @param inputs       input tensors handed to the kernel
  * @param outputs      output tensors handed to the kernel
  * @param opParameter  op parameter block; must not be null
  * @param ctx          unused here
  * @param desc         unused here
  * @return the initialised kernel, or nullptr when the parameter is null, allocation fails or
  *         Init() fails (the partially built kernel is deleted in that case).
  */
 kernel::LiteKernel *CpuArithmeticGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                       const std::vector<lite::tensor::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::Context *ctx,
                                                       const kernel::KernelKey &desc) {
  MS_EXCEPTION_IF_NULL(opParameter);
  if (opParameter == nullptr) {
    return nullptr;
  }
  // nothrow new reports failure by returning nullptr, so it needs a real check, not only an assert.
  auto *kernel = new (std::nothrow) ArithmeticGradCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new ArithmeticGradCPUKernel failed";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_MulGrad, CpuArithmeticGradFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_AddGrad, CpuArithmeticGradFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SubGrad, CpuArithmeticGradFp32KernelCreator)
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_DivGrad, CpuArithmeticGradFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h
@@ -0,0 +1,90 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
 #include "schema/model_generated.h"
 #include "ir/anf.h"

 using mindspore::schema::PrimitiveType_AddGrad;
 using mindspore::schema::PrimitiveType_DivGrad;
 using mindspore::schema::PrimitiveType_MulGrad;
 using mindspore::schema::PrimitiveType_SubGrad;

 namespace mindspore::kernel {

 class ArithmeticGradCPUKernel;

 // fp32 CPU kernel computing the gradients of Add / Sub / Mul / Div with respect to both operands.
 // The gradient routine is chosen from the primitive type at construction and may be replaced by
 // a broadcast-aware variant in InferShape().
 // NOTE(review): the class owns raw scratch buffers but does not define copy operations, so a
 // copy would free them twice — it presumably is only ever used through pointers; confirm.
 class ArithmeticGradCPUKernel : public LiteKernel {
  // Signature shared by all gradient routines: (dy, dy_size, dx1, dx1_size, dx2, dx2_size).
  typedef void (ArithmeticGradCPUKernel::*ArithmeticGradOperation)(float *, int, float *, int, float *, int);

 public:
  explicit ArithmeticGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                   const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs),
        arithmeticParameter_(reinterpret_cast<ArithmeticParameter *>(parameter)),
        arithmetic_grad_(nullptr),
        tile_data0(nullptr),
        tile_data1(nullptr),
        tile_data2(nullptr) {
    switch (type()) {
      case PrimitiveType_MulGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul;  // this will be adjusted in InferShape
        break;
      case PrimitiveType_AddGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradAdd;
        break;
      case PrimitiveType_SubGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradSub;
        break;
      case PrimitiveType_DivGrad:
        arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv;  // this will be adjusted in InferShape
        break;
      default:
        // arithmetic_grad_ stays nullptr (it used to be left uninitialized on this path).
        MS_LOG(ERROR) << "Error Operator type " << parameter->type_;
        break;
    }
  }
  ~ArithmeticGradCPUKernel() override {
    // delete[] on a null pointer is a no-op, so no guards are needed.
    delete[] tile_data0;
    delete[] tile_data1;
    delete[] tile_data2;
  }
  void InitKernel(const CNodePtr &kernel_node);

  int Init() override;
  // Fills arithmeticParameter_, selects the gradient routine and allocates the scratch buffers.
  int InferShape();
  int ReSize() override;
  int Run() override;

 private:
  void ArithmeticGradAdd(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradSub(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradMul(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradMul1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradMul2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradDiv(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradDiv1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  void ArithmeticGradDiv2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, int dx2_size);
  ArithmeticParameter *arithmeticParameter_;  // the op parameter viewed as ArithmeticParameter; not owned here
  ArithmeticGradOperation arithmetic_grad_;   // routine invoked by Run()
  // Scratch buffers allocated in InferShape() for Mul/Div; tile_data2 only for DivGrad.
  float *tile_data0;
  float *tile_data1;
  float *tile_data2;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_ARITHMETIC_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.cc
@@ -0,0 +1,115 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <vector>
 #include "src/runtime/kernel/arm/fp32/bias_grad.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"


 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::schema::PrimitiveType_BiasGrad;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {
 // Derives the output tensor's shape and data type from the single input tensor:
 // every dimension except the last is collapsed to 1, the last one is kept.
 // Returns RET_ERROR unless there is exactly one input and one output.
 int BiasGradCPUKernel::InferShape() {
  if (this->inputs_.size() != 1) {
    MS_LOG(ERROR) << "BiasGrad should have one input";
    return RET_ERROR;
  }
  if (this->outputs_.size() != 1) {
    MS_LOG(ERROR) << "BiasGrad should have one output";
    return RET_ERROR;
  }

  auto *src_tensor = inputs_.front();
  auto *dst_tensor = outputs_.front();
  MS_ASSERT(src_tensor != nullptr);
  MS_ASSERT(dst_tensor != nullptr);

  // Start from a copy of the input shape and flatten all leading axes.
  auto reduced_shape = src_tensor->shape();
  const int rank = reduced_shape.size();
  for (int axis = 0; axis + 1 < rank; ++axis) {
    reduced_shape[axis] = 1;
  }

  dst_tensor->set_shape(reduced_shape);
  dst_tensor->set_data_type(src_tensor->data_type());
  return RET_OK;
 }

 int BiasGradCPUKernel::Init() {
  MS_ASSERT(InferShape() == RET_OK);

  auto dims = inputs_[0]->shape();
  bias_param->ndim_ = dims.size();
  for (unsigned int i = 0; i < bias_param->ndim_; i++) {
    bias_param->in_shape0_[i] = dims[i];
    bias_param->out_shape_[i] = 1;  // 1 dimension for N,H,W,
  }
  bias_param->out_shape_[bias_param->ndim_ - 1] = dims[bias_param->ndim_ - 1];
  for (int i = bias_param->ndim_; i < 4; i++) {
    bias_param->in_shape0_[i] = 0;
    bias_param->out_shape_[i] = 0;
  }
  return RET_OK;
 }


 // Resize hook: nothing is recomputed here, the function always returns 0.
 int BiasGradCPUKernel::ReSize() {
  return 0;
 }

 int BiasGradCPUKernel::Run() {
  auto in = reinterpret_cast<float *>(inputs_.at(0)->Data());
  auto out = reinterpret_cast<float *>(outputs_.at(0)->Data());
  // size_t data_size = inputs_.at(0)->ElementsNum();

  size_t nhw_size = 1;
  size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1];  // C in NHWC
  for (unsigned int i = 0; i < bias_param->ndim_ - 1; i++) nhw_size *= bias_param->in_shape0_[i];

  size_t total_size = channels * nhw_size;
  for (size_t c = 0; c < channels; ++c) {
    out[c] = 0;
    for (size_t offset = 0; offset < total_size; offset += channels) {
      out[c] += in[offset + c];
    }
  }

  return RET_OK;
 }


 // Factory for the fp32 CPU BiasGrad kernel.
 // Builds a BiasGradCPUKernel from the given tensors/parameter and runs Init().
 // Returns the initialised kernel, or nullptr when the parameter is null, the
 // allocation fails, or Init() reports an error (the kernel is deleted then).
 kernel::LiteKernel *CpuBiasGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                  const std::vector<lite::tensor::Tensor *> &outputs,
                                                  OpParameter *opParameter, const lite::Context *ctx,
                                                  const kernel::KernelKey &desc) {
  // Checked explicitly instead of only asserted: the kernel dereferences the parameter in Init().
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "BiasGrad opParameter is nullptr";
    return nullptr;
  }
  MS_ASSERT(desc.type == schema::PrimitiveType_BiasGrad);

  auto *kernel = new (std::nothrow) BiasGradCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    // nothrow new signals failure with nullptr; never call Init() on it.
    MS_LOG(ERROR) << "new BiasGradCPUKernel failed";
    return nullptr;
  }

  auto ret = kernel->Init();
  if (RET_OK != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Ties CpuBiasGradFp32KernelCreator to the (kCPU, kNumberTypeFloat32, PrimitiveType_BiasGrad) key.
 // REG_KERNEL is defined outside this file (presumably in src/kernel_registry.h).
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BiasGrad, CpuBiasGradFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h
@@ -0,0 +1,46 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"

 namespace mindspore::kernel {
 // CPU kernel for the BiasGrad primitive. The op parameter handed to the
 // constructor is additionally kept as an ArithmeticParameter pointer, which
 // the implementation uses to store shape information.
 class BiasGradCPUKernel : public LiteKernel {
 public:
  explicit BiasGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                             const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs), bias_param(reinterpret_cast<ArithmeticParameter *>(parameter)) {}
  ~BiasGradCPUKernel() override = default;

  // Kernel life-cycle entry points.
  int Init() override;
  int ReSize() override;
  int Run() override;

  // Shape inference helper (not part of the LiteKernel overrides).
  int InferShape();

 private:
  // Non-owning view of the constructor's `parameter` argument.
  ArithmeticParameter *bias_param;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.cc
@@ -0,0 +1,115 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <algorithm>
 #include <vector>
 #include "schema/model_generated.h"
 #include "src/kernel_factory.h"
 #include "src/runtime/kernel/arm/fp32/bngrad_input.h"
 #include "src/runtime//kernel/arm/opclib/batch_norm.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 // using mindspore::lite::REG_OP;
 using mindspore::schema::PrimitiveType_BNGradInput;

 namespace mindspore::kernel {
 int BNGradInputCPUKernel::Init() {
  auto bn_param = reinterpret_cast<bnParameter *>(opParameter);
  workspace_size = 5 * bn_param->channels;
  workspace = new float[workspace_size];

  if (2 != this->inputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
    return RET_ERROR;
  }
  if (1 != this->outputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has one output";
    return RET_ERROR;
  }
  auto *input_tensor = inputs_.at(0);
  // auto *weight_tensor = inputs_.at(1);
  auto *out_tensor = outputs_.at(0);
  auto in_shape = input_tensor->shape();
  out_tensor->set_shape(in_shape);
  out_tensor->set_data_type(input_tensor->data_type());
  return RET_OK;
 }

 int BNGradInputCPUKernel::ReSize() { return RET_OK; }

 /*
 according to https://wiseodd.github.io/techblog/2016/07/04/batchnorm
 */

 int BNGradInputCPUKernel::Run() {
  // std::cout << "run succ" << std::endl;
  auto *input_x = inputs_.at(0);
  auto *input_yt = inputs_.at(1);
  auto *input_scale = inputs_.at(2);
  auto *output_grad = outputs_.at(0);
  // Tensor *bias = input[5];
  auto bn_param = reinterpret_cast<bnParameter *>(opParameter);
  int batch = bn_param->batch;
  int channels = bn_param->channels;
  int spatial = bn_param->spatial;
  float eps = bn_param->eps;
  std::fill(workspace, workspace + workspace_size, 0.f);

  float *mean = workspace;
  float *variance = mean + channels;
  float *mean_delta = variance + channels;
  float *variance_delta = mean_delta + channels;
  float *mean_add_delta = variance_delta + channels;

  float *x = reinterpret_cast<float *>(input_x->Data());
  float *yt = reinterpret_cast<float *>(input_yt->Data());
  float *scale = reinterpret_cast<float *>(input_scale->Data());
  float *out = reinterpret_cast<float *>(output_grad->Data());

  std::copy(yt, yt + batch * channels * spatial, out);
  meanVar(x, batch, spatial, channels, mean, variance);
  scaleBias(scale, batch, channels, spatial, out);
  meanDelta(out, spatial, channels, eps, variance, mean_delta);
  varianceDelta(x, out, mean, variance, batch, channels, spatial, eps, variance_delta);
  meanAdd(x, mean, variance_delta, batch, channels, spatial, mean_add_delta, mean_delta);
  NormalizeDelta(x, mean, variance, mean_delta, variance_delta, batch, channels, eps, spatial, out);
  return RET_OK;
 }

 kernel::LiteKernel *CpuBNGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                    const std::vector<lite::tensor::Tensor *> &outputs,
                                                    OpParameter *opParameter, const lite::Context *ctx,
                                                    const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_BNGradInput);
  //  parameter->name = opDef.name()->str().data();
  //  parameter->type = opDef.attr_type();
  auto *kernel = new (std::nothrow) BNGradInputCPUKernel(opParameter, inputs, outputs);
  MS_ASSERT(kernel != nullptr);
  auto ret = kernel->Init();
  if (RET_OK != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
  }
  return kernel;
 }

 // Ties CpuBNGradInputFp32KernelCreator to the (kCPU, kNumberTypeFloat32, PrimitiveType_BNGradInput) key.
 // REG_KERNEL is defined outside this file (presumably in one of the included registry headers).
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BNGradInput, CpuBNGradInputFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/bngrad_input.h
@@ -0,0 +1,41 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 namespace mindspore::kernel {
 // CPU kernel for the BNGradInput primitive. Owns a float scratch buffer that
 // is allocated as an array in Init() and released in the destructor.
 class BNGradInputCPUKernel : public LiteKernel {
 public:
  explicit BNGradInputCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {}
  // The buffer comes from new[], so it must be released with delete[].
  ~BNGradInputCPUKernel() override { delete[] workspace; }

  int Init() override;
  int ReSize() override;
  int Run() override;

 private:
  // Initialised so that destroying a kernel whose Init() never allocated is well defined.
  float *workspace = nullptr;
  int workspace_size = 0;  // number of floats in workspace
 };
 }  // namespace mindspore::kernel
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BNGRAD_INPUT_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.cc
@@ -0,0 +1,156 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/convolution_grad_filter.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/pack.h"
 #include "src/runtime/kernel/arm/opclib/pack_ext.h"
 #include "src/runtime/kernel/arm/opclib/fp32/gemm.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2DGradFilter;

 namespace mindspore::kernel {
 int ConvolutionGradFilterCPUKernel::Init() {
  // dy is in input 0
  // x is in input 1
  // dw is output 0

  if (2 != this->inputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
    return RET_ERROR;
  }
  if (1 != this->outputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has one output";
    return RET_ERROR;
  }

  auto *input_tensor = inputs_.at(1);
  MS_ASSERT(input_tensor != nullptr);
  auto *dy = inputs_.at(0);
  MS_ASSERT(dy != nullptr);
  auto *weight_tensor = outputs_.at(0);
  MS_ASSERT(weight_tensor != nullptr);

  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
  conv_param->output_batch_ = this->inputs_.at(0)->shape().at(kNHWC_N);
  conv_param->input_batch_ = this->inputs_.at(1)->shape().at(kNHWC_N);
  conv_param->input_h_ = this->inputs_.at(1)->shape().at(kNHWC_H);
  conv_param->input_w_ = this->inputs_.at(1)->shape().at(kNHWC_W);
  // assume OutCh|kh|kw|In
  conv_param->input_channel_ = this->inputs_.at(1)->shape().at(kNHWC_C);
  conv_param->output_channel_ = this->outputs_.at(0)->shape().at(kNHWC_N);

  int ws_size = conv_param->output_h_ * conv_param->output_w_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
                conv_param->input_channel_ / conv_param->group_;

  workspace = new float[ws_size];

  int output_w = 0;
  int output_h = 0;
  output_h = dy->shape()[kNHWC_H];
  output_w = dy->shape()[kNHWC_W];

  std::vector<int> out_shape(4);
  out_shape.at(0) = conv_param->output_channel_;
  out_shape.at(1) = conv_param->kernel_h_;
  out_shape.at(2) = conv_param->kernel_w_;
  out_shape.at(3) = conv_param->input_channel_ / conv_param->group_;

  // weight is output
  weight_tensor->set_shape(out_shape);
  weight_tensor->set_data_type(input_tensor->data_type());

  conv_param->output_h_ = output_h;
  conv_param->output_w_ = output_w;

  return RET_OK;
 }

 // Resize hook: nothing is recomputed here, the function always returns 0.
 int ConvolutionGradFilterCPUKernel::ReSize() {
  return 0;
 }

 // Accumulates the weight gradient dw (output 0) from dy (input 0) and x (input 1).
 // dw is cleared first; then, for each batch item and group, the matching slice
 // of x is expanded into the scratch buffer with im2row_hwc and combined with
 // the dy slice through gemm, which writes into the group's region of dw.
 int ConvolutionGradFilterCPUKernel::Run() {
  auto *param = reinterpret_cast<ConvParameter *>(opParameter);
  auto *dy_tensor = inputs_.at(0);
  auto *x_tensor = inputs_.at(1);
  auto *dw_tensor = outputs_.at(0);

  auto *x_data = reinterpret_cast<float *>(x_tensor->Data());
  auto *dy_data = reinterpret_cast<float *>(dy_tensor->Data());
  auto *dw_data = reinterpret_cast<float *>(dw_tensor->Data());

  // Geometry cached in the convolution parameter.
  int weight_count = dw_tensor->ElementsNum();
  int in_ch = param->input_channel_;
  int in_h = param->input_h_;
  int in_w = param->input_w_;
  int kernel_h = param->kernel_h_;
  int kernel_w = param->kernel_w_;
  int batch = param->output_batch_;
  int out_ch = param->output_channel_;
  int groups = param->group_;
  int out_h = param->output_h_;
  int out_w = param->output_w_;

  // gemm dimensions
  int m = out_h * out_w;
  int n = kernel_h * kernel_w * in_ch / groups;
  int k = out_ch / groups;

  // start from an all-zero gradient
  memset(dw_data, 0, dw_tensor->Size());

  for (int b = 0; b < batch; ++b) {
    for (int g = 0; g < groups; ++g) {
      float *mat_a = dy_data + (b * groups) * m * k + g * (out_ch / groups);
      float *mat_b = workspace;
      float *mat_c = dw_data + g * weight_count / groups;
      float *im = x_data + (b * groups) * (in_ch / groups) * in_h * in_w + g * (in_ch / groups);

      im2row_hwc(im, mat_b, param);
      gemm(1, 1, k, n, m, 1, mat_a, out_ch, mat_b, m, 1, mat_c, n);
    }
  }

  return RET_OK;
 }

 // Factory for the fp32 CPU Conv2DGradFilter kernel.
 // Returns the initialised kernel, or nullptr when the parameter is null, the
 // allocation fails, or Init() reports an error (the kernel is deleted then).
 kernel::LiteKernel *CpuConvGradFilterFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                        const std::vector<lite::tensor::Tensor *> &outputs,
                                                        OpParameter *opParameter, const lite::Context *ctx,
                                                        const kernel::KernelKey &desc) {
  // Checked explicitly instead of only asserted: Init() dereferences the parameter.
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Conv2DGradFilter opParameter is nullptr";
    return nullptr;
  }
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DGradFilter);

  auto *kernel = new (std::nothrow) ConvolutionGradFilterCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    // nothrow new signals failure with nullptr; never call Init() on it.
    MS_LOG(ERROR) << "new ConvolutionGradFilterCPUKernel failed";
    return nullptr;
  }

  auto ret = kernel->Init();
  if (RET_OK != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Ties CpuConvGradFilterFp32KernelCreator to the (kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradFilter) key.
 // REG_KERNEL is defined outside this file (presumably in src/kernel_registry.h).
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradFilter, CpuConvGradFilterFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h
@@ -0,0 +1,41 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 namespace mindspore::kernel {
 // CPU kernel for the Conv2DGradFilter primitive. Owns a float scratch buffer
 // that is allocated as an array in Init() and released in the destructor.
 class ConvolutionGradFilterCPUKernel : public LiteKernel {
 public:
  explicit ConvolutionGradFilterCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                          const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {}
  // The buffer comes from new[], so it must be released with delete[].
  ~ConvolutionGradFilterCPUKernel() override { delete[] workspace; }

  int Init() override;
  int ReSize() override;
  int Run() override;

 private:
  // Initialised so that destroying a kernel whose Init() bailed out early is well defined.
  float *workspace = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_FILTER_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.cc
@@ -0,0 +1,136 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/convolution_grad_input.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/pack.h"
 #include "src/runtime/kernel/arm/opclib/pack_ext.h"
 #include "src/runtime/kernel/arm/opclib/fp32/gemm.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::schema::PrimitiveType_Conv2DGradInput;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {
 int ConvolutionGradInputCPUKernel::Init() {
  if (2 != this->inputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs";
    return RET_ERROR;
  }
  if (1 != this->outputs_.size()) {
    MS_LOG(ERROR) << "Conv2d Grad should has one output";
    return RET_ERROR;
  }

  auto *dy_tensor = inputs_.at(kInputIndex);
  MS_ASSERT(dy_tensor != nullptr);
  auto *weight_tensor = inputs_.at(kWeightIndex);
  MS_ASSERT(weight_tensor != nullptr);
  auto *dx_tensor = outputs_.at(kOutputIndex);
  MS_ASSERT(dx_tensor != nullptr);

  auto conv_param = reinterpret_cast<ConvParameter *>(opParameter);
  conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)];
  conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)];

  conv_param->input_h_ = dx_tensor->shape()[(kNHWC_H)];
  conv_param->input_w_ = dx_tensor->shape()[(kNHWC_W)];

  // assume OutCh|kh|kw|In
  conv_param->input_channel_ = dx_tensor->shape()[(kNHWC_C)];
  conv_param->output_channel_ = weight_tensor->shape()[(kNHWC_N)];

  // TBD
  conv_param->output_h_ = dy_tensor->shape()[kNHWC_H];
  conv_param->output_w_ = dy_tensor->shape()[kNHWC_W];

  int ws_size = conv_param->output_h_ * conv_param->output_w_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
                conv_param->input_channel_ / conv_param->group_;

  workspace = new float[ws_size];
  return 0;
 }

 // Resize hook: nothing is recomputed here, the function always returns 0.
 int ConvolutionGradInputCPUKernel::ReSize() {
  return 0;
 }

 // Computes the input gradient dx (output 0) from dy (input 0) and the weights
 // (input 1). dx is cleared first; then, for each batch item and group, gemm
 // writes the product of the dy slice and the group's weights into the scratch
 // buffer, and col2im_hwc folds that buffer into the matching slice of dx.
 int ConvolutionGradInputCPUKernel::Run() {
  auto *param = reinterpret_cast<ConvParameter *>(opParameter);
  auto *dy_tensor = inputs_.at(0);
  auto *w_tensor = inputs_.at(1);
  auto *dx_tensor = outputs_.at(0);

  auto *dy_data = reinterpret_cast<float *>(dy_tensor->Data());
  auto *w_data = reinterpret_cast<float *>(w_tensor->Data());
  auto *dx_data = reinterpret_cast<float *>(dx_tensor->Data());

  // Geometry cached in the convolution parameter.
  int weight_count = w_tensor->ElementsNum();
  int in_ch = param->input_channel_;
  int in_h = param->input_h_;
  int in_w = param->input_w_;
  int kernel_h = param->kernel_h_;
  int kernel_w = param->kernel_w_;
  int batch = param->output_batch_;
  int out_ch = param->output_channel_;
  int groups = param->group_;
  int out_h = param->output_h_;
  int out_w = param->output_w_;

  // gemm dimensions
  int m = out_h * out_w;
  int n = kernel_w * kernel_h * in_ch / groups;
  int k = out_ch / groups;

  // start from an all-zero gradient
  memset(dx_data, 0, sizeof(float) * batch * in_ch * in_h * in_w);

  for (int b = 0; b < batch; ++b) {
    for (int g = 0; g < groups; ++g) {
      float *mat_a = dy_data + (b * groups) * m * k + g * (out_ch / groups);
      float *mat_b = w_data + g * weight_count / groups;
      float *mat_c = workspace;
      float *dx_slice = dx_data + (b * groups) * (in_ch / groups) * in_h * in_w + g * (in_ch / groups);

      gemm(0, 0, m, n, k, 1, mat_a, out_ch, mat_b, n, 0, mat_c, n);
      col2im_hwc(mat_c, dx_slice, param);
    }
  }

  return 0;
 }

 // Factory for the fp32 CPU Conv2DGradInput kernel.
 // Returns the initialised kernel, or nullptr when the parameter is null, the
 // allocation fails, or Init() reports an error (the kernel is deleted then).
 kernel::LiteKernel *CpuConvGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                       const std::vector<lite::tensor::Tensor *> &outputs,
                                                       OpParameter *opParameter, const lite::Context *ctx,
                                                       const kernel::KernelKey &desc) {
  // Checked explicitly instead of only asserted: Init() dereferences the parameter.
  if (opParameter == nullptr) {
    MS_LOG(ERROR) << "Conv2DGradInput opParameter is nullptr";
    return nullptr;
  }
  MS_ASSERT(desc.type == schema::PrimitiveType_Conv2DGradInput);

  auto *kernel = new (std::nothrow) ConvolutionGradInputCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    // nothrow new signals failure with nullptr; never call Init() on it.
    MS_LOG(ERROR) << "new ConvolutionGradInputCPUKernel failed";
    return nullptr;
  }

  auto ret = kernel->Init();
  if (0 != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Ties CpuConvGradInputFp32KernelCreator to the (kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradInput) key.
 // REG_KERNEL is defined outside this file (presumably in src/kernel_registry.h).
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_Conv2DGradInput, CpuConvGradInputFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h
@@ -0,0 +1,41 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 namespace mindspore::kernel {
 // CPU kernel for the Conv2DGradInput primitive. Owns a float scratch buffer
 // that is allocated as an array in Init() and released in the destructor.
 class ConvolutionGradInputCPUKernel : public LiteKernel {
 public:
  explicit ConvolutionGradInputCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                         const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {}
  // The buffer comes from new[], so it must be released with delete[].
  ~ConvolutionGradInputCPUKernel() override { delete[] workspace; }

  int Init() override;
  int ReSize() override;
  int Run() override;

 private:
  // Initialised so that destroying a kernel whose Init() bailed out early is well defined.
  float *workspace = nullptr;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_CONVOLUTION_GRAD_INPUT_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.cc
@@ -0,0 +1,78 @@

 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/fp32/opt_momentum.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::schema::PrimitiveType_OptMomentum;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;

 namespace mindspore::kernel {

 // Resize hook: nothing is recomputed here, the function always returns 0.
 int OptMomentumCPUKernel::ReSize() {
  return 0;
 }

 // Applies one momentum update in place.
 //   input 0: weight       (updated)
 //   input 1: accumulate   (updated)
 //   input 2: learning rate, first element is used
 //   input 3: gradient
 //   input 4: momentum, first element is used
 // Per element: accumulate = accumulate * moment + gradient;
 //              weight    -= accumulate * learning_rate.
 // Returns RET_ERROR unless there are exactly 5 inputs, no outputs, and inputs
 // 0, 1 and 3 hold the same number of elements.
 int OptMomentumCPUKernel::Run() {
  const bool arity_ok = (inputs_.size() == 5) && outputs_.empty();
  if (!arity_ok) {
    MS_LOG(ERROR) << "OptMomentumCPUKernel error input output size!";
    return RET_ERROR;
  }

  if (inputs_[0]->ElementsNum() != inputs_[1]->ElementsNum() ||
      inputs_[0]->ElementsNum() != inputs_[3]->ElementsNum()) {
    MS_LOG(ERROR) << "error input data size!";
    return RET_ERROR;
  }

  float *weight = reinterpret_cast<float *>(inputs_[0]->Data());
  float *accumulate = reinterpret_cast<float *>(inputs_[1]->Data());
  float learning_rate = reinterpret_cast<float *>(inputs_[2]->Data())[0];
  float *gradient = reinterpret_cast<float *>(inputs_[3]->Data());
  float moment = reinterpret_cast<float *>(inputs_[4]->Data())[0];

  size_t elem_num = inputs_[0]->ElementsNum();
  for (size_t idx = 0; idx != elem_num; idx++) {
    accumulate[idx] = accumulate[idx] * moment + gradient[idx];
    weight[idx] -= accumulate[idx] * learning_rate;
  }
  return RET_OK;
 }

 // Init hook: there is nothing to prepare, the function always returns 0.
 int OptMomentumCPUKernel::Init() {
  return 0;
 }

 // Factory for the fp32 CPU OptMomentum kernel.
 // Returns the initialised kernel, or nullptr when the allocation fails or
 // Init() reports an error (the kernel is deleted then).
 kernel::LiteKernel *CpuOptMomentumFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                     const std::vector<lite::tensor::Tensor *> &outputs,
                                                     OpParameter *opParameter, const lite::Context *ctx,
                                                     const kernel::KernelKey &desc) {
  MS_ASSERT(desc.type == schema::PrimitiveType_OptMomentum);
  auto *kernel = new (std::nothrow) OptMomentumCPUKernel(opParameter, inputs, outputs);
  if (kernel == nullptr) {
    // nothrow new signals failure with nullptr; never call Init() on it.
    MS_LOG(ERROR) << "new OptMomentumCPUKernel failed";
    return nullptr;
  }

  auto ret = kernel->Init();
  if (0 != ret) {
    // opParameter is not validated on entry, so only read it for the log when it is set.
    if (opParameter != nullptr) {
      MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                    << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    } else {
      MS_LOG(ERROR) << "Init kernel failed, opParameter is nullptr";
    }
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Ties CpuOptMomentumFp32KernelCreator to the (kCPU, kNumberTypeFloat32, PrimitiveType_OptMomentum) key.
 // REG_KERNEL is defined outside this file (presumably in src/kernel_registry.h).
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_OptMomentum, CpuOptMomentumFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/opt_momentum.h
@@ -0,0 +1,40 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 // The #ifndef and #define must name the same macro, otherwise the guard never
 // takes effect and a second inclusion redefines the class.
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 namespace mindspore::kernel {
 // CPU kernel for the OptMomentum primitive. It keeps no state of its own; all
 // data is read from and written to the tensors passed at construction.
 class OptMomentumCPUKernel : public LiteKernel {
 public:
  explicit OptMomentumCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {}
  ~OptMomentumCPUKernel() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.cc
@@ -0,0 +1,195 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/pooling_grad.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/arm/opclib/fp32/pooling.h"
 #include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_PoolingGrad;

 namespace mindspore::kernel {
 #if 0
 // NOTE: this definition is compiled out by the enclosing `#if 0` / `#endif` pair.
 //
 // Derives the pooled output width/height from the input size and stores them in output_w/output_h.
 //   - pad_mode_ == PADMODE_SAME: output = ceil(input / stride)
 //   - any other pad mode:        output = ceil((input + pads - window + 1) / stride)
 // Always returns RET_OK.
 // NOTE(review): if the stride/pad/window fields are integers, the division truncates before ceil()
 // is applied, so ceil() has no effect — confirm the field types before re-enabling this code.
 int PoolingGradCPUKernel::TfPadding(int input_w, int input_h, int &output_w, int &output_h) {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);

  auto stride_w = pool_param->stride_w_;
  auto stride_h = pool_param->stride_h_;
  auto window_w = pool_param->window_w_;
  auto window_h = pool_param->window_h_;
  auto pad_up = pool_param->pad_u_;
  auto pad_down = pool_param->pad_d_;
  auto pad_left = pool_param->pad_l_;
  auto pad_right = pool_param->pad_r_;
  if (pool_param->pad_mode_ == PADMODE_SAME) {
    output_w = ceil(input_w / stride_w);
    output_h = ceil(input_h / stride_h);
  } else {
    output_w = ceil((input_w + pad_left + pad_right - window_w + 1) / stride_w);
    output_h = ceil((input_h + pad_up + pad_down - window_h + 1) / stride_h);
  }
  return RET_OK;
 }

 // NOTE: this definition is compiled out by the enclosing `#if 0` / `#endif` pair.
 //
 // Derives the pooled output width/height and stores them in output_w/output_h.
 // Always returns RET_OK.
 int PoolingGradCPUKernel::CaffePadding(int input_w, int input_h, int &output_w, int &output_h) {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);

  auto round_mode = pool_param->round_mode_;
  auto stride_w = pool_param->stride_w_;
  auto stride_h = pool_param->stride_h_;
  auto window_w = pool_param->window_w_;
  auto window_h = pool_param->window_h_;
  auto pad_up = pool_param->pad_u_;
  auto pad_down = pool_param->pad_d_;
  auto pad_left = pool_param->pad_l_;
  auto pad_right = pool_param->pad_r_;
  // The `&& false` makes the floor branch unreachable and the `|| true` below makes the ceil branch
  // unconditional, so round_mode is effectively ignored and the final `else` can never run.
  if (round_mode == ROUNDMODE_FLOOR && false) {
    output_w = floor((input_w + pad_left + pad_right - window_w) / stride_w + 1);
    output_h = floor((input_h + pad_up + pad_down - window_h) / stride_h + 1);
  } else if (round_mode == ROUNDMODE_CEIL || true) {
    // NOTE(review): with integer operands the division truncates before ceil() — confirm field types.
    output_w = ceil((input_w + pad_left + pad_right - window_w) / stride_w + 1);
    output_h = ceil((input_h + pad_up + pad_down - window_h) / stride_h + 1);
  } else {
    MS_LOG(ERROR) << "round mode not support.";
  }

  // With leading padding, drop the last output position when its window would start at or beyond
  // input + leading pad.
  if (pad_left > 0 || pad_up > 0) {
    if ((output_w - 1) * stride_w >= input_w + pad_left) {
      --output_w;
    }
    if ((output_h - 1) * stride_h >= input_h + pad_up) {
      --output_h;
    }
  }
  return RET_OK;
 }

 // NOTE: this definition is compiled out by the enclosing `#if 0` / `#endif` pair.
 //
 // Derives the pooled output width/height for ROUNDMODE_FLOOR and stores them in output_w/output_h.
 // For ROUNDMODE_CEIL or any other round mode only an error is logged: output_w/output_h are left
 // untouched and the function still returns RET_OK.
 int PoolingGradCPUKernel::OnnxPadding(int input_w, int input_h, int &output_w, int &output_h) {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *> (opParameter);

  auto round_mode = pool_param->round_mode_;
  auto stride_w = pool_param->stride_w_;
  auto stride_h = pool_param->stride_h_;
  auto window_w = pool_param->window_w_;
  auto window_h = pool_param->window_h_;
  auto pad_up = pool_param->pad_u_;
  auto pad_down = pool_param->pad_d_;
  auto pad_left = pool_param->pad_l_;
  auto pad_right = pool_param->pad_r_;
  if (round_mode == ROUNDMODE_FLOOR) {
    output_w = floor((input_w + pad_left + pad_right - window_w) / stride_w + 1);
    output_h = floor((input_h + pad_up + pad_down - window_h) / stride_h + 1);
  } else if (round_mode == ROUNDMODE_CEIL) {
    MS_LOG(ERROR) << "RoundMode_CEIL mode not support.";
  } else {
    MS_LOG(ERROR) << "OnnxPadding round mode not support.";
  }
  return RET_OK;
 }
 #endif

 int PoolingGradCPUKernel::Init() {
  // InferShape():
  // auto *in_tensor = reinterpret_cast<float *>(inputs_.at(0)->Data());
  // auto *x_tensor = reinterpret_cast<float *>(inputs_.at(1)->Data());

  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter);

  auto in_shape = inputs_.at(0)->shape();
  int input_h = in_shape.at(1);
  int input_w = in_shape.at(2);

  if (pool_param->global_) {
    pool_param->window_w_ = input_w;
    pool_param->window_h_ = input_h;
  }

  // Emir -- here I assume we get the outputshape in the output tensor
  auto *out_tensor = outputs_.front();
  auto out_shape = out_tensor->shape();

 #if 0
  int output_w = 0, output_h = 0;
  auto fmk_type = pool_param->fmk_type_;
  switch (fmk_type) {
    case lite::FmkType_TF:
      break;
    case lite::FmkType_CAFFE:
      CaffePadding(input_w, input_h, output_w, output_h);
      break;
    case lite::FmkType_ONNX:
      OnnxPadding(input_w, input_h, output_w, output_h);
      break;
    case lite::FmkType_MS:
      break;
    case lite::FmkType_TFLITE:
      TfPadding(input_w, input_h, output_w, output_h);
      break;
    default:
      MS_LOG(ERROR) << "Not support this framework.";
  }
  std::vector<int> out_shape{in_tensor->shape()};
  out_shape.at(1) = output_h;
  out_shape.at(2) = output_w;
 #endif
  out_tensor->set_shape(out_shape);
  out_tensor->set_data_type(inputs_.at(0)->data_type());
  return RET_OK;
 }

 // No resize work is done for this kernel; the call always reports success.
 int PoolingGradCPUKernel::ReSize() {
  return RET_OK;
 }

 int PoolingGradCPUKernel::Run() {
  PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter);
  auto input_ptr = reinterpret_cast<float *>(inputs_.at(0)->Data());
  auto output_ptr = reinterpret_cast<float *>(outputs_.at(0)->Data());

  if (pool_param->max_pooling_) {
    auto ind = reinterpret_cast<int *>(inputs_.at(1)->Data());
    MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param);
  } else {
    AvgPoolingGrad(input_ptr, output_ptr, pool_param);
  }
  return RET_OK;
 }

 // Creates and initialises a PoolingGradCPUKernel.
 //
 // inputs/outputs: tensor lists forwarded to the kernel constructor.
 // opParameter:    operator parameter forwarded to the kernel constructor; must not be null.
 // ctx:            unused here.
 // desc:           kernel key; only checked by assertion.
 //
 // Returns the new kernel, or nullptr when allocation fails or Init() does not return RET_OK
 // (the kernel is deleted in the latter case).
 kernel::LiteKernel *CpuPoolingGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                     const std::vector<lite::tensor::Tensor *> &outputs,
                                                     OpParameter *opParameter, const lite::Context *ctx,
                                                     const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_PoolingGrad);

  auto *kernel = new (std::nothrow) PoolingGradCPUKernel(opParameter, inputs, outputs);
  // new (std::nothrow) yields nullptr on failure; check explicitly rather than relying on an assert.
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new PoolingGradCPUKernel failed";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (RET_OK != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Registration entry: hands CpuPoolingGradFp32KernelCreator to REG_KERNEL together with the key
 // (kCPU, kNumberTypeFloat32, PrimitiveType_PoolingGrad). The macro is defined outside this file;
 // presumably it makes the creator discoverable for that key — confirm against its definition.
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_PoolingGrad, CpuPoolingGradFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling_grad.h
@@ -0,0 +1,50 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"

 namespace mindspore::kernel {
 using mindspore::schema::PadMode;
 using mindspore::schema::PoolMode;
 using mindspore::schema::QuantType;
 using mindspore::schema::RoundMode;

 // CPU kernel class for the pooling gradient.
 // The constructor only forwards its arguments to LiteKernel; Init/ReSize/Run are defined out of line.
 class PoolingGradCPUKernel : public LiteKernel {
 public:
  explicit PoolingGradCPUKernel(OpParameter *parameter,
                                const std::vector<lite::tensor::Tensor *> &inputs,
                                const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {}

  ~PoolingGradCPUKernel() override = default;

  // Output-size helpers that are currently not declared (kept for reference):
  // int TfPadding(int input_w, int input_h, int &output_w, int &output_h);
  // int CaffePadding(int input_w, int input_h, int &output_w, int &output_h);
  // int OnnxPadding(int input_w, int input_h, int &output_w, int &output_h);

  int Init() override;
  int ReSize() override;
  int Run() override;

 private:
  // Starts at zero; not referenced by any code visible next to this class.
  uint8_t data_shape_ = 0;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POOLING_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.cc
@@ -0,0 +1,67 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/power_grad.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"

 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_PowerGrad;

 namespace mindspore::kernel {
 // Nothing to prepare: the constructor already captured the power parameters. Always succeeds.
 int PowerGradCPUKernel::Init() {
  return RET_OK;
 }

 // No resize work is done for this kernel; the call always reports success.
 int PowerGradCPUKernel::ReSize() {
  return RET_OK;
 }

 // Computes the power gradient into output 0 in three steps:
 //   1. Power(...) over input 1 with exponent (power_ - 1) and the kernel's scale_/shift_
 //      (presumably (scale * x + shift)^(power - 1) — confirm against opclib's Power),
 //   2. element-wise multiplication with input 0,
 //   3. multiplication of every element by scale_ * power_.
 // The element count is taken from input 0. Always returns RET_OK.
 int PowerGradCPUKernel::Run() {
  float *upstream = reinterpret_cast<float *>(inputs_.at(0)->Data());
  float *base = reinterpret_cast<float *>(inputs_.at(1)->Data());
  float *result = reinterpret_cast<float *>(outputs_.at(0)->Data());
  auto count = inputs_.at(0)->ElementsNum();

  Power(base, result, count, power_ - 1, scale_, shift_);
  ElementMul(result, upstream, result, count);

  const float factor = scale_ * power_;
  int idx = 0;
  while (idx < count) {
    result[idx] *= factor;
    ++idx;
  }
  return RET_OK;
 }

 // Creates and initialises a PowerGradCPUKernel.
 //
 // inputs/outputs: tensor lists forwarded to the kernel constructor.
 // opParameter:    operator parameter forwarded to the kernel constructor; must not be null.
 // ctx:            unused here.
 // desc:           kernel key; only checked by assertion.
 //
 // Returns the new kernel, or nullptr when allocation fails or Init() does not return RET_OK
 // (the kernel is deleted in the latter case).
 kernel::LiteKernel *CpuPowerGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                   const std::vector<lite::tensor::Tensor *> &outputs,
                                                   OpParameter *opParameter, const lite::Context *ctx,
                                                   const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_PowerGrad);
  auto *kernel = new (std::nothrow) PowerGradCPUKernel(opParameter, inputs, outputs);
  // new (std::nothrow) yields nullptr on failure; never call Init() through a null pointer.
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new PowerGradCPUKernel failed";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Registration entry: hands CpuPowerGradFp32KernelCreator to REG_KERNEL together with the key
 // (kCPU, kNumberTypeFloat32, PrimitiveType_PowerGrad). The macro is defined outside this file;
 // presumably it makes the creator discoverable for that key — confirm against its definition.
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_PowerGrad, CpuPowerGradFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power_grad.h
@@ -0,0 +1,49 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"
 #include "src/runtime/kernel/arm/opclib/power.h"

 namespace mindspore::kernel {
 // CPU kernel class for the power gradient.
 // The constructor reinterprets `param` as a PowerParameter and caches its power_/scale_/shift_
 // values, so `param` must point to a PowerParameter.
 class PowerGradCPUKernel : public LiteKernel {
 public:
  PowerGradCPUKernel(OpParameter *param, const std::vector<lite::tensor::Tensor *> &inputs,
                     const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(param, inputs, outputs),
        power_(reinterpret_cast<PowerParameter *>(param)->power_),
        scale_(reinterpret_cast<PowerParameter *>(param)->scale_),
        shift_(reinterpret_cast<PowerParameter *>(param)->shift_) {}

  ~PowerGradCPUKernel() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;

 private:
  float power_;  // copied from PowerParameter::power_
  float scale_;  // copied from PowerParameter::scale_
  float shift_;  // copied from PowerParameter::shift_
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_POWER_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.cc
@@ -0,0 +1,145 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h"
 #include "src/runtime/kernel/arm/opclib/fp32/softmax.h"
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"

 using mindspore::kernel::KERNEL_ARCH::kCPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_SoftmaxCrossEntropy;

 namespace mindspore::kernel {

 // No resize work is done for this kernel; the call always reports success.
 int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ReSize() {
  return RET_OK;
 }

 // Reduces per-sample values to the mean negative log value of the labelled class.
 //
 // labels: one class index per sample (batch_size_ entries); each must be in [0, number_of_classes_).
 // losses: batch_size_ x number_of_classes_ values, row-major (Run() passes the Softmax output here).
 // output: output[0] receives -sum(log(losses[i][labels[i]])) / batch_size_.
 //
 // Invalid labels are reported through MS_LOG(EXCEPTION) and never used as an index.
 void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const int *labels, const float *losses,
                                                                       float *output) const {
  float total_loss = 0;
  for (int i = 0; i < param->batch_size_; ++i) {
    if (labels[i] < 0) {
      MS_LOG(EXCEPTION) << "label value must >= 0";
    }
    size_t label = labels[i];
    // Valid class indices are 0 .. number_of_classes_ - 1, so the bound itself is already invalid.
    if (label >= param->number_of_classes_) {
      MS_LOG(EXCEPTION) << "error label input!";
    } else {
      total_loss -= logf(losses[i * param->number_of_classes_ + label]);
    }
  }
  output[0] = total_loss / param->batch_size_;
 }

 // Writes the per-element gradient for every sample.
 //
 // labels: one class index per sample (batch_size_ entries); each must be in [0, number_of_classes_).
 // losses: batch_size_ x number_of_classes_ values, row-major (Run() passes the Softmax output here).
 // output: same layout as losses; receives (losses - 1) / batch_size_ at the labelled class and
 //         losses / batch_size_ everywhere else.
 //
 // Invalid labels are reported through MS_LOG(EXCEPTION).
 void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses,
                                                                    float *output) const {
  size_t row_start = 0;
  for (int i = 0; i < param->batch_size_; ++i) {
    if (labels[i] < 0) {
      MS_LOG(EXCEPTION) << "label value must >= 0";
    }
    size_t label = labels[i];
    // Valid class indices are 0 .. number_of_classes_ - 1, so the bound itself is already invalid.
    if (label >= param->number_of_classes_) {
      MS_LOG(EXCEPTION) << "error label input!";
    }
    for (size_t j = 0; j < param->number_of_classes_; ++j) {
      size_t index = row_start + j;
      if (j == label) {
        output[index] = (losses[index] - 1) / param->batch_size_;
      } else {
        output[index] = losses[index] / param->batch_size_;
      }
    }
    row_start += param->number_of_classes_;
  }
 }

 int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Run() {
  auto ins = reinterpret_cast<float *>(inputs_.at(0)->Data());
  auto labels = reinterpret_cast<int *>(inputs_.at(1)->Data());
  auto out = reinterpret_cast<float *>(outputs_.at(0)->Data());
  float *grads = NULL;
  if (is_train()) {  // outputs_.size() > 1)
    grads = reinterpret_cast<float *>(outputs_.at(0)->Data());
  }
  size_t data_size = inputs_.at(0)->ElementsNum();
  float *losses = new (std::nothrow) float[data_size];
  MS_ASSERT(losses != nullptr);
  std::fill(losses, losses + data_size, 0);

  MS_ASSERT(out != nullptr);
  MS_ASSERT(labels != nullptr);
  MS_ASSERT(ins != nullptr);

  SoftmaxParameter sm_params;
  sm_params.n_dim_ = param->n_dim_;
  sm_params.element_size_ = data_size;
  sm_params.axis_ = 1;
  for (int i = 0; i < 4; i++)  // softmax has only 4 params in shape
    sm_params.input_shape_[i] = param->input_shape_[i];
  float sum_data[sm_params.input_shape_[sm_params.axis_]];
  Softmax(ins, losses, sum_data, &sm_params);

  if (is_train()) {
    GradPostExecute(labels, losses, grads);
  } else {
    ForwardPostExecute(labels, losses, out);
  }
  return RET_OK;
 }

 int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Init() {
  auto dims = inputs_[0]->shape();
  param->n_dim_ = 2;
  param->number_of_classes_ = dims[1];
  param->batch_size_ = dims[0];
  for (unsigned int i = 0; i < dims.size(); i++) param->input_shape_[i] = dims[i];
  if (2 != this->inputs_.size()) {
    MS_LOG(ERROR) << "softmax entropy loss should have two inputs";
    return RET_ERROR;
  }
  auto *in0 = inputs_.front();
  if (in0 == nullptr) {
    MS_LOG(ERROR) << "softmax etropy loss in0 have no data";
    return RET_ERROR;
  }

  return RET_OK;
 }

 // Creates and initialises a SparseSoftmaxCrossEntropyWithLogitsCPUKernel.
 //
 // inputs/outputs: tensor lists forwarded to the kernel constructor.
 // opParameter:    operator parameter forwarded to the kernel constructor; must not be null.
 // ctx:            unused here.
 // desc:           kernel key; only checked by assertion.
 //
 // Returns the new kernel, or nullptr when allocation fails or Init() does not return RET_OK
 // (the kernel is deleted in the latter case).
 kernel::LiteKernel *CpuSoftmaxCrossEntropyFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs,
                                                             const std::vector<lite::tensor::Tensor *> &outputs,
                                                             OpParameter *opParameter, const lite::Context *ctx,
                                                             const kernel::KernelKey &desc) {
  MS_ASSERT(opParameter != nullptr);
  MS_ASSERT(desc.type == schema::PrimitiveType_SoftmaxCrossEntropy);
  auto *kernel = new (std::nothrow) SparseSoftmaxCrossEntropyWithLogitsCPUKernel(opParameter, inputs, outputs);
  // new (std::nothrow) yields nullptr on failure; check explicitly rather than relying on an assert.
  if (kernel == nullptr) {
    MS_LOG(ERROR) << "new SparseSoftmaxCrossEntropyWithLogitsCPUKernel failed";
    return nullptr;
  }
  auto ret = kernel->Init();
  if (RET_OK != ret) {
    MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: "
                  << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_));
    delete kernel;
    return nullptr;
  }
  return kernel;
 }

 // Registration entry: hands CpuSoftmaxCrossEntropyFp32KernelCreator to REG_KERNEL together with the
 // key (kCPU, kNumberTypeFloat32, PrimitiveType_SoftmaxCrossEntropy). The macro is defined outside
 // this file; presumably it makes the creator discoverable for that key — confirm against its definition.
 REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SoftmaxCrossEntropy, CpuSoftmaxCrossEntropyFp32KernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h
+++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_softmax_cross_entropy_with_logits.h
@@ -0,0 +1,50 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_

 #include <vector>
 #include "src/lite_kernel.h"
 #include "ir/anf.h"
 #include "src/runtime/kernel/arm/opclib/fp32/softmax_grad.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"

 namespace mindspore::kernel {

 // CPU kernel class for softmax cross entropy with integer class labels.
 // The constructor keeps a SoftmaxCrossEntropyParameter view of the operator parameter it is given,
 // so `parameter` must point to a SoftmaxCrossEntropyParameter.
 class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LiteKernel {
 public:
  explicit SparseSoftmaxCrossEntropyWithLogitsCPUKernel(OpParameter *parameter,
                                                        const std::vector<lite::tensor::Tensor *> &inputs,
                                                        const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs),
        param(reinterpret_cast<SoftmaxCrossEntropyParameter *>(parameter)) {}

  ~SparseSoftmaxCrossEntropyWithLogitsCPUKernel() override = default;

  int Init() override;
  int ReSize() override;
  int Run() override;

  // Post-processing steps applied to a float buffer (`losses`) and the integer labels;
  // both write their result into `output`.
  void ForwardPostExecute(const int *labels, const float *losses, float *output) const;
  void GradPostExecute(const int *labels, const float *losses, float *output) const;

 private:
  // Same object as the OpParameter passed to the constructor, viewed as the concrete type.
  SoftmaxCrossEntropyParameter *param;
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/activation_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/activation_grad.h
@@ -0,0 +1,88 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_

 #include <math.h>
 #include "src/runtime/kernel/arm/opclib/op_base.h"
 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic.h"
 #include "src/runtime/kernel/arm/opclib/errorcode.h"

 // Parameters of the activation-gradient operator.
 struct ActivationGradParameter {
  OpParameter op_parameter{};  // common operator header, value-initialised
  int type_{0};                // activation kind; 0 is the schema default (NO_ACTIVATION) instead of
                               // an indeterminate value
  float alpha_{0.01};          // default slope; presumably the `alpha` handed to LReluGrad — confirm
 };

 // ReLU gradient: dst[i] = src0[i] * (src1[i] > 0 ? 1 : 0).
 //
 // src0:   values to be gated (presumably the incoming gradient — confirm with callers).
 // src1:   values whose sign selects the gate.
 // length: number of elements in each buffer.
 // dst:    result buffer; may alias src0 or src1 because each element is read before it is written.
 //
 // Done in a single pass, like SigmoidGrad/TanhGrad, so no intermediate mask is stored in dst.
 // Returns OPCLIB_OK.
 inline int ReluGrad(float *src0, float *src1, int length, float *dst) {
  for (int i = 0; i < length; ++i) {
    const float gate = src1[i] > 0 ? 1.0f : 0.0f;
    dst[i] = src0[i] * gate;
  }
  return OPCLIB_OK;
 }

 // ReLU6 gradient: dst[i] = src0[i] * gate, where gate is 0 for src1[i] < 0 or src1[i] > 6 and
 // 1 otherwise (both boundaries 0 and 6 map to 1).
 //
 // src0:   values to be gated (presumably the incoming gradient — confirm with callers).
 // src1:   values that select the gate.
 // length: number of elements in each buffer.
 // dst:    result buffer; may alias src0 or src1 because each element is read before it is written.
 //
 // Done in a single pass, like SigmoidGrad/TanhGrad, so no intermediate mask is stored in dst.
 // Returns OPCLIB_OK.
 inline int Relu6Grad(float *src0, float *src1, int length, float *dst) {
  for (int i = 0; i < length; ++i) {
    float gate;
    if (src1[i] < 0) {
      gate = 0;
    } else {
      gate = src1[i] > 6.0f ? 0.0f : 1.0f;
    }
    dst[i] = src0[i] * gate;
  }
  return OPCLIB_OK;
 }

 // Leaky-ReLU gradient: dst[i] = src0[i] * (src1[i] > 0 ? 1 : alpha).
 //
 // src0:   values to be scaled (presumably the incoming gradient — confirm with callers).
 // src1:   values whose sign selects the slope.
 // length: number of elements in each buffer.
 // dst:    result buffer; may alias src0 or src1 because each element is read before it is written.
 // alpha:  slope used where src1[i] is not greater than 0.
 //
 // Done in a single pass, like SigmoidGrad/TanhGrad, so no intermediate mask is stored in dst.
 // Returns OPCLIB_OK.
 inline int LReluGrad(float *src0, float *src1, int length, float *dst, float alpha) {
  for (int i = 0; i < length; ++i) {
    const float slope = src1[i] > 0.0f ? 1.0f : alpha;
    dst[i] = src0[i] * slope;
  }
  return OPCLIB_OK;
 }

 // Element-wise dst[k] = src0[k] * (src1[k] * (1 - src1[k])) for k in [0, length).
 // Returns OPCLIB_OK.
 inline int SigmoidGrad(float *src0, float *src1, int length, float *dst) {
  int k = 0;
  while (k < length) {
    const float y = src1[k];
    const float local_slope = y * (1.0f - y);
    dst[k] = src0[k] * local_slope;
    ++k;
  }
  return OPCLIB_OK;
 }

 // Element-wise dst[k] = (1 - src1[k]^2) * src0[k] for k in [0, length).
 // Returns OPCLIB_OK.
 inline int TanhGrad(float *src0, float *src1, int length, float *dst) {
  int k = 0;
  while (k < length) {
    const float y = src1[k];
    const float local_slope = 1.0f - (y * y);
    dst[k] = local_slope * src0[k];
    ++k;
  }
  return OPCLIB_OK;
 }

 // Element-wise dst[k] = slope(src1[k]) * src0[k], where slope is
 //   1            for src1[k] >  3,
 //   0            for src1[k] < -3,
 //   (2x + 3) / 6 otherwise.
 // Returns OPCLIB_OK.
 inline int HSwishGrad(float *src0, float *src1, int length, float *dst) {
  for (int k = 0; k < length; ++k) {
    float slope;
    if (src1[k] > 3.0f) {
      slope = 1.0f;
    } else if (src1[k] < -3.0f) {
      slope = 0.0f;
    } else {
      slope = (2.0f * src1[k] + 3.0f) / 6.0f;
    }
    dst[k] = slope * src0[k];
  }
  return OPCLIB_OK;
 }

 // Hard-sigmoid gradient: dst[i] = slope(src1[i]) * src0[i].
 //
 // Hard-sigmoid is relu6(x + 3) / 6: flat (0) below -3, flat (1) above 3 and linear with slope 1/6
 // in between. Its derivative is therefore 0 in BOTH saturated regions and 1/6 otherwise.
 //
 // src0:   values to be scaled (presumably the incoming gradient — confirm with callers).
 // src1:   values compared against the +/-3 saturation points.
 // length: number of elements in each buffer.
 // dst:    result buffer.
 // Returns OPCLIB_OK.
 inline int HSigmoidGrad(float *src0, float *src1, int length, float *dst) {
  for (int i = 0; i < length; ++i) {
    const bool saturated = src1[i] > 3.0f || src1[i] < -3.0f;
    const float slope = saturated ? 0.0f : 1.0f / 6.0f;
    dst[i] = slope * src0[i];
  }
  return OPCLIB_OK;
 }

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_ACTIVATION_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.cc
@@ -0,0 +1,120 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <algorithm>
 #include <cmath>
 #include "src/runtime/kernel/arm/opclib/batch_norm.h"

 // Column sums of a row-major `size` x `ch` matrix: out[c] = sum over rows of in[row * ch + c].
 // `out` is reset to zero first.
 static void sumSpatialBatch(const float *in, int size, int ch, float *out) {
  std::fill(out, out + ch, 0.f);
  for (int row = 0; row < size; ++row) {
    const float *line = in + row * ch;
    int col = 0;
    while (col < ch) {
      out[col] += line[col];
      ++col;
    }
  }
 }

 // Scales `output` in place: it is treated as (batch * size) rows of n values and every value in
 // column c is multiplied by scales[c].
 void scaleBias(const float *scales, int batch, int n, int size, float *output) {
  const int rows = batch * size;
  for (int row = 0; row < rows; ++row) {
    const int base = row * n;
    for (int col = 0; col < n; ++col) {
      output[base + col] *= scales[col];
    }
  }
 }

 // Per-channel normalisation: out = (x - mean[f]) / sqrt(variance[f] + eps).
 //
 // x, out:         batch * spatial * filters values; the channel index f varies fastest.
 // mean, variance: `filters` values each.
 // eps:            added to the variance inside the square root, which is the form the gradient
 //                 routines in this file (meanDelta, varianceDelta, NormalizeDelta) differentiate.
 void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial,
                float *out) {
  for (int b = 0; b < batch; ++b) {
    for (int i = 0; i < spatial; ++i) {
      for (int f = 0; f < filters; ++f) {
        int index = b * filters * spatial + i * filters + f;
        out[index] = (x[index] - mean[f]) / std::sqrt(variance[f] + eps);
      }
    }
  }
 }

 // Per-channel reduction: scale_updates[f] = sum over all batch * size rows of delta * x_norm.
 // Both inputs hold batch * size rows of n values; scale_updates is reset to zero first.
 void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates) {
  std::fill(scale_updates, scale_updates + n, 0.f);
  for (int sample = 0; sample < batch; ++sample) {
    for (int pos = 0; pos < size; ++pos) {
      const int base = (sample * size + pos) * n;
      for (int chan = 0; chan < n; ++chan) {
        scale_updates[chan] += delta[base + chan] * x_norm[base + chan];
      }
    }
  }
 }

 // Per-channel mean and variance over batch * spatial rows of `ch` values.
 //
 // in:   batch * spatial * ch values; the channel index varies fastest.
 // mean: receives sum / (batch * spatial) for each channel.
 // var:  receives the sum of squared deviations from that mean, divided by (batch * spatial).
 //
 // The row count is kept as an int for loop bounds and indexing and converted to float only for the
 // divisions, so large row counts are not rounded before they are used as loop limits.
 void meanVar(const float *in, int batch, int spatial, int ch, float *mean, float *var) {
  const int rows = batch * spatial;
  const float divisor = static_cast<float>(rows);

  sumSpatialBatch(in, rows, ch, mean);
  for (int f = 0; f < ch; ++f) mean[f] /= divisor;

  std::fill(var, var + ch, 0.f);
  for (int i = 0; i < rows; i++) {
    const float *row = in + i * ch;
    for (int f = 0; f < ch; f++) {
      const float diff = row[f] - mean[f];
      var[f] += diff * diff;
    }
  }
  for (int f = 0; f < ch; f++) var[f] /= divisor;
 }

 // mean_delta[c] = -(sum over `size` rows of yt[., c]) / sqrt(variance[c] + eps).
 // The column sums come from sumSpatialBatch; each one is then scaled in place.
 void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta) {
  sumSpatialBatch(yt, size, ch, mean_delta);
  int chan = 0;
  while (chan < ch) {
    mean_delta[chan] *= -1.f / std::sqrt((variance[chan] + eps));
    ++chan;
  }
 }

 // For every channel c:
 //   mean_add[c]    = (sum over spatial * batch rows of (x - mean[c])) * variance_delta[c] * (-2 / (spatial * batch))
 //   mean_delta[c] += mean_add[c]
 // x holds spatial * batch rows of `filters` values; mean_add is reset to zero first.
 void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial,
              float *mean_add, float *mean_delta) {
  std::fill(mean_add, mean_add + filters, 0.f);

  const int rows = spatial * batch;
  for (int row = 0; row < rows; ++row) {
    const int base = row * filters;
    for (int chan = 0; chan < filters; ++chan) {
      mean_add[chan] += x[base + chan] - mean[chan];
    }
  }

  for (int chan = 0; chan < filters; ++chan) {
    mean_add[chan] *= variance_delta[chan] * (-2.f / (spatial * batch));
    mean_delta[chan] += mean_add[chan];
  }
 }

 // For every channel c:
 //   variance_delta[c] = (sum over batch * spatial rows of delta * (x - mean[c]))
 //                       * -0.5 * (variance[c] + eps)^(-3/2)
 // x and delta hold batch * spatial rows of `filters` values; variance_delta is reset to zero first.
 void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int filters,
                    int spatial, float eps, float *variance_delta) {
  std::fill(variance_delta, variance_delta + filters, 0.f);

  const int rows = batch * spatial;
  for (int row = 0; row < rows; ++row) {
    const int base = row * filters;
    for (int chan = 0; chan < filters; ++chan) {
      variance_delta[chan] += delta[base + chan] * (x[base + chan] - mean[chan]);
    }
  }

  for (int chan = 0; chan < filters; ++chan) {
    variance_delta[chan] *= -.5 * pow(variance[chan] + eps, (-3.f / 2.f));
  }
 }

 // Rewrites `delta` in place. For every element (row k, channel c):
 //   delta = delta / sqrt(variance[c] + eps)
 //         + variance_delta[c] * 2 * (x - mean[c]) / (spatial * batch)
 //         + mean_delta[c] / (spatial * batch)
 // x and delta hold batch * spatial rows of `filters` values.
 void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta,
                     const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta) {
  const int rows = batch * spatial;
  for (int row = 0; row < rows; ++row) {
    const int base = row * filters;
    for (int chan = 0; chan < filters; ++chan) {
      const int pos = base + chan;
      delta[pos] = delta[pos] * 1. / (std::sqrt(variance[chan] + eps)) +
                   variance_delta[chan] * 2. * (x[pos] - mean[chan]) / (spatial * batch) +
                   mean_delta[chan] / (spatial * batch);
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/batch_norm.h
@@ -0,0 +1,39 @@
 /**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_BACKEND_ARM_BATCH_NORM_H_
 #define MINDSPORE_LITE_SRC_BACKEND_ARM_BATCH_NORM_H_

 // Plain bundle of batch-norm dimensions and epsilon.
 // Field order is part of the layout and of aggregate initialisation; keep it unchanged.
 struct bnParameter {
  int batch;     // batch count
  int channels;  // channel count
  int spatial;   // spatial extent
  float eps;     // epsilon value
 };
 // All buffers below are laid out as rows of per-channel values (the channel index varies fastest).

 // Multiplies, in place, every value of `output` (batch * size rows of n values) by scales[channel].
 void scaleBias(const float *scales, int batch, int n, int size, float *output);
 // Writes the per-channel normalised form of x (using mean, variance and eps) to `out`.
 void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial,
                float *out);
 // scale_updates[c] = sum over batch * size rows of delta * x_norm; scale_updates is reset first.
 void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates);
 // Per-channel mean and variance (sum of squared deviations divided by batch * size) of `in`.
 void meanVar(const float *in, int batch, int size, int ch, float *mean, float *var);
 // mean_delta[c] = -(sum over `size` rows of yt) / sqrt(variance[c] + eps).
 void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta);
 // variance_delta[c] = (sum over batch * spatial rows of delta * (x - mean[c])) * -0.5 * (variance[c] + eps)^(-3/2).
 void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int ch,
                    int spatial, float eps, float *variance_delta);
 // mean_add[c] = (sum of (x - mean[c])) * variance_delta[c] * (-2 / (spatial * batch));
 // mean_delta[c] is then incremented by mean_add[c].
 void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial,
              float *mean_add, float *mean_delta);
 // Rewrites `delta` in place from delta, x, mean, variance, mean_delta and variance_delta.
 void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta,
                     const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta);

 #endif
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.cc
@@ -0,0 +1,29 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h"

 // Element-wise output[k] = -nom[k] / denom[k]^2 for k in [0, element_size).
 void ElementDivNegSquare(const float *nom, const float *denom, float *output, int element_size) {
  int k = 0;
  while (k < element_size) {
    const float d = denom[k];
    output[k] = -nom[k] / (d * d);
    ++k;
  }
 }

 // Element-wise output[k] = (-a[k] * b[k]) / denom[k]^2 for k in [0, element_size).
 void ElementMulAndDivNegSquare(const float *a, const float *b, const float *denom, float *output, int element_size) {
  int k = 0;
  while (k < element_size) {
    const float d = denom[k];
    const float numerator = -a[k] * b[k];
    output[k] = numerator / (d * d);
    ++k;
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/arithmetic_grad.h
@@ -0,0 +1,22 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_

 // Writes output[i] = -nom[i] / (denom[i] * denom[i]) for every i in [0, element_size).
 void ElementDivNegSquare(const float *nom, const float *denom, float *output, int element_size);
 // Writes output[i] = -a[i] * b[i] / (denom[i] * denom[i]) for every i in [0, element_size).
 void ElementMulAndDivNegSquare(const float *a, const float *b, const float *denom, float *output, int element_size);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_ARITHMETIC_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.cc
@@ -0,0 +1,108 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/arm/opclib/fp32/gemm.h"

 // C += alpha * A * B with neither operand transposed.
 // A is read as M rows of K values (row stride lda), B as K rows of N values (row stride ldb),
 // and the result is accumulated into M rows of N values in C (row stride ldc).
 static void gemm_nn(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_B, int ldb, float *mat_c,
                     int ldc) {
  for (int row = 0; row < M; ++row) {
    for (int depth = 0; depth < K; ++depth) {
      // Scale the A element once, then spread it across the whole row of C.
      const float scaled = alpha * mat_a[row * lda + depth];
      int col = 0;
      while (col < N) {
        mat_c[row * ldc + col] += scaled * mat_B[depth * ldb + col];
        ++col;
      }
    }
  }
 }

 // C += alpha * A * B^T.
 // A is read as M rows of K values (row stride lda); B is stored as N rows of K values
 // (row stride ldb), so each C element is the alpha-scaled dot product of one A row and one B row.
 static void gemm_nt(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
                     int ldc) {
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      float dot = 0;
      int depth = 0;
      while (depth < K) {
        dot += alpha * mat_a[row * lda + depth] * mat_b[col * ldb + depth];
        ++depth;
      }
      // The partial sum is added to C only once the whole dot product is known.
      mat_c[row * ldc + col] += dot;
    }
  }
 }

 // C += alpha * A^T * B.
 // A is stored as K rows (row stride lda) and read column-wise, B as K rows of N values
 // (row stride ldb); the result is accumulated into M rows of N values in C (row stride ldc).
 static void gemm_tn(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
                     int ldc) {
  for (int row = 0; row < M; ++row) {
    for (int depth = 0; depth < K; ++depth) {
      // Element (row, depth) of A^T lives at (depth, row) in the stored matrix.
      const float scaled = alpha * mat_a[depth * lda + row];
      int col = 0;
      while (col < N) {
        mat_c[row * ldc + col] += scaled * mat_b[depth * ldb + col];
        ++col;
      }
    }
  }
 }

 // C += alpha * A^T * B^T.
 // A is stored as K rows (row stride lda) and read column-wise; B is stored as N rows of K values
 // (row stride ldb). Each C element receives the alpha-scaled dot product of an A column and a B row.
 static void gemm_tt(int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b, int ldb, float *mat_c,
                     int ldc) {
  for (int row = 0; row < M; ++row) {
    for (int col = 0; col < N; ++col) {
      float dot = 0;
      int depth = 0;
      while (depth < K) {
        dot += alpha * mat_a[row + depth * lda] * mat_b[depth + col * ldb];
        ++depth;
      }
      mat_c[row * ldc + col] += dot;
    }
  }
 }

 // General matrix multiply: mat_c = alpha * op(mat_a) * op(mat_b) + beta * mat_c
 //   transpose_a / transpose_b - non-zero selects the transposed form of that operand
 //   M - number of rows of op(mat_a) and of mat_c
 //   N - number of columns of op(mat_b) and of mat_c
 //   K - number of columns of op(mat_a)
 //   lda / ldb / ldc - row strides of the stored matrices
 void gemm(int transpose_a, int transpose_b, int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b,
           int ldb, float beta, float *mat_c, int ldc) {
  // Comparisons are spelled with ordered operators instead of == / != on floats.
  const bool beta_is_zero = (beta >= 0.f && beta <= 0.f);
  const bool beta_not_one = (beta < 1.f || beta > 1.f);

  // Apply the beta term first: clear C, scale C, or leave it untouched when beta is one.
  if (beta_is_zero) {
    for (int row = 0; row < M; ++row) {
      for (int col = 0; col < N; ++col) {
        mat_c[row * ldc + col] = 0;
      }
    }
  } else if (beta_not_one) {
    for (int row = 0; row < M; ++row) {
      for (int col = 0; col < N; ++col) {
        mat_c[row * ldc + col] *= beta;
      }
    }
  }

  // Pick the kernel matching the transpose flags once, then run it one output row at a time.
  using RowKernel = void (*)(int, int, int, float, float *, int, float *, int, float *, int);
  RowKernel kernel;
  if (transpose_a) {
    kernel = transpose_b ? gemm_tt : gemm_tn;
  } else {
    kernel = transpose_b ? gemm_nt : gemm_nn;
  }
  // A transposed A advances one element per output row, a plain A advances one stored row.
  const int a_step = transpose_a ? 1 : lda;

  for (int row = 0; row < M; ++row) {
    kernel(1, N, K, alpha, mat_a + row * a_step, lda, mat_b, ldb, mat_c + row * ldc, ldc);
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/gemm.h
@@ -0,0 +1,23 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_

 // Computes mat_c = alpha * op(mat_a) * op(mat_b) + beta * mat_c, where op() transposes the operand
 // when the matching transpose_* flag is non-zero. M and N are the rows and columns of mat_c, K is the
 // shared inner dimension, and lda / ldb / ldc are the row strides of the stored matrices.
 void gemm(int transpose_a, int transpose_b, int M, int N, int K, float alpha, float *mat_a, int lda, float *mat_b,
           int ldb, float beta, float *mat_c, int ldc);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_GEMM_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.cc
@@ -0,0 +1,149 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <cstdint>
 #include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"

 // Back-propagates an average-pooling gradient for NHWC data.
 //
 //   input_ptr     - incoming gradient, read as output_batch_ images of input_h_ x input_w_ x input_channel_
 //                   values.
 //   output_ptr    - destination, output_batch_ images of output_h_ x output_w_ x input_channel_ values. It is
 //                   cleared here and then accumulated into.
 //   pooling_param - supplies the window, stride, top/left padding and the dimensions listed above.
 //
 // Each incoming value is divided by the full window area (window_h_ * window_w_), including for windows that
 // are clipped by padding, and added to every in-bounds destination position that its window covers.
 void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param) {
  const int stride_w = pooling_param->stride_w_;
  const int stride_h = pooling_param->stride_h_;
  const int pad_w = pooling_param->pad_l_;
  const int pad_h = pooling_param->pad_u_;
  const int win_w = pooling_param->window_w_;
  const int win_h = pooling_param->window_h_;
  const int channel = pooling_param->input_channel_;
  const int in_w = pooling_param->input_w_;
  const int in_h = pooling_param->input_h_;
  const int output_w = pooling_param->output_w_;
  const int output_h = pooling_param->output_h_;
  const int output_batch = pooling_param->output_batch_;

  // Per-image element counts. The channel factor is required so that consecutive batches do not overlap.
  const int in_image_size = in_h * in_w * channel;
  const int out_image_size = output_h * output_w * channel;

  for (int i = 0; i < out_image_size * output_batch; i++) output_ptr[i] = 0.0;

  const float kk = static_cast<float>(win_h * win_w);

  for (int ib = 0; ib < output_batch; ib++) {
    float *out = output_ptr + ib * out_image_size;
    const float *inPtr = input_ptr + ib * in_image_size;
    // Walk the incoming gradient and scatter each value over the window it was averaged from.
    for (int yh = 0; yh < in_h; yh++) {
      for (int yw = 0; yw < in_w; yw++) {
        for (int ic = 0; ic < channel; ic++) {
          const int idx = (yw + yh * in_w) * channel + ic;
          const float delta = inPtr[idx] / kk;
          for (int kh = 0; kh < win_h; kh++) {
            const int xh = yh * stride_h + kh - pad_h;
            if ((xh < 0) || (xh >= output_h)) {
              continue;  // window row falls into the padding
            }
            for (int kw = 0; kw < win_w; kw++) {
              const int xw = yw * stride_w + kw - pad_w;
              if ((xw < 0) || (xw >= output_w)) {
                continue;  // window column falls into the padding
              }
              out[(xw + output_w * xh) * channel + ic] += delta;
            }
          }
        }
      }
    }
  }
 }

 // Back-propagates a max-pooling gradient for NHWC data.
 //
 //   dy            - incoming gradient, consumed sequentially: output_batch_ * input_h_ * input_w_ *
 //                   input_channel_ values.
 //   indices       - one entry per dy value, consumed in the same order. Each entry is the spatial position
 //                   inside the destination image that receives the matching dy value.
 //   output_ptr    - destination, output_batch_ images of output_h_ x output_w_ x input_channel_ values. It is
 //                   cleared here and then accumulated into.
 //   pooling_param - supplies the dimensions listed above.
 //
 // NOTE(review): indices are not range-checked; they are presumably produced by the forward max-pooling pass.
 void MaxPoolingGrad(const float *dy, const int *indices, float *output_ptr, PoolingParameter *pooling_param) {
  const int channel = pooling_param->input_channel_;
  const int in_w = pooling_param->input_w_;
  const int in_h = pooling_param->input_h_;
  const int output_w = pooling_param->output_w_;
  const int output_h = pooling_param->output_h_;
  const int output_batch = pooling_param->output_batch_;

  // Elements in one destination image; this is also the stride between consecutive batches.
  const int out_image_size = output_h * output_w * channel;
  // Spatial positions in one incoming-gradient image.
  const int ind_img_size = in_h * in_w;

  for (int i = 0; i < out_image_size * output_batch; i++) output_ptr[i] = 0.0;

  const float *yt = dy;
  const int *pos = indices;

  for (int ib = 0; ib < output_batch; ib++) {
    float *out = output_ptr + ib * out_image_size;
    for (int ix = 0; ix < ind_img_size; ix++) {
      for (int cix = 0; cix < channel; cix++) {
        // Route the gradient to the position recorded for this value, within the same channel.
        const int idx = (*pos) * channel + cix;
        out[idx] += *yt;
        pos++;
        yt++;
      }
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/pooling_grad.h
@@ -0,0 +1,25 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_

 #include "src/runtime/kernel/arm/opclib/fp32/pooling.h"

 // Average-pooling backward pass: clears output_ptr, then spreads each value of input_ptr (divided by the
 // window area) over the positions covered by its pooling window. Dimensions, window, stride and padding
 // are read from pooling_param.
 void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param);
 // Max-pooling backward pass: clears output_ptr, then adds each value of dy to the position named by the
 // matching entry of indices_ptr. Dimensions are read from pooling_param.
 void MaxPoolingGrad(const float *dy, const int *indices_ptr, float *output_ptr, PoolingParameter *pooling_param);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_POOLING_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.cc
@@ -0,0 +1,130 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <cstddef>
 #include <algorithm>
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h"

 // Advances the multi-dimensional index `current` by one step, last dimension fastest.
 // Returns true when a new index was produced and false when the index wrapped around past the end;
 // in the latter case every entry of `current` has been reset to zero.
 static inline bool NextIndex(const int num_dims, const int *dims, int *current) {
  for (int axis = num_dims - 1; axis >= 0; --axis) {
    const int bumped = current[axis] + 1;
    if (bumped != dims[axis]) {
      current[axis] = bumped;
      return true;
    }
    // This digit overflowed: reset it and carry into the next slower dimension.
    current[axis] = 0;
  }
  return false;
 }

 // Converts the multi-dimensional index `iter` into a flat offset for a densely packed array with
 // extents `dims`, last dimension fastest.
 static inline size_t GetInputOffset(const int num_dims, const int *dims, const int *iter) {
  size_t flat = 0;
  int axis = 0;
  while (axis < num_dims) {
    flat *= static_cast<size_t>(dims[axis]);
    flat += static_cast<size_t>(iter[axis]);
    ++axis;
  }
  return flat;
 }

 // Same flattening as GetInputOffset, except that every dimension listed in `axes` is left out,
 // i.e. the result is the offset into an array that keeps only the dimensions not listed there.
 static inline size_t GetOutputOffset(const int num_dims, const int *dims, const int *iter, const int num_axis,
                                      const int *axes) {
  size_t flat = 0;
  for (int dim = 0; dim < num_dims; ++dim) {
    // Linear search: is this dimension one of the listed axes?
    bool listed = false;
    for (int a = 0; a < num_axis && !listed; ++a) {
      listed = (axes[a] == dim);
    }
    if (listed) {
      continue;
    }
    flat = flat * static_cast<size_t>(dims[dim]) + static_cast<size_t>(iter[dim]);
  }
  return flat;
 }

 void ReduceMeanByAxes(const float *input_data, int *input_iter, const int *input_dims, int input_num_dims,
                      const int *axes, int num_axes, float *output_data, const int *output_dims, int output_num_dims) {
  size_t num_outputs = 1;
  for (int idx = 0; idx < output_num_dims; ++idx) {
    size_t current = (size_t)(output_dims[idx]);
    num_outputs *= current;
  }

  // Reset input iterator.
  for (int idx = 0; idx < input_num_dims; ++idx) {
    input_iter[idx] = 0;
  }
  // Iterate through input_data.
  do {
    size_t input_offset = GetInputOffset(input_num_dims, input_dims, input_iter);
    size_t output_offset = GetOutputOffset(input_num_dims, input_dims, input_iter, num_axes, axes);
    output_data[output_offset] += input_data[input_offset];
  } while (NextIndex(input_num_dims, input_dims, input_iter));

  // Calculate mean by dividing output_data by num of aggregated element.
  size_t num_elements_in_axis = 1;
  for (int idx = 0; idx < num_axes; ++idx) {
    size_t current = (size_t)(input_dims[axes[idx]]);
    num_elements_in_axis *= current;
  }

  for (size_t idx = 0; idx < num_outputs; ++idx) {
    output_data[idx] = output_data[idx] / static_cast<float>(num_elements_in_axis);
  }
 }

 // Returns the arithmetic mean of the first `size` values of src (their sum divided by size).
 float ReduceMeanAll(const float *src, int size) {
  float total = 0;
  int pos = 0;
  while (pos < size) {
    total += src[pos];
    ++pos;
  }
  return total / size;
 }

 // Sums `input` down to the shape described by output_dims.
 //
 //   input / input_dims   - source values and their extents (num_dims entries).
 //   output / output_dims - destination and its extents (num_dims entries). Every dimension whose output
 //                          extent is 1 is summed over.
 //   num_dims             - rank shared by both shapes; at most 8 is supported.
 //
 // When both shapes are identical the input is simply copied. Otherwise the output is cleared and every
 // input element is added to the output slot obtained by dropping the summed dimensions from its index.
 void ReduceSumByAxes(const float *input, const int *input_dims, float *output, const int *output_dims, int num_dims) {
  constexpr int kMaxDims = 8;
  if (num_dims > kMaxDims) {
    // The index scratch arrays below cannot hold this rank; do nothing rather than write past them.
    return;
  }

  int num_outputs = 1;
  bool same_shape = true;
  for (int idx = 0; idx < num_dims; ++idx) {
    num_outputs *= output_dims[idx];
    if (output_dims[idx] != input_dims[idx]) same_shape = false;
  }
  if (same_shape) {
    // Pointer arithmetic already counts in elements, so the element count must not be scaled by sizeof(float).
    std::copy(input, input + num_outputs, output);
    return;
  }

  std::fill_n(output, num_outputs > 0 ? num_outputs : 0, 0.0f);  // zero output

  int input_iter[kMaxDims] = {0};
  int axes[kMaxDims] = {0};  // sized like input_iter: every dimension may be a summed one
  int num_axes = 0;
  for (int i = 0; i < num_dims; i++) {
    if (output_dims[i] == 1) axes[num_axes++] = i;
  }

  // Iterate through input_data.
  do {
    const size_t input_offset = GetInputOffset(num_dims, input_dims, input_iter);
    const size_t output_offset = GetOutputOffset(num_dims, input_dims, input_iter, num_axes, axes);
    output[output_offset] += input[input_offset];
  } while (NextIndex(num_dims, input_dims, input_iter));
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce_grad.h
@@ -0,0 +1,24 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_

 // Returns the sum of the first `size` values of src divided by size.
 float ReduceMeanAll(const float *src, int size);
 // Sums `input` (extents input_dims) into `output` (extents output_dims); both shapes have num_dims entries.
 // Dimensions whose output extent is 1 are summed over; identical shapes result in a plain copy.
 void ReduceSumByAxes(const float *input, const int *input_dims, float *output, const int *output_dims, int num_dims);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_REDUCE_GRAD_H_

--- a/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/softmax_grad.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/fp32/softmax_grad.h
@@ -0,0 +1,29 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_

 #include "src/runtime/kernel/arm/opclib/op_base.h"

 // Parameter block for the softmax cross-entropy kernels.
 // NOTE(review): the field descriptions below are inferred from the field names only; confirm them
 // against the kernel code that fills and reads this struct.
 struct SoftmaxCrossEntropyParameter {
    OpParameter op_parameter;         // common op header; presumably declared in op_base.h
    int32_t batch_size_;              // presumably the number of samples per batch
    unsigned int number_of_classes_;  // presumably the number of classes per sample
    int n_dim_;                       // presumably the number of valid entries in input_shape_
    int input_shape_[5];              // room for up to five dimension extents
 };
 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_FP32_SOFTMAX_GRAD_H_
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.cc
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.cc
@@ -0,0 +1,176 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <string.h>
 #include "src/runtime/kernel/arm/opclib/pack_ext.h"

 // Returns 1 when the unsigned reinterpretation of a is below that of b, else 0. For a non-negative b
 // this is the range test 0 <= a < b done with a single comparison, because a negative a becomes a huge
 // unsigned value.
 static int is_a_ge_zero_and_a_lt_b(int a, int b) {
  const unsigned ua = static_cast<unsigned>(a);
  const unsigned ub = static_cast<unsigned>(b);
  return (ua < ub) ? 1 : 0;
 }

 // Unfolds an HWC image into convolution patches.
 //
 // For every output position (rows outermost) and every kernel tap (kernel rows outermost) one run of
 // input_channel_ / group_ floats is appended to data_col: a copy of the leading channels of the source
 // pixel when the tap lies inside the image, zeros when it falls into the padding.
 // pad_w_ and pad_h_ are used as the left and top padding.
 void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param) {
  const int pad_left = conv_param->pad_w_;
  const int pad_up = conv_param->pad_h_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_h = conv_param->output_h_;
  const int output_w = conv_param->output_w_;
  const int channels = conv_param->input_channel_ / conv_param->group_;
  const int tot_channels = conv_param->input_channel_;

  // Top-left corner of the current window in source coordinates (may be negative inside the padding).
  int window_top = -pad_up;
  for (int rows_left = output_h; rows_left != 0; --rows_left) {
    int window_left = -pad_left;
    for (int cols_left = output_w; cols_left != 0; --cols_left) {
      for (int tap_row = 0; tap_row < kernel_h; ++tap_row) {
        const int src_row = window_top + tap_row * dilation_h;
        for (int tap_col = 0; tap_col < kernel_w; ++tap_col) {
          const int src_col = window_left + tap_col * dilation_w;
          const bool inside =
            is_a_ge_zero_and_a_lt_b(src_row, in_height) && is_a_ge_zero_and_a_lt_b(src_col, in_width);
          if (inside) {
            const int pixel = (src_row * in_width + src_col) * tot_channels;
            memcpy(data_col, in_data + pixel, sizeof(float) * channels);
          } else {
            memset(data_col, 0, sizeof(float) * channels);
          }
          data_col += channels;
        }
      }
      window_left += stride_w;
    }
    window_top += stride_h;
  }
 }

 // Unfolds an HWC image into a (kernel_h * kernel_w * channels) x (output_h * output_w) matrix, where
 // channels is input_channel_ / group_. One matrix row is written per (kernel row, kernel column, channel)
 // triple, holding that tap's source value for every output position; taps that fall into the padding
 // yield 0. pad_w_ and pad_h_ are used as the left and top padding.
 void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param) {
  const int pad_left = conv_param->pad_w_;
  const int pad_up = conv_param->pad_h_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_h = conv_param->output_h_;
  const int output_w = conv_param->output_w_;
  const int channels = conv_param->input_channel_ / conv_param->group_;
  const int tot_channels = conv_param->input_channel_;

  for (int tap_row = 0; tap_row < kernel_h; ++tap_row) {
    // Source row / column hit by this tap at the first output position.
    const int first_src_row = tap_row * dilation_h - pad_up;
    for (int tap_col = 0; tap_col < kernel_w; ++tap_col) {
      const int first_src_col = tap_col * dilation_w - pad_left;
      for (int ch = 0; ch < channels; ++ch) {
        int src_row = first_src_row;
        for (int rows_left = output_h; rows_left != 0; --rows_left, src_row += stride_h) {
          if (is_a_ge_zero_and_a_lt_b(src_row, in_height)) {
            int src_col = first_src_col;
            for (int cols_left = output_w; cols_left != 0; --cols_left, src_col += stride_w) {
              float value = 0;
              if (is_a_ge_zero_and_a_lt_b(src_col, in_width)) {
                value = in_data[(src_row * in_width + src_col) * tot_channels + ch];
              }
              *(data_row++) = value;
            }
          } else {
            // The whole source row lies in the padding: emit one zero per output column.
            for (int cols_left = output_w; cols_left != 0; --cols_left) {
              *(data_row++) = 0;
            }
          }
        }
      }
    }
  }
 }

 // Folds convolution patches back into an HWC image; this is the scatter-add counterpart of im2col_hwc.
 //
 // data_col is consumed in runs of input_channel_ / group_ floats, one run per (output position,
 // kernel tap) pair in the same order im2col_hwc produces them. A run whose tap lies inside the image is
 // added onto the leading channels of the matching pixel of data_im; runs that fall into the padding are
 // skipped. data_im is accumulated into and is not cleared here.
 void col2im_hwc(const float *data_col, float *data_im, ConvParameter *conv_param) {
  const int pad_left = conv_param->pad_w_;
  const int pad_up = conv_param->pad_h_;

  const int stride_h = conv_param->stride_h_;
  const int stride_w = conv_param->stride_w_;

  const int dilation_h = conv_param->dilation_h_;
  const int dilation_w = conv_param->dilation_w_;

  const int kernel_h = conv_param->kernel_h_;
  const int kernel_w = conv_param->kernel_w_;

  const int in_height = conv_param->input_h_;
  const int in_width = conv_param->input_w_;

  const int output_h = conv_param->output_h_;
  const int output_w = conv_param->output_w_;
  const int channels = conv_param->input_channel_ / conv_param->group_;
  const int tot_channels = conv_param->input_channel_;

  // Top-left corner of the current window in image coordinates (may be negative inside the padding).
  int window_top = -pad_up;
  for (int rows_left = output_h; rows_left != 0; --rows_left) {
    int window_left = -pad_left;
    for (int cols_left = output_w; cols_left != 0; --cols_left) {
      for (int tap_row = 0; tap_row < kernel_h; ++tap_row) {
        const int dst_row = window_top + tap_row * dilation_h;
        for (int tap_col = 0; tap_col < kernel_w; ++tap_col, data_col += channels) {
          const int dst_col = window_left + tap_col * dilation_w;
          if (!is_a_ge_zero_and_a_lt_b(dst_row, in_height) || !is_a_ge_zero_and_a_lt_b(dst_col, in_width)) {
            continue;  // tap falls into the padding; its run is skipped
          }
          float *pixel = data_im + (dst_row * in_width + dst_col) * tot_channels;
          for (int ch = 0; ch < channels; ++ch) {
            pixel[ch] += data_col[ch];
          }
        }
      }
      window_left += stride_w;
    }
    window_top += stride_h;
  }
 }
--- a/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.h
+++ b/mindspore/lite/src/runtime/kernel/arm/opclib/pack_ext.h
@@ -0,0 +1,26 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H_
 #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H_

 #include "src/runtime/kernel/arm/opclib/conv_parameter.h"

 // Unfolds the HWC image in_data into data_col: for each output position, one run of
 // input_channel_ / group_ floats per kernel tap (zeros where the tap falls into the padding).
 void im2col_hwc(const float *in_data, float *data_col, ConvParameter *conv_param);
 // Unfolds the HWC image in_data into data_row as a (kernel_h * kernel_w * channels) x (output_h * output_w)
 // matrix, with zeros where a tap falls into the padding.
 void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param);
 // Adds the patch runs in data_col (same order as produced by im2col_hwc) back onto the HWC image data_im;
 // runs that fall into the padding are skipped.
 void col2im_hwc(const float *data_col, float *data_im, ConvParameter *conv_param);

 #endif  // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_OPCLIB_PACK_EXT_H_
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -152,6 +152,7 @@ set(TEST_LITE_SRC
        ${LITE_DIR}/src/scheduler.cc
        ${LITE_DIR}/src/common/graph_util.cc
        ${LITE_DIR}/src/common/file_utils.cc
        ${LITE_DIR}/src/common/file_utils_ext.cc
        ${LITE_DIR}/src/common/utils.cc
        ${LITE_DIR}/tools/common/graph_util.cc
        ${LITE_DIR}/tools/common/tensor_util.cc
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/activation_grad_fp32_tests.cc
@@ -0,0 +1,312 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <iostream>
 #include <memory>
 #include <vector>

 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "src/common/file_utils.h"
 #include "src/common/file_utils_ext.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "mindspore/lite/src/ir/tensor.h"
 #include "mindspore/lite/src/lite_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/activation_grad.h"

 namespace mindspore {
 // GoogleTest fixture for the fp32 activation-gradient tests; it adds no state of its own on top of
 // mindspore::Common.
 class TestActGradFp32 : public mindspore::Common {
  public:
   TestActGradFp32() = default;
 };

 TEST_F(TestActGradFp32, ReluGradFp32) {
   // Times ReluGrad over 50 elements and compares the result with the stored reference output.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/relu_y_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/relu_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read ReluGrad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     ReluGrad(yt_data, input_data, 50, output_data);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     ReluGrad(yt_data, input_data, 50, output_data);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/relu_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;

   MS_LOG(INFO) << "ReluGradFp32 passed";
 }

 TEST_F(TestActGradFp32, Relu6GradFp32) {
   // Times Relu6Grad over 50 elements and compares the result with the stored reference output.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/relu6_y_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/relu6_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read Relu6Grad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     Relu6Grad(yt_data, input_data, 50, output_data);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     Relu6Grad(yt_data, input_data, 50, output_data);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/relu6_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;

   MS_LOG(INFO) << "Relu6GradFp32 passed";
 }

 TEST_F(TestActGradFp32, LReluGradFp32) {
   // Times LReluGrad (last argument 0.1) over 50 elements and compares the result with the stored
   // reference output.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/lrelu_y_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/lrelu_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read LReluGrad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     LReluGrad(yt_data, input_data, 50, output_data, 0.1);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     LReluGrad(yt_data, input_data, 50, output_data, 0.1);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/lrelu_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;

   MS_LOG(INFO) << "LReluGradFp32 passed";
 }

 TEST_F(TestActGradFp32, SigmoidGradFp32) {
   // Times SigmoidGrad over 50 elements and compares the result with the stored reference output.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/sigmoid_y_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/sigmoid_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read SigmoidGrad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     SigmoidGrad(yt_data, input_data, 50, output_data);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     SigmoidGrad(yt_data, input_data, 50, output_data);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/sigmoid_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;

   MS_LOG(INFO) << "SigmoidGradFp32 passed";
 }

 TEST_F(TestActGradFp32, tanhGradFp32) {
   // Times TanhGrad over 50 elements and compares the result with the stored reference output.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/tanh_y_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/tanh_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read TanhGrad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     TanhGrad(yt_data, input_data, 50, output_data);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     TanhGrad(yt_data, input_data, 50, output_data);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/tanh_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;
   MS_LOG(INFO) << "TanhGradFp32 passed";
 }

 TEST_F(TestActGradFp32, hswishGradFp32) {
   // Times HSwishGrad over 50 elements and compares the result with the stored reference output.
   // Unlike the sibling tests, the first fixture here is the "x" file rather than a "y" file.
   printf("Calculating runtime cost...\n");
   size_t output_data_size = 50;

   size_t input_size;
   std::string input_path = "./test_data/activationGrad/hswish_x_50.bin";
   auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
   std::string yt_path = "./test_data/activationGrad/hswish_yt_50.bin";
   auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size));
   // Report an unreadable fixture as a test failure instead of handing a null buffer to the kernel.
   if (input_data == nullptr || yt_data == nullptr) {
     delete[] input_data;
     delete[] yt_data;
     FAIL() << "failed to read HSwishGrad test fixtures";
   }

   auto output_data = new float[output_data_size];
   // warm up loop
   for (int i = 0; i < 3; i++) {
     HSwishGrad(yt_data, input_data, 50, output_data);
   }

   int loop_count = 100;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     HSwishGrad(yt_data, input_data, 50, output_data);
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   uint64_t time_avg = cost / loop_count;
   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

   // Only the first 20 values are dumped for eyeballing; the comparison below is the actual check.
   printf("==================output data=================\n");
   for (int i = 0; i < 20; i++) {
     std::cout << output_data[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/activationGrad/hswish_out_50.bin";
   int res = lite::CompareRelativeOutput(output_data, output_path);
   EXPECT_EQ(res, 0);

   // All three buffers are arrays, so they are released with the array form of delete.
   // NOTE(review): assumes ReadFile allocates its buffer with new[] - confirm in file_utils.cc.
   delete[] input_data;
   delete[] output_data;
   delete[] yt_data;
   MS_LOG(INFO) << "hswishGradFp32 passed";
 }

 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/arithmetic_grad_fp32_tests.cc
@@ -0,0 +1,497 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <iostream>
 #include <memory>
 #include <vector>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "src/common/file_utils.h"
 #include "src/common/file_utils_ext.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/fp32/reduce.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_grad.h"
 #include "mindspore/lite/src/kernel_registry.h"

 namespace mindspore {

 // GoogleTest fixture for the fp32 arithmetic-gradient tests; it adds no state of its own on top of
 // mindspore::Common.
 class TestArithmeticGradFp32 : public mindspore::Common {
  public:
   TestArithmeticGradFp32() = default;
 };

 // Builds the five tensors used by the arithmetic-gradient tests and returns them in the order
 // {dy, x1, x2, dx1, dx2}.
 //   test    - path of the dy fixture file.
 //   test_id - selects shapes and x1/x2 fixtures: 7 -> {4, 5, 6} and {6}; >= 8 -> {5, 4, 6} and
 //             {5, 1, 6}; anything else -> {4, 6} and {6}.
 // dy, x1 and x2 take their data from ReadFile; dx1 and dx2 get freshly allocated, uninitialized
 // float buffers. The callers in this file delete the returned tensors themselves.
 std::vector<lite::tensor::Tensor *> GenerateTensorsForTest(const char *test, int test_id) {
   size_t input_size;
   std::vector<int> large_dim({4, 6});
   std::vector<int> small_dim({6});
   int large_size = (4 * 6);
   int small_size = (1 * 6);
   // String literals are const; ReadFile already accepts const char * (see the dy read below), so no
   // const_cast is needed.
   const char *dx1_file = "./test_data/operators/arithmetic_fp32_1_x1_4_6.bin";
   const char *dx2_file = "./test_data/operators/arithmetic_fp32_1_x2_1_6.bin";

   if (test_id == 7) {
     large_dim = std::vector<int>({4, 5, 6});
     small_dim = std::vector<int>({6});
     large_size = (4 * 5 * 6);
     small_size = (6);
     dx1_file = "./test_data/operators/arithmetic_fp32_7_x1_4_5_6.bin";
     dx2_file = "./test_data/operators/arithmetic_fp32_7_x2_1_1_6.bin";
   }
   if (test_id >= 8) {
     large_dim = std::vector<int>({5, 4, 6});
     small_dim = std::vector<int>({5, 1, 6});
     large_size = (4 * 5 * 6);
     small_size = (5 * 6);
     dx1_file = "./test_data/operators/arithmetic_fp32_8_x1_5_4_6.bin";
     dx2_file = "./test_data/operators/arithmetic_fp32_8_x2_5_1_6.bin";
   }

   auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(test, &input_size));
   lite::tensor::Tensor *dy_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
   dy_tensor->SetData(dy_data);

   auto x1_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dx1_file, &input_size));
   lite::tensor::Tensor *x1_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
   x1_tensor->SetData(x1_data);

   auto x2_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dx2_file, &input_size));
   lite::tensor::Tensor *x2_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, small_dim);
   x2_tensor->SetData(x2_data);

   // Gradient outputs: same shapes as x1 / x2, backed by uninitialized buffers.
   auto dx1_data = new float[large_size];
   lite::tensor::Tensor *dx1_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, large_dim);
   dx1_tensor->SetData(dx1_data);

   auto dx2_data = new float[small_size];
   lite::tensor::Tensor *dx2_tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, small_dim);
   dx2_tensor->SetData(dx2_data);

   std::vector<lite::tensor::Tensor *> ret_vector = {dy_tensor, x1_tensor, x2_tensor, dx1_tensor, dx2_tensor};
   return ret_vector;
 }

 TEST_F(TestArithmeticGradFp32, TestAddGradFp32) {
   // AddGrad with inputs ordered (dy, x1, x2): outputs[0] is the dx1 tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_AddGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_1_dy_4_6.bin", 1);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_1_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestAddGradFp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestAddGrad2Fp32) {
   // AddGrad with the operands swapped, inputs ordered (dy, x2, x1): outputs[0] is the dx2 tensor,
   // outputs[1] the dx1 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_AddGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_1_dy_4_6.bin", 1);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_1_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestAddGrad2Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestAddGrad3Fp32) {
   // AddGrad on the {5, 4, 6} / {5, 1, 6} shapes, inputs ordered (dy, x1, x2): outputs[0] is the dx1
   // tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_AddGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_8_dy_5_4_6.bin", 8);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_AddGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx1 values for eyeballing; the comparisons below are the actual checks.
   float *dx1_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << dx1_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_8_dx2_5_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), dx2_path));

   std::string dx1_path = "./test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(dx1_ptr, dx1_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestAddGrad3Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestSubGradFp32) {
   // SubGrad with inputs ordered (dy, x1, x2): outputs[0] is the dx1 tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_SubGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_2_dy_4_6.bin", 2);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_SubGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_2_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_2_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestSubGradFp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestSubGrad2Fp32) {
   // SubGrad with the operands swapped, inputs ordered (dy, x2, x1): outputs[0] is the dx2 tensor,
   // outputs[1] the dx1 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_SubGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_3_dy_4_6.bin", 3);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_SubGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_3_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_3_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestSubGrad2Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestMulGradFp32) {
   // MulGrad with inputs ordered (dy, x1, x2): outputs[0] is the dx1 tensor, outputs[1] the dx2
   // tensor. The kernel is run 1000 times and the total elapsed time is printed.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_MulGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_4_dy_4_6.bin", 4);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);

   int loop_count = 1000;
   auto time_start = mindspore::lite::GetTimeUs();
   for (int i = 0; i < loop_count; i++) {
     kernel_obj->Run();
   }
   auto time_end = mindspore::lite::GetTimeUs();
   auto cost = time_end - time_start;
   // Streamed rather than printf'd: "%lu" is only correct where the timer type is unsigned long.
   std::cout << "total cost (for " << loop_count << " loops): " << cost << " us" << std::endl;

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_4_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestMulGradFp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestMulGrad2Fp32) {
   // MulGrad with the operands swapped, inputs ordered (dy, x2, x1): outputs[0] is the dx2 tensor,
   // outputs[1] the dx1 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_MulGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_4_dy_4_6.bin", 4);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_4_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestMulGrad2Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestMulGrad3Fp32) {
   // MulGrad on the {5, 4, 6} / {5, 1, 6} shapes, inputs ordered (dy, x1, x2): outputs[0] is the dx1
   // tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_MulGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin", 9);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestMulGrad3Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestMulGrad4Fp32) {
   // MulGrad on the {5, 4, 6} / {5, 1, 6} shapes with the operands swapped, inputs ordered
   // (dy, x2, x1): outputs[0] is the dx2 tensor, outputs[1] the dx1 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_MulGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_9_dy_5_4_6.bin", 9);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_MulGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_9_dx1_5_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestMulGrad4Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestDivGradFp32) {
   // DivGrad with inputs ordered (dy, x1, x2): outputs[0] is the dx1 tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_DivGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_5_dy_4_6.bin", 5);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string output_path = "./test_data/operators/arithmetic_fp32_5_dx1_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), output_path));

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_5_dx2_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestDivGradFp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestDivGrad2Fp32) {
   // DivGrad with the operands swapped, inputs ordered (dy, x2, x1): outputs[0] is the 6-element
   // tensor and outputs[1] the 4x6 tensor. In this fixture set the 4x6 reference is the "dx2" file
   // and the 6-element reference is the "dx1" file.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_DivGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_6_dy_4_6.bin", 6);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[2], all_tensors[1]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[4], all_tensors[3]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the six values of outputs[0] for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[0]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string dx2_path = "./test_data/operators/arithmetic_fp32_6_dx2_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[1]->Data()), dx2_path));

   std::string output_path = "./test_data/operators/arithmetic_fp32_6_dx1_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestDivGrad2Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, TestDivGrad3Fp32) {
   // DivGrad on the {5, 4, 6} / {5, 1, 6} shapes, inputs ordered (dy, x1, x2): outputs[0] is the dx1
   // tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_DivGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin", 10);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the first six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string dx1_path = "./test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), dx1_path));

   std::string output_path = "./test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   MS_LOG(INFO) << "TestDivGrad3Fp32 passed";
 }

 TEST_F(TestArithmeticGradFp32, Test3DDivGrad2Fp32) {
   // DivGrad on the {4, 5, 6} / {6} shapes, inputs ordered (dy, x1, x2): outputs[0] is the dx1
   // tensor, outputs[1] the dx2 tensor.
   auto param = new ArithmeticParameter();
   param->op_parameter_.type_ = PrimitiveType_DivGrad;
   std::vector<lite::tensor::Tensor *> all_tensors =
     GenerateTensorsForTest("./test_data/operators/arithmetic_fp32_7_dy_4_5_6.bin", 7);

   std::vector<lite::tensor::Tensor *> inputs = {all_tensors[0], all_tensors[1], all_tensors[2]};
   std::vector<lite::tensor::Tensor *> outputs = {all_tensors[3], all_tensors[4]};
   kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_DivGrad};
   auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
   // A missing registration or failed construction must fail the test, not crash on a null pointer.
   ASSERT_TRUE(creator != nullptr);
   auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(param), nullptr, desc);
   ASSERT_TRUE(kernel_obj != nullptr);
   kernel_obj->Run();

   // Dump the six dx2 values for eyeballing; the comparisons below are the actual checks.
   float *output_ptr = reinterpret_cast<float *>(outputs[1]->Data());
   printf("==================output data=================\n");
   for (int i = 0; i < 6; i++) {
     std::cout << output_ptr[i] << " ,";
   }
   std::cout << std::endl;

   std::string dx1_path = "./test_data/operators/arithmetic_fp32_7_dx1_4_5_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(reinterpret_cast<float *>(outputs[0]->Data()), dx1_path));

   std::string output_path = "./test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin";
   EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path));

   // NOTE(review): kernel_obj is never released here; confirm whether its destructor also frees
   // param before adding a delete for it.
   for (auto *tensor : all_tensors) delete tensor;
   delete param;
   // Log this test's own name (the message previously named TestDivGrad2Fp32).
   MS_LOG(INFO) << "Test3DDivGrad2Fp32 passed";
 }

 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/bias_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/bias_grad_fp32_tests.cc
@@ -0,0 +1,71 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <iostream>
 #include <memory>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "src/common/file_utils.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/bias_grad.h"
 #include "mindspore/lite/src/kernel_registry.h"

 namespace mindspore {

 // Fixture for the fp32 BiasGrad kernel tests; it adds no state on top of mindspore::Common.
 class TestBiasGradFp32 : public mindspore::Common {
 public:
  TestBiasGradFp32() = default;
 };

 // Runs the fp32 BiasGrad CPU kernel on a {10, 28, 28, 7} dy tensor loaded from disk,
 // prints the 7 output values and hands them to lite::CompareOutput with the reference file.
 TEST_F(TestBiasGradFp32, BiasGradFp32) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ArithmeticParameter> bias_param(new ArithmeticParameter());

  size_t input_size;
  std::string input_path = "./test_data/operators/biasgradfp32_1_dy_10_28_28_7.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(input_data != nullptr);
  std::vector<int> dim_dy({10, 28, 28, 7});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(input_data);

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor};

  auto output_data = new float[7];
  std::vector<int> dim_dw({7});
  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  dw_tensor.SetData(output_data);
  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_BiasGrad};

  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(bias_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel_obj != nullptr);

  kernel_obj->Run();

  printf("==================output data=================\n");
  for (int i = 0; i < 7; i++) {
    std::cout << output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/operators/biasgradfp32_1_db_7.bin";
  // NOTE(review): the comparison result is not asserted on, so a mismatch cannot fail this test.
  lite::CompareOutput(output_data, output_path);

  // input_data / output_data are not freed here, same as before this change.
  // NOTE(review): presumably the tensors release the buffers passed to SetData - confirm.
  // NOTE(review): kernel_obj is not released either; confirm its ownership of the parameter first.
  MS_LOG(INFO) << "BiasGradFp32 passed";
 }

 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/convolution_grad_fp32_tests.cc
@@ -0,0 +1,521 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <iostream>
 #include <memory>
 #include <vector>
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "src/common/file_utils.h"
 #include "src/common/file_utils_ext.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_filter.h"
 #include "mindspore/lite/src/runtime/kernel/arm/fp32/convolution_grad_input.h"
 #include "mindspore/lite/src/runtime/kernel/arm/opclib/conv_parameter.h"
 #include "mindspore/lite/src/kernel_registry.h"

 namespace mindspore {
 // Fixture for the fp32 convolution gradient kernel tests; it adds no state on top of mindspore::Common.
 class TestConvolutionGradFp32 : public mindspore::Common {
 public:
  TestConvolutionGradFp32() = default;
 };

 // Fills *conv_param with the baseline configuration shared by the tests in this file:
 // 1x28x28x3 input, 1x28x28x32 output, 3x3 kernel, stride 1, dilation 1, padding 1,
 // a single group, relu/relu6 flags cleared and one thread.
 void InitConvParamGroup1FP32(ConvParameter *conv_param) {
  ConvParameter &cfg = *conv_param;

  // input geometry
  cfg.input_batch_ = 1;
  cfg.input_h_ = 28;
  cfg.input_w_ = 28;
  cfg.input_channel_ = 3;

  // output geometry
  cfg.output_batch_ = 1;
  cfg.output_h_ = 28;
  cfg.output_w_ = 28;
  cfg.output_channel_ = 32;

  // kernel window, stride, dilation and padding
  cfg.kernel_h_ = 3;
  cfg.kernel_w_ = 3;
  cfg.stride_h_ = 1;
  cfg.stride_w_ = 1;
  cfg.dilation_h_ = 1;
  cfg.dilation_w_ = 1;
  cfg.pad_h_ = 1;
  cfg.pad_w_ = 1;

  // grouping, activation flags and threading
  cfg.group_ = 1;
  cfg.is_relu_ = false;
  cfg.is_relu6_ = false;
  cfg.thread_num_ = 1;
 }

 // Starts from the group = 1 baseline and switches it to 3 groups with 18 output channels.
 void InitConvParamGroup3FP32(ConvParameter *conv_param) {
  InitConvParamGroup1FP32(conv_param);

  ConvParameter &cfg = *conv_param;
  cfg.output_channel_ = 18;
  cfg.group_ = 3;
 }

 // Starts from the 3-group configuration, sets dilation to 2 in both directions
 // and sets the output height/width to 26.
 void InitConvParamGroup3Dilation2FP32(ConvParameter *conv_param) {
  InitConvParamGroup3FP32(conv_param);

  ConvParameter &cfg = *conv_param;
  cfg.output_h_ = 26;
  cfg.output_w_ = 26;
  cfg.dilation_h_ = 2;
  cfg.dilation_w_ = 2;
 }

 // Conv2DGradFilter fp32 kernel with the group = 1 configuration: loads dy {1, 28, 28, 32}
 // and x {1, 28, 28, 3}, runs 3 warm-up plus 100 timed iterations, and compares the
 // dw {32, 3, 3, 3} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32FilterGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());
  InitConvParamGroup1FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_1_28_28_32.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 28, 28, 32});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  size_t output_data_size =
    conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ * conv_param->input_channel_;

  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_x_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  ASSERT_TRUE(input_data != nullptr);
  std::vector<int> dim_x({1, 28, 28, 3});
  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
  x_tensor.SetData(input_data);

  auto dw_data = new float[output_data_size];
  std::vector<int> dim_dw({32, 3, 3, 3});
  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  dw_tensor.SetData(dw_data);
  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dw_32_3_3_3.bin";
  auto res = lite::CompareRelativeOutput(dw_data, output_path);

  EXPECT_EQ(res, 0);

  // The buffers passed to SetData are not freed here, same as before this change.
  // NOTE(review): presumably the tensors release them - confirm.
  delete kernel;
  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
 }

 // Conv2DGradInput fp32 kernel with the group = 1 configuration: loads dy {1, 28, 28, 32}
 // and w {32, 3, 3, 3}, runs 3 warm-up plus 100 timed iterations, and compares the
 // dx {1, 28, 28, 3} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32InputGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());
  InitConvParamGroup1FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_1_28_28_32.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 28, 28, 32});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  size_t w_size;
  std::string w_path = "./test_data/conv/convfp32_w_32_3_3_3.bin";
  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
  ASSERT_TRUE(w_data != nullptr);
  std::vector<int> dim_dw({32, 3, 3, 3});
  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  w_tensor.SetData(w_data);

  size_t output_data_size =
    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
  auto dx_data = new float[output_data_size];
  std::vector<int> dim_dx({1, 28, 28, 3});
  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
  dx_tensor.SetData(dx_data);

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
  // runtime part

  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dx_1_28_28_3.bin";
  auto res = lite::CompareRelativeOutput(dx_data, output_path);
  EXPECT_EQ(res, 0);

  delete kernel;
  // The message previously said "Filter Grad", copied from the filter-gradient test.
  MS_LOG(INFO) << "TestConvolutionGradFp32 Input Grad passed";
 }

 // Conv2DGradFilter fp32 kernel with the 3-group configuration: loads dy {1, 28, 28, 18}
 // and x {1, 28, 28, 3}, runs 3 warm-up plus 100 timed iterations, and compares the
 // dw {18, 3, 3, 1} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32GroupFilterGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());
  InitConvParamGroup3FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_g3_1_28_28_18.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 28, 28, 18});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  size_t output_data_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
                            conv_param->input_channel_ / conv_param->group_;

  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_x_g3_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  ASSERT_TRUE(input_data != nullptr);
  std::vector<int> dim_x({1, 28, 28, 3});
  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
  x_tensor.SetData(input_data);

  auto dw_data = new float[output_data_size];
  std::vector<int> dim_dw({18, 3, 3, 1});
  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  dw_tensor.SetData(dw_data);
  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dw_g3_18_3_3_3.bin";
  auto res = lite::CompareRelativeOutput(dw_data, output_path);
  EXPECT_EQ(res, 0);

  // The buffers passed to SetData are not freed here, same as before this change.
  // NOTE(review): presumably the tensors release them - confirm.
  delete kernel;
  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
 }

 // Conv2DGradInput fp32 kernel with the 3-group configuration: loads dy {1, 28, 28, 18}
 // and w {18, 3, 3, 1}, runs 3 warm-up plus 100 timed iterations, and compares the
 // dx {1, 28, 28, 3} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32GroupInputGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());
  InitConvParamGroup3FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_g3_1_28_28_18.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 28, 28, 18});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  size_t w_size;
  std::string w_path = "./test_data/conv/convfp32_w_g3_18_3_3_3.bin";
  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
  ASSERT_TRUE(w_data != nullptr);
  std::vector<int> dim_dw({18, 3, 3, 1});
  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  w_tensor.SetData(w_data);

  size_t output_data_size =
    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
  auto dx_data = new float[output_data_size];
  std::vector<int> dim_dx({1, 28, 28, 3});
  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
  dx_tensor.SetData(dx_data);

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
  // runtime part

  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dx_g3_1_28_28_3.bin";
  auto res = lite::CompareRelativeOutput(dx_data, output_path);
  EXPECT_EQ(res, 0);

  delete kernel;
  // The message previously said "Filter Grad", copied from the filter-gradient test.
  MS_LOG(INFO) << "TestConvolutionGradFp32 Input Grad passed";
 }

 // Conv2DGradFilter fp32 kernel with the 3-group, dilation-2 configuration: loads
 // dy {1, 26, 26, 18} and x {1, 28, 28, 3}, runs 3 warm-up plus 100 timed iterations,
 // and compares the dw {18, 3, 3, 1} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationFilterGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());

  InitConvParamGroup3Dilation2FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 26, 26, 18});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  size_t output_data_size = conv_param->output_channel_ * conv_param->kernel_h_ * conv_param->kernel_w_ *
                            conv_param->input_channel_ / conv_param->group_;

  size_t input_size;
  std::string input_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  ASSERT_TRUE(input_data != nullptr);
  std::vector<int> dim_x({1, 28, 28, 3});
  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
  x_tensor.SetData(input_data);

  auto dw_data = new float[output_data_size];
  std::vector<int> dim_dw({18, 3, 3, 1});
  lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw);
  dw_tensor.SetData(dw_data);
  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradFilter};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin";
  auto res = lite::CompareRelativeOutput(dw_data, output_path);
  EXPECT_EQ(res, 0);
  // The buffers passed to SetData are not freed here, same as before this change.
  // NOTE(review): presumably the tensors release them - confirm.
  delete kernel;
  MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed";
 }

 // Conv2DGradInput fp32 kernel with the 3-group, dilation-2 configuration: loads
 // dy {1, 26, 26, 18} and w {18, 3, 3, 1}, runs 3 warm-up plus 100 timed iterations,
 // and compares the dx {1, 28, 28, 3} output with the reference file.
 TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationInputGrad) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<ConvParameter> conv_param(new ConvParameter());
  InitConvParamGroup3Dilation2FP32(conv_param.get());

  size_t dy_size;
  std::string dy_path = "./test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &dy_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 26, 26, 18});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  size_t w_size;
  std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin";
  auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
  ASSERT_TRUE(w_data != nullptr);
  std::vector<int> dim_w({18, 3, 3, 1});
  lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w);
  w_tensor.SetData(w_data);

  size_t output_data_size =
    conv_param->input_batch_ * conv_param->input_h_ * conv_param->input_w_ * conv_param->input_channel_;
  auto dx_data = new float[output_data_size];
  std::vector<int> dim_dx({1, 28, 28, 3});
  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
  dx_tensor.SetData(dx_data);

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &w_tensor};
  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};
  // runtime part

  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Conv2DGradInput};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel != nullptr);

  // warm up loop
  for (int i = 0; i < 3; i++) {
    kernel->Run();
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    kernel->Run();
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  std::string output_path = "./test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin";
  auto res = lite::CompareRelativeOutput(dx_data, output_path);
  EXPECT_EQ(res, 0);

  delete kernel;
  // The message previously said "Filter Grad", copied from the filter-gradient test.
  MS_LOG(INFO) << "TestConvolutionGradFp32 Input Grad passed";
 }

 // TEST_F(TestConvolutionGradFp32, ConvGroupDilation) {
 //   // prepare stage
 //   auto conv_param = new ConvParameter();
 //   InitConvParamGroup3Dilation2FP32(conv_param);

 //   size_t x_size;
 //   std::string x_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin";
 //   auto x_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(x_path.c_str(), &x_size));
 //   std::vector<int> dim_x({1, 28, 28, 3});
 //   tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
 //   x_tensor.SetData(x_data);

 //   size_t w_size;
 //   std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin";
 //   auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size));
 //   std::vector<int> dim_w({18, 3, 3, 1});
 //   tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w);
 //   w_tensor.SetData(w_data);

 //   size_t output_data_size =
 //     conv_param->output_batch_ * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_;
 //   auto y_data = new float[output_data_size];
 //   std::vector<int> dim_y({1, 26, 26, 18});
 //   tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y);
 //   y_tensor.SetData(y_data);

 //   std::vector<tensor::Tensor *> inputs = {&x_tensor, &w_tensor};
 //   std::vector<tensor::Tensor *> outputs = {&y_tensor};
 //   // runtime part

 //   printf("Calculating runtime cost...\n");
 //   uint64_t time_avg = 0;

 //   lite::Context context;
 //   ;
 //   context.deviceCtx.type = lite::DT_CPU;
 //   context.threadNum = 1;

 //   kernel::KernelKey desc = {kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Conv2D};
 //   auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc);
 //   auto kernel = creator(inputs, outputs, (OpParameter *)conv_param, &context, desc);

 //   kernel->train();
 //   EXPECT_EQ(kernel->is_train(), 1);

 //   // warm up loop
 //   for (int i = 0; i < 3; i++) {
 //     kernel->Run();
 //   }

 //   int loop_count = 100;
 //   auto time_start = mindspore::lite::GetTimeUs();
 //   for (int i = 0; i < loop_count; i++) {
 //     kernel->Run();
 //   }
 //   auto time_end = mindspore::lite::GetTimeUs();
 //   auto cost = time_end - time_start;
 //   time_avg = cost / loop_count;
 //   printf("single thread running time : %f ms\n", time_avg / 1000.0f);

 //   std::string output_path = "./test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin";
 //   auto res = lite::CompareRelativeOutput(y_data, output_path);
 //   EXPECT_EQ(res, 0);

 //   delete kernel;
 //   delete conv_param;

 //   MS_LOG(INFO) << "TestConvolutionFp32 Filter Grad passed";
 // }

 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pooling_grad_fp32_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/fp32/pooling_grad_fp32_tests.cc
@@ -0,0 +1,332 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <iostream>
 #include <memory>
 #include "mindspore/lite/include/context.h"
 #include "utils/log_adapter.h"
 #include "common/common_test.h"
 #include "mindspore/lite/src/kernel_registry.h"
 #include "src/common/utils.h"
 #include "src/common/file_utils.h"
 #include "src/runtime/kernel/arm/fp32/pooling_grad.h"
 #include "src/runtime/kernel/arm/opclib/fp32/pooling_grad.h"

 namespace mindspore {
 // Fixture for the fp32 pooling gradient tests; it adds no state on top of mindspore::Common.
 class TestPoolingGradFp32 : public mindspore::Common {
 public:
  TestPoolingGradFp32() = default;
 };

 // Fills *pooling_param with the baseline configuration for the tests in this file:
 // 1x28x28x3 input, 1x28x28 output with output_channel_ = 32, 3x3 window, stride 1,
 // padding 1 on all four sides and one thread.
 // The enabled tests in this file reset output_channel_ to 3 after calling this.
 void InitPoolingParamFP32(PoolingParameter *pooling_param) {
  PoolingParameter &cfg = *pooling_param;

  // input geometry
  cfg.input_batch_ = 1;
  cfg.input_h_ = 28;
  cfg.input_w_ = 28;
  cfg.input_channel_ = 3;

  // output geometry
  cfg.output_batch_ = 1;
  cfg.output_h_ = 28;
  cfg.output_w_ = 28;
  cfg.output_channel_ = 32;

  // pooling window and stride
  cfg.window_h_ = 3;
  cfg.window_w_ = 3;
  cfg.stride_h_ = 1;
  cfg.stride_w_ = 1;

  // padding: up, down, left, right
  cfg.pad_u_ = 1;
  cfg.pad_d_ = 1;
  cfg.pad_l_ = 1;
  cfg.pad_r_ = 1;

  cfg.thread_num_ = 1;
 }

 // Calls the AvgPoolingGrad routine directly (no kernel object) on a dy buffer loaded from
 // disk: 3 warm-up plus 100 timed iterations, prints the first 20 outputs and hands the
 // result to lite::CompareOutput with the reference file.
 TEST_F(TestPoolingGradFp32, AvgPoolingGradFp32) {
  // prepare stage
  // Smart-pointer ownership so the parameter and the output buffer are released on every
  // exit path, including the early return taken by the ASSERT_* check below.
  std::unique_ptr<PoolingParameter> pooling_param(new PoolingParameter());
  InitPoolingParamFP32(pooling_param.get());
  pooling_param->output_channel_ = 3;

  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  size_t output_data_size =
    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;

  size_t input_size;
  std::string input_path = "./test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin";
  auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash below.
  ASSERT_TRUE(input_data != nullptr);

  std::unique_ptr<float[]> output_data(new float[output_data_size]);
  // warm up loop
  for (int i = 0; i < 3; i++) {
    AvgPoolingGrad(input_data, output_data.get(), pooling_param.get());
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    AvgPoolingGrad(input_data, output_data.get(), pooling_param.get());
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin";
  // NOTE(review): the comparison result is not asserted on, so a mismatch cannot fail this test.
  lite::CompareOutput(output_data.get(), output_path);

  // NOTE(review): released with scalar delete, unchanged; confirm this matches how
  // ReadFile allocates the buffer.
  delete input_data;
  MS_LOG(INFO) << "TestAvgPoolingGradFp32 passed";
 }

 // Runs the fp32 PoolingGrad CPU kernel through the kernel registry with dy and x tensors
 // of shape {1, 28, 28, 3}, prints the first 20 outputs and hands the result to
 // lite::CompareOutput with the reference file.
 TEST_F(TestPoolingGradFp32, AvgPoolingKernelGradFp32) {
  // prepare stage
  // Smart-pointer ownership so the parameter is released on every exit path,
  // including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<PoolingParameter> pooling_param(new PoolingParameter());
  InitPoolingParamFP32(pooling_param.get());

  pooling_param->output_channel_ = 3;

  // runtime part
  printf("Calculating runtime cost...\n");
  size_t output_data_size =
    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;

  size_t input_size;
  std::string input_path = "./test_data/pooling/avgpoolgradfp32_1_dy_1_28_28_3.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash in the kernel.
  ASSERT_TRUE(dy_data != nullptr);
  std::vector<int> dim_dy({1, 28, 28, 3});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  std::string input1_path = "./test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin";
  auto x_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1_path.c_str(), &input_size));
  ASSERT_TRUE(x_data != nullptr);
  std::vector<int> dim_x({1, 28, 28, 3});
  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
  x_tensor.SetData(x_data);

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor};

  auto output_data = new float[output_data_size];
  std::vector<int> dim_dx({1, 28, 28, 3});
  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
  dx_tensor.SetData(output_data);
  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad};

  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  // A missing registration or a failed construction should fail the test, not crash the binary.
  ASSERT_TRUE(creator != nullptr);
  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(pooling_param.get()), nullptr, desc);
  ASSERT_TRUE(kernel_obj != nullptr);

  kernel_obj->Run();

  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin";
  // NOTE(review): the comparison result is not asserted on, so a mismatch cannot fail this test.
  lite::CompareOutput(output_data, output_path);

  // The buffers passed to SetData are not freed here, same as before this change.
  // NOTE(review): presumably the tensors release them - confirm.
  // NOTE(review): kernel_obj is not released either; confirm its ownership of the parameter first.
  // The message previously duplicated the one logged by AvgPoolingGradFp32.
  MS_LOG(INFO) << "TestAvgPoolingKernelGradFp32 passed";
 }

 // Calls the MaxPoolingGrad routine directly (no kernel object) with dy and index buffers
 // loaded from disk: 3 warm-up plus 100 timed iterations, prints the first 20 outputs and
 // hands the result to lite::CompareOutput with the reference file.
 TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) {
  // prepare stage
  // Smart-pointer ownership so the parameter and the locally allocated buffers are released
  // on every exit path, including the early returns taken by the ASSERT_* checks below.
  std::unique_ptr<PoolingParameter> pooling_param(new PoolingParameter());
  InitPoolingParamFP32(pooling_param.get());
  pooling_param->output_channel_ = 3;
  pooling_param->avg_pooling_ = false;
  pooling_param->max_pooling_ = true;
  // runtime part
  printf("Calculating runtime cost...\n");
  uint64_t time_avg = 0;
  size_t output_data_size =
    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;

  size_t input_size;
  std::string i_path = "./test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin";
  auto ill_data = reinterpret_cast<int64_t *>(mindspore::lite::ReadFile(i_path.c_str(), &input_size));
  // A null buffer (presumably an unreadable data file) should fail the test, not crash below.
  ASSERT_TRUE(ill_data != nullptr);
  // The index file is read as 64-bit integers and narrowed into the int buffer passed to
  // MaxPoolingGrad. This buffer was previously allocated with new[] and never freed.
  std::unique_ptr<int[]> i_data(new int[output_data_size]);
  for (size_t i = 0; i < output_data_size; i++) {
    i_data[i] = static_cast<int>(ill_data[i]);
  }

  std::string dy_path = "./test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin";
  auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &input_size));
  ASSERT_TRUE(dy_data != nullptr);

  std::unique_ptr<float[]> output_data(new float[output_data_size]);
  // warm up loop
  for (int i = 0; i < 3; i++) {
    MaxPoolingGrad(dy_data, i_data.get(), output_data.get(), pooling_param.get());
  }

  int loop_count = 100;
  auto time_start = mindspore::lite::GetTimeUs();
  for (int i = 0; i < loop_count; i++) {
    MaxPoolingGrad(dy_data, i_data.get(), output_data.get(), pooling_param.get());
  }
  auto time_end = mindspore::lite::GetTimeUs();
  auto cost = time_end - time_start;
  time_avg = cost / loop_count;
  printf("single thread running time : %f ms\n", time_avg / 1000.0f);

  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin";
  // NOTE(review): the comparison result is not asserted on, so a mismatch cannot fail this test.
  lite::CompareOutput(output_data.get(), output_path);

  // NOTE(review): the two ReadFile buffers (ill_data, dy_data) are still not released; how
  // ReadFile allocates them is not visible here, so confirm the matching deallocation first.
  MS_LOG(INFO) << "TestMaxPoolingGradFp32 passed";
 }

 #if 0
 // Disabled test (compiled out by the surrounding #if 0).
 // Two stages:
 //   1. Runs a Pooling kernel configured for max pooling on a {1, 30, 30, 3} input,
 //      with two output tensors: y and an int32 index tensor, both {1, 10, 10, 3}.
 //   2. Feeds dy (loaded from disk) together with that index tensor to a PoolingGrad
 //      kernel and passes the dx buffer to lite::CompareOutput with the reference file.
 // NOTE(review): dy/dx tensors are declared as {1, 3, H, W} while x/y/indices use
 // {1, H, W, 3}; confirm the intended layout before re-enabling.
 // NOTE(review): neither kernel object nor the `maxpool` parameter is released, and the
 // CompareOutput result is not asserted on.
 TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) {
  // prepare stage
  // Parameters for the forward max-pooling kernel: 30x30 input, 10x10 output, stride 3, no padding.
  auto maxpool = new PoolingParameter();
  InitPoolingParamFP32(maxpool);
  maxpool->avg_pooling_ = false;
  maxpool->max_pooling_ = true;
  maxpool->input_h_ = 30;
  maxpool->input_w_ = 30;
  maxpool->input_channel_ = 3;

  maxpool->output_batch_ = 1;
  maxpool->output_h_ = 10;
  maxpool->output_w_ = 10;
  maxpool->output_channel_ = 3;
  maxpool->stride_h_ = 3;
  maxpool->stride_w_ = 3;

  maxpool->pad_u_ = 0;
  maxpool->pad_d_ = 0;
  maxpool->pad_l_ = 0;
  maxpool->pad_r_ = 0;

  size_t input_size;
  size_t y_data_size = maxpool->output_batch_ * maxpool->output_channel_ * maxpool->output_h_ * maxpool->output_w_;

  auto x_data = reinterpret_cast<float *>(
    mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_2_x_1_30_30_3.bin", &input_size));
  std::vector<int> dim_x({1, 30, 30, 3});
  lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x);
  x_tensor.SetData(x_data);
  std::vector<lite::tensor::Tensor *> maxpool_inputs = {&x_tensor};

  auto y_data = new float[y_data_size];
  std::vector<int> dim_y({1, 10, 10, 3});
  lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y);
  y_tensor.SetData(y_data);

  // Second forward output: index tensor, later passed as an input to the gradient kernel.
  auto ind_data = new int[y_data_size];
  lite::tensor::Tensor ind_tensor(TypeId::kNumberTypeInt32, dim_y);
  ind_tensor.SetData(ind_data);

  std::vector<lite::tensor::Tensor *> maxpool_outputs = {&y_tensor, &ind_tensor};

  kernel::KernelKey maxpool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_Pooling};
  auto maxpool_creator = lite::KernelRegistry::GetInstance()->GetCreator(maxpool_desc);
  auto maxpoolobj = maxpool_creator(maxpool_inputs, maxpool_outputs, reinterpret_cast<OpParameter *>(maxpool),
                                    NULL, maxpool_desc);
  maxpoolobj->Run();

  printf("==================indices data=================\n");
  for (int i = 0; i < 10; i++) {
    std::cout << ind_data[i] << " ,";
  }
  std::cout << std::endl;

  // Parameters for the gradient kernel: input is the 10x10 dy, output is the 30x30 dx.
  // Window, stride and padding keep the InitPoolingParamFP32 defaults here.
  auto pooling_param = new PoolingParameter();
  InitPoolingParamFP32(pooling_param);
  pooling_param->avg_pooling_ = false;
  pooling_param->max_pooling_ = true;
  pooling_param->input_h_ = 10;
  pooling_param->input_w_ = 10;
  pooling_param->input_channel_ = 3;

  pooling_param->output_batch_ = 1;
  pooling_param->output_h_ = 30;
  pooling_param->output_w_ = 30;
  pooling_param->output_channel_ = 3;

  // runtime part
  printf("Calculating runtime cost...\n");
  // uint64_t time_avg = 0;
  size_t output_data_size =
    pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_;

  auto dy_data = reinterpret_cast<float *>(
    mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_2_dy_1_10_10_3.bin", &input_size));
  std::vector<int> dim_dy({1, 3, 10, 10});
  lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy);
  dy_tensor.SetData(dy_data);

  // Alternative (also disabled): load the indices from a file instead of using the
  // index tensor produced by the forward kernel above.
 #if 0
  std::string i_path = "./test_data/pooling/maxpoolgradfp32_2_i_1_3_10_10.bin";
  auto ill_data = reinterpret_cast<int64_t*>(mindspore::lite::ReadFile(i_path.c_str(), &input_size));
  auto i_data = new int[output_data_size];
  for (int i=0; i < output_data_size; i++)
    i_data[i] = static_cast<int>(ill_data[i]);
  std::vector<int> dim_ind({1, 3, 10, 10});
  lite::tensor::Tensor ind_tensor(TypeId::kNumberTypeInt32, dim_ind);
  ind_tensor.SetData(i_data);
 #endif

  std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &ind_tensor};

  auto output_data = new float[output_data_size];
  std::vector<int> dim_dx({1, 3, 30, 30});
  lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx);
  dx_tensor.SetData(output_data);
  std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor};

  kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad};
  auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc);
  auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(pooling_param), NULL, desc);
  kernel_obj->Run();

  printf("==================output data=================\n");
  for (int i = 0; i < 20; i++) {
    std::cout << output_data[i] << " ,";
  }
  std::cout << std::endl;
  std::string output_path = "./test_data/pooling/maxpoolgradfp32_2_dx_1_30_30_3.bin";
  lite::CompareOutput(output_data, output_path);

  // delete input_data;
  // delete[] output_data;
  delete pooling_param;
  MS_LOG(INFO) << "TestMaxPoolingKernelGradFp32 passed";
 }
 #endif  // if 0 before MaxPoolingKernelGradFp32
 }  // namespace mindspore
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_out_50.bin
@@ -0,0 +1 @@
  "x>#Ď>K9�>pR
>)J	>¤á4>™K>¤ĹZ>ńß>ŢÝ>ńńL>‚µ=ËQ>Ń*^>MÖ>&¶>6>Sş>đ*�>ÉN>Ë-ý=Ó+L>ÜvK>+A}>wě^>$ďQ>´Ús>ł/W>ó×Ď=Mţ'>9[*>#%†<#�>CÖ>>ˇ‚>$ÁŻ=Gţj>ňě>Ă7*>´Ă2>łĆ6>•™>ń1p>ős#>Y)>çôk>9ď÷=´ŘŔ=lQ0>ű—w>
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_x_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_x_50.bin
@@ -0,0 +1 @@
 èM®?Ç·ú¾Ôå?	
¿H2¿|Ý7>0á?dyX?C�.¿\fT¾¼@?ªÍ³¿Öö¾Àg?Lwñ¾«˜Å¾E�¾Š9&¿7AÎ?†T?öXF¿4Å?â–?žÒ¹?(k?´0?¬¤?¤VH?-–¿Tz@½&À²»Ç"-ÀÞ1¿£wñ¾šÕË?·Fº¿¼�?�«ç¾¢D¼¶Â>’øY>ãÌ¿_pœ?ÄØ¾í]ç¼	ç’?À%R¿5§¿KsË=ó?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hsigmoid_yt_50.bin
@@ -0,0 +1 @@
 Ь╨?╢6V?ЯUл?╗ШS?=ОM?;╘┤?3P≤?;╓?ИоE?мLn?u╣≥?▐!?╠ЗV?═╕?sаW?9_?яe?}≈H?hюд?▌ ?XБ=?ч ≥?%≥≤?ЮП╫?Y1╖?[s²?Д╤?фc║?ЖА?tЩ{?у┬?╣7и=╣DK?eаW?щЯц?шп?╣>╟?kcY?╓S?г├?┴?_fQ?u%╢?П-u?≈}?╜В╟?kС9?┤╒?=└?Э╠╧?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_out_50.bin
@@ -0,0 +1 @@
 v╚≈=qиы╫ьBs>Эл╬лй╬─вQ=@ъ<U█и?P2╪4[Ф?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_x_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_x_50.bin
@@ -0,0 +1 @@
 /�ηΏΦ�ƒΏ™5±>Β"Ώ†¥\Ώ��Ώ=`ΞΏώ;�?σε»Ώ�¥©?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/hswish_yt_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_out_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_y_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_y_50.bin
@@ -0,0 +1 @@
 грНїЖ&Н5SaНГЏА?tЩ?Џ�@WЕ,М�СО2еН&�8?;VММ�В?цЉЁ?$х�?5ЮљМpNђМъF7О:�Ё?�5VН:Ю�?їm
Нѕ,!@фљО`|в>VўЕНи�№М
?_бB?0дНвУ"?уЬяН�!>%џ=Ћ�,?��Мѓ>Йа�?�;?ъqЙНGh�?7џЩ<бНР�U>=дх?р-Нaыp?Ђеg?��й>Цr@XА>
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/lrelu_yt_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_out_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_y_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_y_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu6_yt_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_out_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_y_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_y_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/relu_yt_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_out_50.bin
@@ -0,0 +1 @@
 âË]>Eí>òJn>bK>Œ8£=Š<¾P—>&”“>óg>]±Ð<WBX=S¯t>ä;�>ã¤Q>I�>¢¦º=\ƒ>ºéS>	Å€=äC*>šK=ën>IyŠ>„Š“>¾l‚>/=—>rp>Ÿ‹”>ü«>Ž(	>ûÁ[>-ï–>{ëj=Ç4’>C¾”>«eŽ>D”>B“=ü£=x”>/m�>v¾j>P–>Ävï=PÊ•>“·=�3>vN=œ �>Ó‚—>
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_y_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_y_50.bin
@@ -0,0 +1 @@
 wá>õ“¢>XOï>?áÑ>ƒ�h>é¿Å=¸¡%?±o:?9¸é>"qö=ù¨7>‡ õ>??µ ×>9Œ?Á´{>št?D2\?½J>nž¸>°�1>ÈÞï>�OF?/Ç?7y?J0?ÍeT?A?F$¦>'¾Ÿ>Abß>¹Œ#?"m@>Ë<?›8?ï?•?ýÑZ>—$i>8?à*C?�)ì>rï3?á†’>óX?9y>¿ª¿>^ó2>SÔ??w!'?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/sigmoid_yt_50.bin
@@ -0,0 +1 @@
 wa?ő“"?XOo??áQ?��č>éżE>¸ˇĄ?±oş?9¸i?"qv>ů¨·>‡ u??�?µ W?9Ś“?Á´ű>št�?D2Ü?˝Ę>nž8?°Ź±>ČŢo?ŤOĆ?/Ç™?7y‚?J°?ÍeÔ?Aś?F$&?'ľ?Ab_?ąŚŁ?"mŔ>ËĽ?›¸?ď�?•›?ýŃÚ>—$é>¸?ŕ*Ă?�)l?rďł?á†?óXź?9ů>żŞ??^ó˛>SÔż?w!§?
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_out_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_out_50.bin
@@ -0,0 +1,2 @@
 „ù@?^û*¿Su>?‹“¿(1?YÙ?O]Í>8�©>yåÍ½â³h>Y·:¿¬×Ÿ<e
 ?@?C?È‘¾®�C?6GU¾páž>¶_=¿I³¿0`Í>0¾>ÝŽ9?Úÿ;?Gs*>e3>£”?¯Ê‘>œ»;?(ô,?õ&¿3*ˆ¾©Ü?ŠŸC?çC¿w<2?š�ð=ôKý>%HC¿ß¾ñ%8?òMâ>£¥œ¾ºñ~>'uû>Jß¿ÙI>^4Y¾uZ?ó¿
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_y_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_y_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_yt_50.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/activationGrad/tanh_yt_50.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_32_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_32_3_3_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_18_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_18_3_3_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_1_28_28_32.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_1_28_28_32.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_1_28_28_18.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_1_28_28_18.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_dy_g3_d2_1_26_26_18.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_32_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_32_3_3_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_18_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_18_3_3_3.bin
@@ -0,0 +1,4 @@
 ‹F
½.¸¿NÓ2¾œ³Õ?ó»¾©`?°�Í½ÕåØ¿S¿Ä¯¿”2x¿¥R=}è%À–Tá?9¢>¾Ró?E„Ã¿†Ö?ÇÕÜ>©´?®Á@< ƒ¿ˆ�*¿Fü‰¿sÑ?¹Ýh¿ˆêý>ðÛ¾i) ¿�W>+
Ò;ôÎä=y«@\ô?ð¿V=~?ú)Ð¾‡Ï¬?HF}?åžÕ¿˜Õ«?Fê“¿±E
 ?ŽG¿¾»Ã¿#Î¼¿>P¨D>‘¼È>J‹Å?gNð¾Y,	<¹Öˆ¿�–u?Y_À"¶�¾ ñ4?À¢f¾ôœ¿x‹¾	YÀ7ü;Â¡Ì¾…ÑÚ?Ö)™?£°©?€ì…?Ö]@-Ç/¼z²b?Áäï¿Y¸¾ñ e?MÖý¾	/6?¦"¿ë‡?œt«?ŽØT?;	-¾½Ø1?,6?¿•¨¿.ª>nÉÞ>8«D?�Ç¾
 ãF¿Ö+j?~B?
 ê¿»¤P¾æ›Ç?Šœ?t ¾½ªek?›ûI?W³J>®ó&?æÑ ?;ñ;ƒéš¿Êjä¿¾=¾í±¿È¾sg»?¡ÝÀ¿[kÄ?‚âr?Ý–c>.þ¾äÏl¼žjy?¥DÊ>S«î¼“¬Â¿º‡?lìÒ?rS¾Ùq´¾Åä?#m(@±_?>±¿l ‚¿Ž%6À˜¢?’<j¾¹z>h%¿Ðké¾>Þ=4 ù¿¨Œö?ªÅ‹¼o´J¾—û¾ �¿¥§s>•¾fW—?8c;?‚ä�?Æk»¾Š:º?bQ1¿ƒ¤Y>ypþ>½nW=úz¿|S:?P‚’?çrð?€KŠ?å˜¿k†µ¿wØâ>‘„-¼æ«>³Ví?Å~>
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_10x4.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_10x4.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_4x10.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_a_4x10.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_10x5.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_10x5.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_5x10.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_b_5x10.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_c_4x5.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/matmul/matmulfp32_c_4x5.bin
@@ -0,0 +1 @@
 .Àø@8�|À¸A-=,Ëú¿fAQÀ>2@du¤Ài}?ÿÃÀtÏ4@œˆš@2ªÊÀüzN@Ò¥¿Ô£x@Þ&ó½(Ó‚½‚¸e@�g�¼
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx1_5_4_6.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_10_dy_5_4_6.bin
@@ -0,0 +1 @@
 Ñ&J¿vÿ�ÀB×AoLÝ?ˆÍƒÀI?ïÁ…óÌ¿ÜÐA6µÔ½ÏS¹¾?Á…—¸Áãþ�Àd>‘À5²®?±Çi¾’’Ó¿`ó@Úý ¾¿uº@G×Ý@`M>Aü¹Àv>B)¦ä>c”¾ÉÁ@$Ì/AwèŒA‡àÁŒˆË¿ß^ ¿ñ0ÁkÜ¾È�£AfûÁèrñ>0xË¿€c¬ÀR†?ÂvuÁÊ=ˆ¿`,>pÅ”?aKÇÀªû@ðóîÁÓÀy?¼×¾ÿdb¿3ž?¹@Ú¤À¬�’¿eK?üÑÀÐ§9¿)ÖÓ?u£u@ýþ?¡"á=ûP’>bæë>vá°@þNl@ÅÃ�ÀUî>otÀ
’ð?Æ*�@y	Í¾³õ	@´ÖAv‘©¿_„¿×
ä¿¶�î¿\�¿™qØ?˜w@–0ÀÝ»¶¾©j€?òï÷ÀAq;Àb½oÀ¡Ž,¿Ò3@ûI`?3sfÀlø‡À€Ê@@IÚ?˜£§¿Ž·?
¡AC_>=ƒL�¿†Äó@„œÏ?	.–¿ïà¢@xy`? ´á¿¿�ŒA3
3¿³Ë‘?Ãn�?.À¾ûžÛ=é\B@/õÀõAç>B_‚¿»�©¾Kø¿–Ç<wÀWcz=m9=
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx1_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx1_4_6.bin
@@ -0,0 +1 @@
 œwÓ¿Öè�?8ðÖ>GF¼?-¢^À�.;?º£÷¿V]«?K“@zè:À} QÀ’Põ¾qÂH¿Í Å¿‚‘ã?¯Õ,@~Ð?LP§>7>ÄP«¿¥¼q@Ÿå¿P«š>ƒ¬·@
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx2_1_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dx2_1_6.bin
@@ -0,0 +1 @@
 L&†À”œ¾h>)AÃ[7?Á.šÀO2Ê@
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dy_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_dy_4_6.bin
@@ -0,0 +1 @@
 œwÓ¿Öè�?8ðÖ>GF¼?-¢^À�.;?º£÷¿V]«?K“@zè:À} QÀ’Põ¾qÂH¿Í Å¿‚‘ã?¯Õ,@~Ð?LP§>7>ÄP«¿¥¼q@Ÿå¿P«š>ƒ¬·@
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x1_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x1_4_6.bin
@@ -0,0 +1 @@
 ˆÁŠ½.%Ž?ý•Ž¿.s?d¤“¿-f×=<V¾nß”?`Dz?�tŸ¿´"†¿?ýÿ¾e|»>�~�¾Þ¾šIÈ?ˆ
³??³Æ½ÌY?ü¼3¾B‘?Ž¯X½f¦<?ä'@
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x2_1_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_1_x2_1_6.bin
@@ -0,0 +1 @@
 kB����t�?[�[�����T�>
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx1_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx1_4_6.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx2_1_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dx2_1_6.bin
@@ -0,0 +1 @@
 ‹þÀ2øòÀ ©)A%»„ÀdÈ‘À°º	À
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dy_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_2_dy_4_6.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx1_4_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx1_4_6.bin
--- a/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx2_1_6.bin
+++ b/mindspore/lite/test/ut/src/runtime/kernel/arm/test_data/operators/arithmetic_fp32_3_dx2_1_6.bin
@@ -0,0 +1 @@
 ‹þÀ2øòÀ ©)A%»„ÀdÈ‘À°º	À