| @@ -69,6 +69,7 @@ class MS_API Model { | |||
| /// \brief Free MetaGraph in MindSpore Lite Model. | |||
| void FreeMetaGraph(); | |||
| ModelImpl *model_impl() {return model_impl_;} | |||
| protected: | |||
| ModelImpl *model_impl_ = nullptr; | |||
| @@ -0,0 +1,63 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
| #define MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <unordered_map> | |||
| // #include "include/lite_session.h" | |||
| #include "src/lite_session.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class Model; | |||
| } | |||
| namespace lite::tensor { | |||
| class Tensor; | |||
| } | |||
| namespace session { | |||
| class TrainSession : public lite::LiteSession { | |||
| public: | |||
| TrainSession(); | |||
| ~TrainSession() = default; | |||
| int RunGraph(const session::KernelCallBack &before = nullptr, | |||
| const session::KernelCallBack &after = nullptr) override; | |||
| int CompileGraph(lite::Model *model) override; | |||
| virtual void ReplaceOps(); | |||
| virtual void* ExportToBuf(void* buf, size_t* len) const; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> GetOutputs() const; | |||
| std::vector<tensor::MSTensor *> GetOutputsByName(const std::string &node_name) const; | |||
| virtual void train(); | |||
| bool is_train() { return train_mode_; } | |||
| virtual void eval(); | |||
| bool is_eval() { return !train_mode_; } | |||
| protected: | |||
| bool train_mode_ = false; | |||
| lite::Model* model_ = nullptr; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> ext_output_map_; | |||
| // private: | |||
| }; | |||
| } // namespace session | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
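The new header declares the training-session API: CompileGraph, RunGraph, train()/eval() mode switching, and ExportToBuf. A minimal usage sketch follows; the include paths and the Model::Import call are taken from elsewhere in this diff, while the error handling and overall call order are assumptions, not code from this PR.

```cpp
// Hypothetical usage sketch of TrainSession. The include paths are guesses
// (the header above currently includes "src/lite_session.h" directly), and the
// call sequence is an assumption based on the declared API.
#include "include/model.h"
#include "include/train_session.h"

int TrainOneStep(const char *model_buf, size_t size) {
  auto *model = mindspore::lite::Model::Import(model_buf, size);
  if (model == nullptr) return -1;

  mindspore::session::TrainSession session;
  if (session.CompileGraph(model) != 0) return -1;

  session.train();                          // switch kernels to training mode
  if (session.RunGraph() != 0) return -1;   // forward + backward pass

  session.eval();                           // back to inference mode
  return session.RunGraph();
}
```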
| @@ -13,9 +13,14 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "nnacl/activation_grad.h" | |||
| int ReluGrad(float *src0, float *src1, int length, float *dst) { | |||
| #include <math.h> | |||
| #include "nnacl/op_base.h" | |||
| #include "nnacl/fp32/arithmetic.h" | |||
| #include "nnacl/fp32_grad/activation_grad.h" | |||
| #include "nnacl/errorcode.h" | |||
| inline int ReluGrad(float *src0, float *src1, int length, float *dst) { | |||
| for (int i = 0; i < length; ++i) { | |||
| dst[i] = src1[i] > 0 ? 1.0f : 0.0f; | |||
| } | |||
| @@ -13,11 +13,11 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string.h> | |||
| #include <math.h> | |||
| #include <string.h> | |||
| #include "nnacl/fp32_grad/batch_norm.h" | |||
| static void sumSpatialBatch(const float *in, int size, int ch, float *out) { | |||
| void sumSpatialBatch(const float *in, int size, int ch, float *out) { | |||
| memset(out, 0, ch * sizeof(float)); | |||
| for (int i = 0; i < size; i++) { | |||
| const float *ptr = in + i * ch; | |||
| @@ -32,49 +32,53 @@ void scaleBias(const float *scales, int batch, int n, int size, float *output) { | |||
| for (int c = 0; c < n; c++) output[i * n + c] *= scales[c]; | |||
| } | |||
| void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial, | |||
| void normalize(const float *x, const float *mean, const float *invar, int batch, int filters, int spatial, | |||
| float *out) { | |||
| int b, f, i; | |||
| for (b = 0; b < batch; ++b) { | |||
| for (i = 0; i < spatial; ++i) { | |||
| for (f = 0; f < filters; ++f) { | |||
| int index = b * filters * spatial + i * filters + f; | |||
| out[index] = (x[index] - mean[f]) / (sqrt(variance[f]) + eps); | |||
| out[index] = (x[index] - mean[f]) * invar[f]; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates) { | |||
| void backwardScale(const float *x, const float *mean, const float *invar, const float *delta, int batch, | |||
| int n, int size, float *scale_updates) { | |||
| int i, b, f; | |||
| memset(scale_updates, 0, n * sizeof(float)); | |||
| for (b = 0; b < batch; ++b) { | |||
| for (i = 0; i < size; ++i) { | |||
| for (f = 0; f < n; ++f) { | |||
| int index = (b * size + i) * n + f; | |||
| scale_updates[f] += delta[index] * x_norm[index]; | |||
| float x_norm = (x[index] - mean[f]) * invar[f]; | |||
| scale_updates[f] += delta[index] * x_norm; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| void meanVar(const float *in, int batch, int spatial, int ch, float *mean, float *var) { | |||
| void meanVar(const float *in, int batch, int spatial, int ch, float eps, float *mean, float *invar) { | |||
| float N = batch * spatial; | |||
| sumSpatialBatch(in, N, ch, mean); | |||
| for (int f = 0; f < ch; ++f) mean[f] /= N; | |||
| memset(var, 0, ch * sizeof(float)); | |||
| for (int i = 0; i < N; i++) { | |||
| for (int f = 0; f < ch; f++) { | |||
| float x = in[i * ch + f]; | |||
| var[f] += (x - mean[f]) * (x - mean[f]); | |||
| for (int f = 0; f < ch; ++f) { | |||
| mean[f] /= N; | |||
| } | |||
| for (int f = 0; f < ch; f++) { | |||
| float tvar = 0; | |||
| for (int i = 0; i < N; i++) { | |||
| float x = in[i * ch + f]; | |||
| tvar += (x - mean[f]) * (x - mean[f]); | |||
| } | |||
| invar[f] = 1.0f / sqrt(tvar / N + eps); | |||
| } | |||
| for (int f = 0; f < ch; f++) var[f] /= N; | |||
| } | |||
| void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta) { | |||
| void meanDelta(float *yt, int size, int ch, float *invar, float *mean_delta) { | |||
| sumSpatialBatch(yt, size, ch, mean_delta); | |||
| for (int i = 0; i < ch; i++) mean_delta[i] *= -1.f / sqrt((variance[i] + eps)); | |||
| for (int i = 0; i < ch; i++) mean_delta[i] *= -invar[i]; | |||
| } | |||
| void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial, | |||
| @@ -93,8 +97,8 @@ void meanAdd(const float *x, const float *mean, const float *variance_delta, int | |||
| } | |||
| } | |||
| void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int filters, | |||
| int spatial, float eps, float *variance_delta) { | |||
| void varianceDelta(const float *x, const float *delta, const float *mean, const float *invar, int batch, int filters, | |||
| int spatial, float *variance_delta) { | |||
| int i, k; | |||
| memset(variance_delta, 0, filters * sizeof(float)); | |||
| for (k = 0; k < batch * spatial; k++) { | |||
| @@ -103,16 +107,16 @@ void varianceDelta(const float *x, const float *delta, const float *mean, const | |||
| variance_delta[i] += delta[index] * (x[index] - mean[i]); | |||
| } | |||
| } | |||
| for (i = 0; i < filters; i++) variance_delta[i] *= -.5 * pow(variance[i] + eps, (-3.f / 2.f)); | |||
| for (i = 0; i < filters; i++) variance_delta[i] *= -0.5f * invar[i] * invar[i] * invar[i]; | |||
| } | |||
| void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta, | |||
| const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta) { | |||
| void NormalizeDelta(const float *x, const float *mean, const float *invar, const float *mean_delta, | |||
| const float *variance_delta, int batch, int filters, int spatial, float *delta) { | |||
| int f, k; | |||
| for (k = 0; k < batch * spatial; k++) { | |||
| for (f = 0; f < filters; f++) { | |||
| int index = k * filters + f; | |||
| delta[index] = delta[index] * 1. / (sqrt(variance[f] + eps)) + | |||
| delta[index] = delta[index] * invar[f] + | |||
| variance_delta[f] * 2. * (x[index] - mean[f]) / (spatial * batch) + | |||
| mean_delta[f] / (spatial * batch); | |||
| } | |||
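The batch-norm backward kernels above are refactored to take a precomputed inverse standard deviation (`invar`) in place of `variance` plus `eps`. The substitution each rewrite relies on, given `invar[f] = 1.0f / sqrt(var[f] + eps)` as computed in `meanVar`, is:

```latex
\mathrm{invar}_f = (\sigma_f^2 + \epsilon)^{-1/2}
\;\Rightarrow\;
\hat{x} = \frac{x - \mu_f}{\sqrt{\sigma_f^2 + \epsilon}} = (x - \mu_f)\,\mathrm{invar}_f,
\qquad
(\sigma_f^2 + \epsilon)^{-3/2} = \mathrm{invar}_f^{3}.
```

The last identity is why `varianceDelta` scales by `invar[i] * invar[i] * invar[i]` (a multiplication, not a division).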
| @@ -17,28 +17,33 @@ | |||
| #ifndef MINDSPORE_LITE_NNACL_FP32_BATCH_NORM_H_ | |||
| #define MINDSPORE_LITE_NNACL_FP32_BATCH_NORM_H_ | |||
| typedef struct bnParameter { | |||
| int batch; | |||
| int channels; | |||
| int spatial; | |||
| float eps; | |||
| } bnParameter; | |||
| #include "nnacl/op_base.h" | |||
| typedef struct BNGradParameter { | |||
| OpParameter op_parameter_; | |||
| float epsilon_; | |||
| float momentum_; | |||
| } BNGradParameter; | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| #endif | |||
| void sumSpatialBatch(const float *in, int size, int ch, float *out); | |||
| void scaleBias(const float *scales, int batch, int n, int size, float *output); | |||
| void normalize(const float *x, const float *mean, const float *variance, float eps, int batch, int filters, int spatial, | |||
| void normalize(const float *x, const float *mean, const float *invar, int batch, int filters, int spatial, | |||
| float *out); | |||
| void backwardScale(const float *x_norm, const float *delta, int batch, int n, int size, float *scale_updates); | |||
| void meanVar(const float *in, int batch, int size, int ch, float *mean, float *var); | |||
| void meanDelta(float *yt, int size, int ch, float eps, float *variance, float *mean_delta); | |||
| void varianceDelta(const float *x, const float *delta, const float *mean, const float *variance, int batch, int ch, | |||
| int spatial, float eps, float *variance_delta); | |||
| void backwardScale(const float *x, const float *mean, const float *invar, const float *delta, int batch, | |||
| int n, int size, float *scale_updates); | |||
| void meanVar(const float *in, int batch, int size, int ch, float eps, float *mean, float *invar); | |||
| void meanDelta(float *yt, int size, int ch, float *invar, float *mean_delta); | |||
| void varianceDelta(const float *x, const float *delta, const float *mean, const float *invar, int batch, int ch, | |||
| int spatial, float *variance_delta); | |||
| void meanAdd(const float *x, const float *mean, const float *variance_delta, int batch, int filters, int spatial, | |||
| float *mean_add, float *mean_delta); | |||
| void NormalizeDelta(const float *x, const float *mean, const float *variance, const float *mean_delta, | |||
| const float *variance_delta, int batch, int filters, int spatial, float eps, float *delta); | |||
| void NormalizeDelta(const float *x, const float *mean, const float *invar, const float *mean_delta, | |||
| const float *variance_delta, int batch, int filters, int spatial, float *delta); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -125,9 +125,9 @@ void im2row_hwc(const float *in_data, float *data_row, ConvParameter *conv_param | |||
| } | |||
| void col2im_hwc(const float *data_col, float *data_im, ConvParameter *conv_param) { | |||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_w_; | |||
| const int pad_left = /*conv_param->pad_l_*/ conv_param->pad_l_; | |||
| // const int pad_right = /*conv_param->pad_r_*/conv_param->pad_w_; | |||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_h_; | |||
| const int pad_up = /*conv_param->pad_u_*/ conv_param->pad_u_; | |||
| // const int pad_down = /*conv_param->pad_d/*/conv_param->pad_h_; | |||
| const int stride_h = conv_param->stride_h_; | |||
| @@ -13,7 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <cstdint> | |||
| #include <stdint.h> | |||
| #include <float.h> | |||
| #include "nnacl/fp32_grad/pooling_grad.h" | |||
| void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param) { | |||
| @@ -31,33 +32,37 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter | |||
| int output_batch = pooling_param->output_batch_; | |||
| const float *inPtr = NULL; | |||
| for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0; | |||
| // for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0; | |||
| for (int i = 0; i < in_h * in_w * channel * output_batch; i++) output_ptr[i] = 0.0; | |||
| float kk = (float)(win_h * win_w); | |||
| for (uint16_t ib = 0; ib < output_batch; ib++) { | |||
| float *out; | |||
| out = &output_ptr[(ib * output_h * output_w)]; | |||
| inPtr = (float *)(&input_ptr[(ib * in_h * in_w)]); | |||
| // out = &output_ptr[(ib * output_h * output_w)]; | |||
| out = &output_ptr[(ib * in_h * in_w * channel)]; | |||
| // inPtr = (float *)(&input_ptr[(ib * in_h * in_w)]); | |||
| inPtr = (float *)(&input_ptr[(ib * output_h * output_w * channel)]); | |||
| if (1) { // in->layout() == Tensor::nhwc) | |||
| // iterate over yt | |||
| for (uint16_t yh = 0; yh < in_h; yh++) { | |||
| for (uint16_t yw = 0; yw < in_w; yw++) { | |||
| for (uint16_t yh = 0; yh < output_h; yh++) { | |||
| for (uint16_t yw = 0; yw < output_w; yw++) { | |||
| for (uint16_t ic = 0; ic < channel; ic++) { | |||
| int idx = (yw + yh * in_w) * channel + ic; // (ic*in_h*in_w) + (in_w*yh) + yw; | |||
| int idx = (yw + yh * output_w) * channel + ic; // (ic*in_h*in_w) + (in_w*yh) + yw; | |||
| float delta = inPtr[idx] / kk; | |||
| for (int32_t kh = 0; kh < win_h; kh++) { | |||
| int xh = yh * stride_h + kh - pad_h; | |||
| if ((xh < 0) || (xh >= output_h)) { | |||
| if ((xh < 0) || (xh >= in_h)) { | |||
| continue; | |||
| } | |||
| for (int32_t kw = 0; kw < win_w; kw++) { | |||
| int xw = yw * stride_w + kw - pad_w; | |||
| if ((xw < 0) || (xw >= output_w)) { | |||
| if ((xw < 0) || (xw >= in_w)) { | |||
| continue; | |||
| } | |||
| // out[(ic*output_h*output_w) + (xh*output_w) + xw] += delta; | |||
| out[(xw + output_w * xh) * channel + ic] += delta; | |||
| // out[(xw + output_w * xh) * channel + ic] += delta; | |||
| out[(xw + in_w * xh) * channel + ic] += delta; | |||
| } | |||
| } | |||
| } | |||
| @@ -66,21 +71,22 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter | |||
| } else { // nchw | |||
| for (uint16_t ic = 0; ic < channel; ic++) { | |||
| // iterate over yt | |||
| for (uint16_t yh = 0; yh < in_h; yh++) { | |||
| for (uint16_t yw = 0; yw < in_w; yw++) { | |||
| int idx = (ic * in_h * in_w) + (in_w * yh) + yw; | |||
| for (uint16_t yh = 0; yh < output_h; yh++) { | |||
| for (uint16_t yw = 0; yw < output_w; yw++) { | |||
| int idx = (ic * output_h * output_w) + (output_w * yh) + yw; | |||
| float delta = inPtr[idx] / kk; | |||
| for (int32_t kh = 0; kh < win_h; kh++) { | |||
| int xh = yh * stride_h + kh - pad_h; | |||
| if ((xh < 0) || (xh >= output_h)) { | |||
| if ((xh < 0) || (xh >= in_h)) { | |||
| continue; | |||
| } | |||
| for (int32_t kw = 0; kw < win_w; kw++) { | |||
| int xw = yw * stride_w + kw - pad_w; | |||
| if ((xw < 0) || (xw >= output_w)) { | |||
| if ((xw < 0) || (xw >= in_w)) { | |||
| continue; | |||
| } | |||
| out[(ic * output_h * output_w) + (xh * output_w) + xw] += delta; | |||
| // out[(ic * output_h * output_w) + (xh * output_w) + xw] += delta; | |||
| out[(ic * in_h * in_w) + (xh * in_w) + xw] += delta; | |||
| } | |||
| } | |||
| } | |||
| @@ -90,7 +96,14 @@ void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter | |||
| } | |||
| } | |||
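The rewritten AvgPoolingGrad loop iterates over the incoming gradient (output_h by output_w) and scatters each element, divided by the window area, to every valid position of the corresponding input window; the replaced code had the input and output extents swapped in both the loop bounds and the boundary checks. A standalone check of the scatter rule, with hypothetical sizes, is sketched below.

```cpp
// Sketch of the average-pooling backward scatter rule, for one batch/channel
// with hypothetical sizes: 4x4 input, 2x2 window, stride 2, no padding.
// Each dy element spreads dy / (win_h * win_w) over its pooling window.
#include <cstdio>

int main() {
  const int in_h = 4, in_w = 4, win = 2, stride = 2;
  const int out_h = in_h / stride, out_w = in_w / stride;
  float dy[out_h * out_w] = {1.f, 2.f, 3.f, 4.f};
  float dx[in_h * in_w] = {0.f};
  for (int yh = 0; yh < out_h; ++yh) {
    for (int yw = 0; yw < out_w; ++yw) {
      float delta = dy[yh * out_w + yw] / (win * win);
      for (int kh = 0; kh < win; ++kh) {
        for (int kw = 0; kw < win; ++kw) {
          dx[(yh * stride + kh) * in_w + (yw * stride + kw)] += delta;
        }
      }
    }
  }
  for (int i = 0; i < in_h * in_w; ++i) {
    printf("%.2f%c", dx[i], (i % in_w == in_w - 1) ? '\n' : ' ');
  }
  return 0;
}
```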
| void MaxPoolingGrad(const float *dy, const int *indices, float *output_ptr, PoolingParameter *pooling_param) { | |||
| void MaxPoolingGrad(const float *input_ptr, const float *dx_ptr, const float *dy_ptr, float *output_ptr, | |||
| PoolingParameter *pooling_param) { | |||
| int stride_w = pooling_param->stride_w_; | |||
| int stride_h = pooling_param->stride_h_; | |||
| int pad_w = pooling_param->pad_l_; | |||
| int pad_h = pooling_param->pad_u_; | |||
| int win_w = pooling_param->window_w_; | |||
| int win_h = pooling_param->window_h_; | |||
| int channel = pooling_param->input_channel_; | |||
| int in_w = pooling_param->input_w_; | |||
| int in_h = pooling_param->input_h_; | |||
| @@ -98,38 +111,73 @@ void MaxPoolingGrad(const float *dy, const int *indices, float *output_ptr, Pool | |||
| int output_h = pooling_param->output_h_; | |||
| int output_batch = pooling_param->output_batch_; | |||
| int out_img_size = | |||
| output_h * output_w; // Emir -- in original code this variable is calculated according to input size ?? | |||
| int ind_img_size = in_h * in_w; | |||
| // const int w_pad = (output_w + pad_w + pad_w); | |||
| const float *inPtr; | |||
| const float *dyPtr; | |||
| for (int i = 0; i < in_h * in_w * channel * output_batch; i++) output_ptr[i] = 0.0; | |||
| for (uint16_t ib = 0; ib < output_batch; ib++) { | |||
| float *out; | |||
| out = &output_ptr[(ib * in_h * in_w * channel)]; | |||
| inPtr = (const float *)(&input_ptr[(ib * in_h * in_w * channel)]); | |||
| dyPtr = (const float *)(&dy_ptr[(ib * output_h * output_w * channel)]); | |||
| for (int i = 0; i < output_h * output_w * channel * output_batch; i++) output_ptr[i] = 0.0; | |||
| if (1) { // nhwc | |||
| for (uint16_t yh = 0; yh < output_h; yh++) { | |||
| for (uint16_t yw = 0; yw < output_w; yw++) { | |||
| for (uint16_t ic = 0; ic < channel; ic++) { | |||
| int idx = (yw + yh * output_w) * channel + ic; | |||
| const float *yt = (const float *)(dy); | |||
| const int *pos = (const int *)(indices); | |||
| float *out = NULL; | |||
| float delta = dyPtr[idx]; | |||
| float max_val = -FLT_MAX; | |||
| int max_idx = 0; | |||
| for (int32_t kh = 0; kh < win_h; kh++) { | |||
| int xh = yh * stride_h + kh - pad_h; | |||
| if ((xh < 0) || (xh >= in_h)) { | |||
| continue; | |||
| } | |||
| for (int32_t kw = 0; kw < win_w; kw++) { | |||
| int xw = yw * stride_w + kw - pad_w; | |||
| if ((xw < 0) || (xw >= in_w)) { | |||
| continue; | |||
| } | |||
| if (1) { // grads->layout() == Tensor::nhwc) | |||
| for (int ib = 0; ib < output_batch; ib++) { | |||
| out = &(output_ptr[ib * output_w * output_w * channel]); | |||
| for (int ix = 0; ix < ind_img_size; ix++) { | |||
| for (int cix = 0; cix < channel; cix++) { | |||
| int idx = (*pos) * channel + cix; | |||
| out[idx] += *yt; | |||
| pos++; | |||
| yt++; | |||
| if (inPtr[(xw + in_w * xh) * channel + ic] > max_val) { | |||
| max_val = inPtr[(xw + in_w * xh) * channel + ic]; | |||
| max_idx = (xw + in_w * xh) * channel + ic; | |||
| } | |||
| } | |||
| } | |||
| out[max_idx] += delta; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| for (int ib = 0; ib < output_batch; ib++) { | |||
| out = &output_ptr[(ib * out_img_size)]; | |||
| for (int cix = 0; cix < channel; cix++) { | |||
| for (int ix = 0; ix < ind_img_size; ix++) { | |||
| int idx = cix * output_h * output_w + *pos; // cord_y*output_w + cord_x; | |||
| out[idx] += *yt; | |||
| pos++; | |||
| yt++; | |||
| } else { // nchw | |||
| for (uint16_t yh = 0; yh < output_h; yh++) { | |||
| for (uint16_t yw = 0; yw < output_w; yw++) { | |||
| for (uint16_t ic = 0; ic < channel; ic++) { | |||
| int idx = (ic * output_h * output_w) + (output_w * yh) + yw; | |||
| float delta = dyPtr[idx]; | |||
| float max_val = -FLT_MAX; | |||
| int max_idx = 0; | |||
| for (int32_t kh = 0; kh < win_h; kh++) { | |||
| int xh = yh * stride_h + kh - pad_h; | |||
| if ((xh < 0) || (xh >= in_h)) { | |||
| continue; | |||
| } | |||
| for (int32_t kw = 0; kw < win_w; kw++) { | |||
| int xw = yw * stride_w + kw - pad_w; | |||
| if ((xw < 0) || (xw >= in_w)) { | |||
| continue; | |||
| } | |||
| if (inPtr[(ic * in_h * in_w) + (xh * in_w) + xw] > max_val) { | |||
| max_val = inPtr[(ic * in_h * in_w) + (xh * in_w) + xw]; | |||
| max_idx = (ic * in_h * in_w) + (xh * in_w) + xw; | |||
| } | |||
| } | |||
| } | |||
| out[max_idx] += delta; | |||
| } | |||
| } | |||
| } | |||
| } | |||
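The reworked MaxPoolingGrad drops the precomputed index tensor: for every output-gradient element it rescans the corresponding input window, recomputes the argmax from the forward input, and adds the gradient there (dx_ptr appears in the new signature but is not used in the hunks shown). The inner pattern, extracted as a self-contained sketch for one NHWC window:

```cpp
// Sketch of the argmax-recomputation pattern used in the NHWC branch above:
// find the input cell that produced the forward max for output position
// (yh, yw, ic) and route the incoming gradient `delta` to it.
#include <cfloat>

void ScatterMaxGrad(const float *in, float *out, float delta, int yh, int yw, int ic,
                    int in_h, int in_w, int channel, int win_h, int win_w,
                    int stride_h, int stride_w, int pad_h, int pad_w) {
  float max_val = -FLT_MAX;
  int max_idx = -1;
  for (int kh = 0; kh < win_h; ++kh) {
    int xh = yh * stride_h + kh - pad_h;
    if (xh < 0 || xh >= in_h) continue;
    for (int kw = 0; kw < win_w; ++kw) {
      int xw = yw * stride_w + kw - pad_w;
      if (xw < 0 || xw >= in_w) continue;
      int idx = (xw + in_w * xh) * channel + ic;
      if (in[idx] > max_val) {
        max_val = in[idx];
        max_idx = idx;
      }
    }
  }
  if (max_idx >= 0) out[max_idx] += delta;  // window entirely in padding -> nothing to update
}
```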
| @@ -23,7 +23,9 @@ | |||
| extern "C" { | |||
| #endif | |||
| void AvgPoolingGrad(const float *input_ptr, float *output_ptr, PoolingParameter *pooling_param); | |||
| void MaxPoolingGrad(const float *dy, const int *indices_ptr, float *output_ptr, PoolingParameter *pooling_param); | |||
| // void MaxPoolingGrad(const float *dy, const int *indices_ptr, float *output_ptr, PoolingParameter *pooling_param); | |||
| void MaxPoolingGrad(const float *input_ptr, const float *dx_ptr, const float *dy_ptr, float *output_ptr, | |||
| PoolingParameter *pooling_param); | |||
| #ifdef __cplusplus | |||
| } | |||
| #endif | |||
| @@ -13,10 +13,10 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <string.h> | |||
| #include "nnacl/fp32_grad/reduce_grad.h" | |||
| static inline bool NextIndex(const int num_dims, const int *dims, int *current) { | |||
| static inline int NextIndex(const int num_dims, const int *dims, int *current) { | |||
| int carry = 1; | |||
| for (int idx = num_dims - 1; idx >= 0; --idx) { | |||
| int current_val = current[idx] + carry; | |||
| @@ -45,10 +45,10 @@ static inline size_t GetOutputOffset(const int num_dims, const int *dims, const | |||
| size_t offset = 0; | |||
| for (int idx = 0; idx < num_dims; ++idx) { | |||
| // if we need to skip this axis | |||
| bool is_axis = false; | |||
| int is_axis = 0; | |||
| for (int axis_idx = 0; axis_idx < num_axis; ++axis_idx) { | |||
| if (idx == axes[axis_idx]) { | |||
| is_axis = true; | |||
| is_axis = 1; | |||
| break; | |||
| } | |||
| } | |||
| @@ -101,10 +101,10 @@ float ReduceMeanAll(const float *src, int size) { | |||
| void ReduceSumByAxes(const float *input, const int *input_dims, float *output, const int *output_dims, int num_dims) { | |||
| int num_outputs = 1; | |||
| int same_shape = true; | |||
| int same_shape = 1; | |||
| for (int idx = 0; idx < num_dims; ++idx) { | |||
| num_outputs *= output_dims[idx]; | |||
| if (output_dims[idx] != input_dims[idx]) same_shape = false; | |||
| if (output_dims[idx] != input_dims[idx]) same_shape = 0; | |||
| } | |||
| if (same_shape) { | |||
| memcpy(output, input, num_outputs * sizeof(float)); | |||
| @@ -17,8 +17,7 @@ | |||
| #ifndef MINDSPORE_LITE_NNACL_FP32_REDUCE_GRAD_H_ | |||
| #define MINDSPORE_LITE_NNACL_FP32_REDUCE_GRAD_H_ | |||
| #include <cstddef.h> | |||
| #include <algorithm.h> | |||
| #include <stddef.h> | |||
| #ifdef __cplusplus | |||
| extern "C" { | |||
| @@ -20,7 +20,7 @@ | |||
| #include "nnacl/op_base.h" | |||
| typedef struct SoftmaxCrossEntropyParameter { | |||
| OpParameter op_parameter; | |||
| OpParameter op_parameter_; | |||
| int32_t batch_size_; | |||
| unsigned int number_of_classes_; | |||
| int n_dim_; | |||
| @@ -178,8 +178,8 @@ union PrimitiveType { | |||
| Conv2DGradFilter, | |||
| Conv2DGradInput, | |||
| PoolingGrad, | |||
| BNGradInput, | |||
| OptMomentum, | |||
| BNGrad, | |||
| ApplyMomentum, | |||
| BiasGrad, | |||
| SoftmaxCrossEntropy, | |||
| AddGrad, | |||
| @@ -190,6 +190,7 @@ union PrimitiveType { | |||
| ActivationGrad, | |||
| PriorBox, | |||
| SpaceToBatchND, | |||
| Depend, | |||
| Return, | |||
| MakeTuple, | |||
| ToFormat, | |||
| @@ -149,7 +149,8 @@ table Activation { | |||
| alpha: float = 0.2; | |||
| } | |||
| table ActivationGrad { | |||
| type: ActivationGradType = 0; | |||
| type: ActivationType = 0; | |||
| alpha: float = 0.2; | |||
| } | |||
| @@ -230,6 +231,9 @@ table SoftmaxCrossEntropy { | |||
| axis: [int]; | |||
| } | |||
| table make_tuple { | |||
| } | |||
| table PoolingGrad { | |||
| format: Format = 0; | |||
| @@ -390,10 +394,11 @@ table DeConv2D { | |||
| hasBias: bool = false; | |||
| activationType: ActivationType = 0; | |||
| } | |||
| table BNGradInput { | |||
| table BNGrad { | |||
| eps : float; | |||
| channels: int; | |||
| momentum: float; | |||
| } | |||
| table Scale { | |||
| axis: int; | |||
| } | |||
| @@ -841,7 +846,10 @@ table SquaredDifference { | |||
| table TupleGetItem { | |||
| } | |||
| table OptMomentum { | |||
| table ApplyMomentum { | |||
| gradientScale: float; | |||
| useLocking: bool; | |||
| useNesterov: bool; | |||
| } | |||
| @@ -884,6 +892,10 @@ table ToFormat { | |||
| dstT: int; | |||
| } | |||
| table Depend { | |||
| } | |||
| table Return { | |||
| } | |||
| @@ -27,7 +27,7 @@ set(LITE_SRC | |||
| ) | |||
| if (SUPPORT_GPU) | |||
| set(LITE_SRC | |||
| set(LITE_SRC | |||
| ${LITE_SRC} | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/subgraph_opencl_kernel.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc | |||
| @@ -36,6 +36,24 @@ if (SUPPORT_GPU) | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_runtime.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_wrapper.cc | |||
| ) | |||
| endif() | |||
| if (SUPPORT_TRAIN) | |||
| set(ANF_SRC | |||
| ${ANF_SRC} | |||
| ) | |||
| set(PASS_SRC) | |||
| set(LITE_SRC | |||
| ${LITE_SRC} | |||
| ${ANF_SRC} | |||
| # ${CMAKE_CURRENT_SOURCE_DIR}/train/ops/train_ops.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/train_populate_parameter.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/train_session.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/lite_session.cc | |||
| ) | |||
| endif () | |||
| file(GLOB_RECURSE C_OPS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/ops/*.cc) | |||
| @@ -110,6 +110,7 @@ int CompareOutputData(float *output_data, float *correct_data, int data_size) { | |||
| } | |||
| } | |||
| error /= data_size; | |||
| if (error > 0.0001) { | |||
| printf("has accuracy error!\n"); | |||
| printf("%f\n", error); | |||
| @@ -118,12 +119,14 @@ int CompareOutputData(float *output_data, float *correct_data, int data_size) { | |||
| return 0; | |||
| } | |||
| void CompareOutput(float *output_data, std::string file_path) { | |||
| int CompareOutput(float *output_data, std::string file_path) { | |||
| size_t output_size; | |||
| auto ground_truth = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size)); | |||
| size_t output_num = output_size / sizeof(float); | |||
| printf("output num : %zu\n", output_num); | |||
| CompareOutputData(output_data, ground_truth, output_num); | |||
| int res = CompareOutputData(output_data, ground_truth, output_num); | |||
| delete [] ground_truth; | |||
| return res; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -47,7 +47,7 @@ void WriteToTxt(const std::string& file_path, void *data, size_t element_size) { | |||
| int WriteToBin(const std::string& file_path, void *data, size_t size); | |||
| int CompareOutputData(float *output_data, float *correct_data, int data_size); | |||
| void CompareOutput(float *output_data, std::string file_path); | |||
| int CompareOutput(float *output_data, std::string file_path); | |||
| std::string GetAndroidPackageName(); | |||
| std::string GetAndroidPackagePath(); | |||
| @@ -47,7 +47,9 @@ int CompareRelativeOutput(float *output_data, std::string file_path) { | |||
| auto ground_truth = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &output_size)); | |||
| size_t output_num = output_size / sizeof(float); | |||
| std::cout << "output num : " << output_num << "\n"; | |||
| return CompareOutputRelativeData(output_data, ground_truth, output_num); | |||
| int res = CompareOutputRelativeData(output_data, ground_truth, output_num); | |||
| delete [] ground_truth; | |||
| return res; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -39,6 +39,10 @@ int Executor::Run(std::vector<tensor::Tensor *> &in_tensors, std::vector<tensor: | |||
| } | |||
| } | |||
| kernel::LiteKernelUtil::InitTensorRefCount(kernels); | |||
| for (auto out_tensor : out_tensors) { // increase RefCount of output tensors, such that Run will not free them | |||
| out_tensor->SetRefCount(out_tensor->RefCount() + 1); | |||
| } | |||
| for (auto *kernel : kernels) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| @@ -48,6 +52,8 @@ int Executor::Run(std::vector<tensor::Tensor *> &in_tensors, std::vector<tensor: | |||
| MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name(); | |||
| } | |||
| } | |||
| // JBDEBUG | |||
| // std::cout << "executing kernel " << kernel->name() << "\n"; | |||
| auto ret = kernel->Run(); | |||
| if (0 != ret) { | |||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||
| @@ -27,7 +27,6 @@ | |||
| #include "src/ir/tensor.h" | |||
| #include "include/errorcode.h" | |||
| // using mindspore::kernel::AddressPtr; | |||
| namespace mindspore::kernel { | |||
| using mindspore::lite::RET_ERROR; | |||
| @@ -112,11 +112,11 @@ int ModelImpl::BuildOps() { | |||
| Model *Model::Import(const char *model_buf, size_t size) { | |||
| auto model = new Model(); | |||
| model->model_impl_ = ModelImpl::Import(model_buf, size); | |||
| if (model_buf == nullptr) { | |||
| MS_LOG(ERROR) << "model buf is null"; | |||
| return nullptr; | |||
| } | |||
| model->model_impl_ = ModelImpl::Import(model_buf, size); | |||
| if (model->model_impl_ == nullptr) { | |||
| MS_LOG(ERROR) << "model impl is null"; | |||
| return nullptr; | |||
| @@ -20,11 +20,11 @@ namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| int ActivationGrad::GetType() const { return this->primitive_->value.AsActivationGrad()->type; } | |||
| float ActivationGrad::GetAlpha() const { return this->primitive_->value.AsActivationGrad()->alpha; } | |||
| void ActivationGrad::SetType(int type) { | |||
| this->primitive_->value.AsActivationGrad()->type = (schema::ActivationGradType)type; | |||
| this->primitive_->value.AsActivationGrad()->type = (schema::ActivationType)type; | |||
| } | |||
| void ActivationGrad::SetAlpha(float alpha) { this->primitive_->value.AsActivationGrad()->alpha = alpha; } | |||
| #else | |||
| int ActivationGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| @@ -40,7 +40,7 @@ int ActivationGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flat | |||
| return RET_OK; | |||
| } | |||
| int ActivationGrad::GetType() const { return this->primitive_->value_as_ActivationGrad()->type(); } | |||
| float ActivationGrad::GetAlpha() const { return this->primitive_->value_as_ActivationGrad()->alpha(); } | |||
| #endif | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_ACTIVATION_GRAD_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_ACTIVATION_GRAD_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_ACTIVATION_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_ACTIVATION_GRAD_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -32,13 +32,15 @@ class ActivationGrad : public PrimitiveC { | |||
| ActivationGrad() = default; | |||
| explicit ActivationGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| void SetType(int type); | |||
| void SetAlpha(float alpha); | |||
| #else | |||
| ActivationGrad() = default; | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int GetType() const; | |||
| float GetAlpha() const; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_ACTIVATION_GRAD_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_ACTIVATION_GRAD_H_ | |||
| @@ -0,0 +1,64 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/ops/apply_momentum.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| #else | |||
| int ApplyMomentum::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| MS_ASSERT(nullptr != fbb); | |||
| auto attr = primitive->value_as_ApplyMomentum(); | |||
| if (attr == nullptr) { | |||
| MS_LOG(ERROR) << "value_as_ApplyMomentum return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| auto val_offset = schema::CreateApplyMomentum(*fbb); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_ApplyMomentum, val_offset.o); | |||
| fbb->Finish(prim_offset); | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
| int ApplyMomentum::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) { | |||
| if (5 != inputs.size()) { | |||
| MS_LOG(ERROR) << "ApplyMomentum should have at 5 input tensors"; | |||
| return RET_ERROR; | |||
| } | |||
| // if (outputs.empty()) { | |||
| // MS_LOG(ERROR) << "ApplyMomentumCPUKernel error input output size!"; | |||
| // return RET_ERROR; | |||
| // } | |||
| if (inputs[0]->ElementsNum() != inputs[1]->ElementsNum() || inputs[0]->ElementsNum() != inputs[3]->ElementsNum() || | |||
| inputs[2]->ElementsNum() != 1 || inputs[4]->ElementsNum() != 1) { | |||
| MS_LOG(ERROR) << "error input data size!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (!outputs.empty()) { | |||
| auto *out = outputs.front(); | |||
| MS_ASSERT(out != nullptr); | |||
| out->set_data_type(inputs[0]->data_type()); | |||
| out->SetFormat(inputs[0]->GetFormat()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
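InferShape above requires exactly five inputs whose element counts pair up: inputs 0, 1 and 3 must match, while inputs 2 and 4 are scalars. A plausible reading, consistent with the usual ApplyMomentum contract but not spelled out in this diff, is (weight, accumulation, learning_rate, gradient, momentum). Under that assumption the eventual kernel update would look like:

```cpp
// Hedged sketch of the conventional ApplyMomentum update. The input ordering
// (weight, accumulate, lr, gradient, momentum) is an assumption inferred from
// the size checks in InferShape, not confirmed by this PR.
void ApplyMomentumUpdate(float *weight, float *accumulate, float lr,
                         const float *gradient, float momentum, int elem_num) {
  for (int i = 0; i < elem_num; ++i) {
    accumulate[i] = accumulate[i] * momentum + gradient[i];  // velocity update
    weight[i] -= lr * accumulate[i];                         // SGD-with-momentum step
  }
}
```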
| @@ -0,0 +1,44 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_APPLY_MOMENTUM_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_APPLY_MOMENTUM_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <cmath> | |||
| #include "ir/dtype/type_id.h" | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class ApplyMomentum : public PrimitiveC { | |||
| public: | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| MS_DECLARE_PARENT(ApplyMomentum, PrimitiveC); | |||
| ApplyMomentum() = default; | |||
| explicit ApplyMomentum(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| #else | |||
| ApplyMomentum() = default; | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_SRC_OPS_APPLY_MOMENTUM_H_ | |||
| @@ -0,0 +1,108 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/ops/arithmetic_grad.h" | |||
| #include "include/errorcode.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "src/ir/tensor.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| int ArithmeticGrad::InferShape(std::vector<lite::tensor::Tensor *> inputs_, | |||
| std::vector<lite::tensor::Tensor *> outputs_) { | |||
| if (inputs_.size() != 3) { | |||
| MS_LOG(ERROR) << "The number of input must be 3"; | |||
| return RET_ERROR; | |||
| } | |||
| if (outputs_.size() != 2) { | |||
| MS_LOG(ERROR) << "The number of output must be 2"; | |||
| return RET_ERROR; | |||
| } | |||
| auto dy = inputs_[0]; | |||
| auto x1 = inputs_[1]; | |||
| auto x2 = inputs_[2]; | |||
| auto dx1 = outputs_[0]; | |||
| auto dx2 = outputs_[1]; | |||
| MS_ASSERT(dy != nullptr); | |||
| MS_ASSERT(x1 != nullptr); | |||
| MS_ASSERT(x2 != nullptr); | |||
| MS_ASSERT(dx1 != nullptr); | |||
| MS_ASSERT(dx2 != nullptr); | |||
| auto inShape0 = x1->shape(); | |||
| auto inShape1 = x2->shape(); | |||
| auto outShape = dy->shape(); | |||
| if ((Type() == schema::PrimitiveType_AddGrad) || (Type() == schema::PrimitiveType_SubGrad)) { | |||
| ndim_ = outShape.size(); | |||
| auto fillDimNum0 = outShape.size() - inShape0.size(); | |||
| auto fillDimNum1 = outShape.size() - inShape1.size(); | |||
| int j0 = 0; | |||
| int j1 = 0; | |||
| for (unsigned int i = 0; i < outShape.size(); i++) { | |||
| x1_shape_[i] = (i < fillDimNum0) ? 1 : inShape0[j0++]; | |||
| x2_shape_[i] = (i < fillDimNum1) ? 1 : inShape1[j1++]; | |||
| dy_shape_[i] = outShape[i]; | |||
| } | |||
| } else { | |||
| // if (inShape0.size() < inShape1.size()) | |||
| if (dx1->ElementsNum() < dx2->ElementsNum()) { | |||
| ndim_ = inShape1.size(); | |||
| auto fillDimNum = inShape1.size() - inShape0.size(); // This will not work for batch! | |||
| int j = 0; | |||
| for (unsigned int i = 0; i < inShape1.size(); i++) { | |||
| if (i < fillDimNum) { | |||
| x2_shape_[i] = 1; | |||
| } else { | |||
| x2_shape_[i] = inShape0[j++]; | |||
| } | |||
| x1_shape_[i] = inShape1[i]; | |||
| dy_shape_[i] = outShape[i]; | |||
| } | |||
| } else if (dx2->ElementsNum() < dx1->ElementsNum()) { // if (inShape0.size() > inShape1.size()) | |||
| ndim_ = inShape0.size(); | |||
| broadcasting_ = true; | |||
| int j = 0; | |||
| auto fillDimNum = inShape0.size() - inShape1.size(); | |||
| for (unsigned int i = 0; i < inShape0.size(); i++) { | |||
| if (i < fillDimNum) { | |||
| x2_shape_[i] = 1; | |||
| } else { | |||
| x2_shape_[i] = inShape1[j++]; | |||
| } | |||
| x1_shape_[i] = inShape0[i]; | |||
| dy_shape_[i] = outShape[i]; | |||
| } | |||
| } else { | |||
| broadcasting_ = false; | |||
| for (unsigned int i = 0; i < inShape0.size(); i++) { | |||
| x2_shape_[i] = inShape1[i]; | |||
| x1_shape_[i] = inShape0[i]; | |||
| dy_shape_[i] = outShape[i]; | |||
| } | |||
| } | |||
| } | |||
| dx1->set_shape(x1->shape()); | |||
| dx2->set_shape(x2->shape()); | |||
| dx1->set_data_type(dy->data_type()); | |||
| dx2->set_data_type(dy->data_type()); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
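For the multiplicative cases, InferShape left-pads the smaller operand's shape with 1s so both recorded shapes have the same rank as dy before the gradient kernels reduce over the broadcast axes. With hypothetical shapes, dy = [2, 3, 4] and a second operand of shape [3, 4], the padded shape becomes [1, 3, 4]. The padding rule in isolation:

```cpp
// Sketch of the left-pad-with-ones rule used by ArithmeticGrad::InferShape above.
#include <vector>

std::vector<int> PadShapeToRank(const std::vector<int> &shape, size_t rank) {
  std::vector<int> padded(rank, 1);
  size_t fill = rank - shape.size();  // assumes shape.size() <= rank
  for (size_t i = 0; i < shape.size(); ++i) {
    padded[fill + i] = shape[i];
  }
  return padded;
}
```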
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_ARITHMETIC_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_ARITHMETIC_GRAD_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <cmath> | |||
| #include "ir/dtype/type_id.h" | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class ArithmeticGrad : public PrimitiveC { | |||
| public: | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| MS_DECLARE_PARENT(ArithmeticGrad, PrimitiveC); | |||
| ArithmeticGrad() = default; | |||
| explicit ArithmeticGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| #else | |||
| // explicit Arithmetic(schema::Primitive *primitive) : PrimitiveC(primitive) {} | |||
| ArithmeticGrad() = default; | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override { | |||
| return RET_ERROR; | |||
| } | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| bool Broadcasting() { return this->broadcasting_; } | |||
| int NDims() { return this->ndim_; } | |||
| std::vector<int> dyShape() { return this->dy_shape_; } | |||
| std::vector<int> x1Shape() { return this->x1_shape_; } | |||
| std::vector<int> x2Shape() { return this->x2_shape_; } | |||
| protected: | |||
| bool broadcasting_ = false; | |||
| int ndim_; | |||
| std::vector<int> dy_shape_; | |||
| std::vector<int> x1_shape_; | |||
| std::vector<int> x2_shape_; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_SRC_OPS_ARITHMETIC_GRAD_H_ | |||
| @@ -48,6 +48,32 @@ std::vector<int> BiasGrad::GetAxis() const { | |||
| return std::vector<int>(fb_vector->begin(), fb_vector->end()); | |||
| } | |||
| int BiasGrad::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) { | |||
| if (1 != inputs.size()) { | |||
| MS_LOG(ERROR) << "BiasGrad should have one input"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != outputs.size()) { | |||
| MS_LOG(ERROR) << "BiasGrad should have one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs.front(); | |||
| auto *out = outputs.front(); | |||
| MS_ASSERT(in0 != nullptr); | |||
| MS_ASSERT(out != nullptr); | |||
| auto inshape = in0->shape(); | |||
| int ndim = inshape.size(); | |||
| for (int i = 0; i < ndim - 1; i++) { | |||
| inshape[i] = 1; | |||
| } | |||
| out->set_shape(inshape); | |||
| out->set_data_type(in0->data_type()); | |||
| out->SetFormat(in0->GetFormat()); | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_BIAS_GRAD_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_BIAS_GRAD_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_BIAS_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_BIAS_GRAD_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -38,10 +38,11 @@ class BiasGrad : public PrimitiveC { | |||
| BiasGrad() = default; | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| int InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) override; | |||
| #endif | |||
| std::vector<int> GetAxis() const; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_BIAS_GRAD_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_BIAS_GRAD_H_ | |||
| @@ -14,33 +14,33 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/ops/bn_grad_input.h" | |||
| #include "src/ops/bn_grad.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| float BNGradInput::GetEps() const { return this->primitive_->value.AsBNGradInput()->eps; } | |||
| int BNGradInput::GetChannels() const { return this->primitive_->value.AsBNGradInput()->channels; } | |||
| float BNGrad::GetEps() const { return this->primitive_->value.AsBNGrad()->eps; } | |||
| float BNGrad::GetMomentum() const { return this->primitive_->value.AsBNGrad()->momentum; } | |||
| void BNGradInput::SetEps(float eps) { this->primitive_->value.AsBNGradInput()->eps = eps; } | |||
| void BNGradInput::SetChannels(int channels) { this->primitive_->value.AsBNGradInput()->channels = channels; } | |||
| void BNGrad::SetEps(float eps) { this->primitive_->value.AsBNGrad()->eps = eps; } | |||
| void BNGrad::SetMomentum(float momentum) { this->primitive_->value.AsBNGrad()->momentum = momentum; } | |||
| #else | |||
| int BNGradInput::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { | |||
| int BNGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| MS_ASSERT(nullptr != fbb); | |||
| auto attr = primitive->value_as_BNGradInput(); | |||
| auto attr = primitive->value_as_BNGrad(); | |||
| if (attr == nullptr) { | |||
| MS_LOG(ERROR) << "value_as_BNGradInput return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| auto val_offset = schema::CreateBNGradInput(*fbb, attr->eps(), attr->channels()); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BNGradInput, val_offset.o); | |||
| auto val_offset = schema::CreateBNGrad(*fbb, attr->eps(), attr->momentum()); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BNGrad, val_offset.o); | |||
| fbb->Finish(prim_offset); | |||
| return RET_OK; | |||
| } | |||
| float BNGradInput::GetEps() const { return this->primitive_->value_as_BNGradInput()->eps(); } | |||
| int BNGradInput::GetChannels() const { return this->primitive_->value_as_BNGradInput()->channels(); } | |||
| float BNGrad::GetEps() const { return this->primitive_->value_as_BNGrad()->eps(); } | |||
| float BNGrad::GetMomentum() const { return this->primitive_->value_as_BNGrad()->momentum(); } | |||
| #endif | |||
| } // namespace lite | |||
| @@ -25,21 +25,20 @@ | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class BNGradInput : public PrimitiveC { | |||
| class BNGrad : public PrimitiveC { | |||
| public: | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| MS_DECLARE_PARENT(BNGradInput, PrimitiveC); | |||
| BNGradInput() = default; | |||
| explicit BNGradInput(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| MS_DECLARE_PARENT(BNGrad, PrimitiveC); | |||
| BNGrad() = default; | |||
| explicit BNGrad(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| void SetEps(float eps); | |||
| void SetChannels(int channels); | |||
| void SetMomentum(float momentum); | |||
| #else | |||
| BNGradInput() = default; | |||
| BNGrad() = default; | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| float GetEps() const; | |||
| int GetChannels() const; | |||
| float GetMomentum() const; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -105,5 +105,47 @@ int Conv2DGradFilter::GetActivationType() const { | |||
| } | |||
| #endif | |||
| int Conv2DGradFilter::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) { | |||
| if (3 != inputs.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad Filter should have 3 inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != outputs.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad Filter should have one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs.at(0); | |||
| auto *in = inputs.at(2); | |||
| MS_ASSERT(in0 != nullptr); | |||
| MS_ASSERT(in != nullptr); | |||
| std::vector<int> output_shape; | |||
| int *out_shape = reinterpret_cast<int *>(in->Data()); | |||
| int new_size = in->ElementsNum(); | |||
| if (in0->GetFormat() == in->GetFormat()) { | |||
| for (int i = 0; i < new_size; i++) output_shape.push_back(out_shape[i]); | |||
| } else { | |||
| if ((in0->GetFormat() == schema::Format_NHWC) && (in->GetFormat() == schema::Format_NCHW)) { | |||
| output_shape.push_back(out_shape[0]); | |||
| output_shape.push_back(out_shape[2]); | |||
| output_shape.push_back(out_shape[3]); | |||
| output_shape.push_back(out_shape[1]); | |||
| } else { | |||
| MS_LOG(ERROR) << "Shape covnert is not supported"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| auto *out = outputs.at(0); | |||
| MS_ASSERT(out != nullptr); | |||
| out->set_shape(output_shape); | |||
| out->set_data_type(in0->data_type()); | |||
| out->SetFormat(in0->GetFormat()); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
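Conv2DGradFilter (and Conv2DGradInput below) reads the target shape from its third input tensor; when the data tensor is NHWC but the shape tensor is stored in NCHW order, the four entries are permuted to NHWC (indices 0, 2, 3, 1). The permutation in isolation:

```cpp
// Sketch of the NCHW -> NHWC shape permutation performed in InferShape above.
#include <vector>

std::vector<int> NchwShapeToNhwc(const int *shape_nchw) {
  // shape_nchw = {N, C, H, W}; result = {N, H, W, C}
  return {shape_nchw[0], shape_nchw[2], shape_nchw[3], shape_nchw[1]};
}
```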
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_FILTER_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_FILTER_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_FILTER_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_FILTER_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -53,6 +53,7 @@ class Conv2DGradFilter : public PrimitiveC { | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| int GetFormat() const; | |||
| int GetGroup() const; | |||
| int GetChannelIn() const; | |||
| @@ -74,4 +75,4 @@ class Conv2DGradFilter : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_FILTER_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_FILTER_H_ | |||
| @@ -103,5 +103,46 @@ int Conv2DGradInput::GetActivationType() const { | |||
| } | |||
| #endif | |||
| int Conv2DGradInput::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) { | |||
| if (3 != inputs.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad Input should have 3 inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != outputs.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad input should have one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs.at(0); | |||
| auto *in = inputs.at(2); | |||
| MS_ASSERT(in0 != nullptr); | |||
| MS_ASSERT(in != nullptr); | |||
| std::vector<int> output_shape; | |||
| int *out_shape = reinterpret_cast<int *>(in->Data()); | |||
| int new_size = in->ElementsNum(); | |||
| if (in0->GetFormat() == in->GetFormat()) { | |||
| for (int i = 0; i < new_size; i++) output_shape.push_back(out_shape[i]); | |||
| } else { | |||
| if ((in0->GetFormat() == schema::Format_NHWC) && (in->GetFormat() == schema::Format_NCHW)) { | |||
| output_shape.push_back(out_shape[0]); | |||
| output_shape.push_back(out_shape[2]); | |||
| output_shape.push_back(out_shape[3]); | |||
| output_shape.push_back(out_shape[1]); | |||
| } else { | |||
| MS_LOG(ERROR) << "Shape covnert is not supported"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| auto *out = outputs.at(0); | |||
| MS_ASSERT(out != nullptr); | |||
| out->set_shape(output_shape); | |||
| out->set_data_type(in0->data_type()); | |||
| out->SetFormat(in0->GetFormat()); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_INPUT_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_INPUT_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_INPUT_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_INPUT_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -53,6 +53,7 @@ class Conv2DGradInput : public PrimitiveC { | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| int GetFormat() const; | |||
| int GetGroup() const; | |||
| int GetChannelIn() const; | |||
| @@ -74,4 +75,4 @@ class Conv2DGradInput : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_CONV2_D_GRAD_INPUT_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_CONV2D_GRAD_INPUT_H_ | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_DE_DEPTHWISE_CONV2_D_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_DE_DEPTHWISE_CONV2_D_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_DEDEPTHWISE_CONV2D_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_DEDEPTHWISE_CONV2D_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -84,4 +84,4 @@ class DeDepthwiseConv2D : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_DE_DEPTHWISE_CONV2_D_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_DEDEPTHWISE_CONV2D_H_ | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_DEPTHWISE_CONV2_D_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_DEPTHWISE_CONV2_D_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_DEPTHWISE_CONV2D_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_DEPTHWISE_CONV2D_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -94,4 +94,4 @@ class DepthwiseConv2D : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_DEPTHWISE_CONV2_D_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_DEPTHWISE_CONV2D_H_ | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| #define LITE_MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| #include <vector> | |||
| #include "src/ops/primitive_c.h" | |||
| @@ -37,4 +37,4 @@ class MakeTuple : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_MAKE_TUPLE_H_ | |||
| @@ -86,5 +86,52 @@ int PoolingGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuf | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
| int PoolingGrad::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor::Tensor *> outputs_) { | |||
| MS_ASSERT(this->primitive_ != nullptr); | |||
| auto input = inputs_.at(0); | |||
| MS_ASSERT(input != nullptr); | |||
| int input_h = input->shape().at(1); | |||
| int input_w = input->shape().at(2); | |||
| auto window_h = GetWindowH(); | |||
| auto window_w = GetWindowW(); | |||
| if (GetGlobal()) { | |||
| window_h = input_h; | |||
| window_w = input_w; | |||
| } | |||
| pad_l_ = GetPadLeft(); | |||
| pad_u_ = GetPadUp(); | |||
| pad_d_ = GetPadDown(); | |||
| pad_r_ = GetPadRight(); | |||
| if (GetPadMode() == schema::PadMode_SAME) { | |||
| int output_w = std::ceil(static_cast<float>(input_w) / static_cast<float>(GetStrideW())); | |||
| int output_h = std::ceil(static_cast<float>(input_h) / static_cast<float>(GetStrideH())); | |||
| auto pad_h_all = ((output_h - 1) * GetStrideH() + (window_h - 1) + 1 - input_h); | |||
| auto pad_w_all = ((output_w - 1) * GetStrideW() + (window_w - 1) + 1 - input_w); | |||
| if (pad_h_all < 0) { | |||
| pad_u_ = pad_d_ = 0; | |||
| } else { | |||
| pad_u_ = pad_h_all / 2; | |||
| pad_d_ = pad_h_all - pad_u_; | |||
| } | |||
| if (pad_w_all < 0) { | |||
| pad_l_ = pad_r_ = 0; | |||
| } else { | |||
| pad_l_ = pad_w_all / 2; | |||
| pad_r_ = pad_w_all - pad_l_; | |||
| } | |||
| } | |||
| auto grad_output = outputs_.at(0); | |||
| // todo: fmk type | |||
| auto output_shape = input->shape(); | |||
| grad_output->set_shape(output_shape); | |||
| grad_output->set_data_type(input->data_type()); | |||
| // todo: temp fix | |||
| grad_output->SetFormat(input->GetFormat()); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_POOLING_GRAD_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_POOLING_GRAD_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_POOLING_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_POOLING_GRAD_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -49,6 +49,7 @@ class PoolingGrad : public PrimitiveC { | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| int GetFormat() const; | |||
| int GetPoolingMode() const; | |||
| bool GetGlobal() const; | |||
| @@ -62,8 +63,14 @@ class PoolingGrad : public PrimitiveC { | |||
| int GetPadLeft() const; | |||
| int GetPadRight() const; | |||
| int GetRoundMode() const; | |||
| protected: | |||
| int pad_u_ = 0; | |||
| int pad_d_ = 0; | |||
| int pad_l_ = 0; | |||
| int pad_r_ = 0; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_POOLING_GRAD_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_POOLING_GRAD_H_ | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_POWER_GRAD_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_POWER_GRAD_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_POWER_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_POWER_GRAD_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -46,4 +46,4 @@ class PowerGrad : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_POWER_GRAD_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_POWER_GRAD_H_ | |||
| @@ -125,6 +125,21 @@ | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| #include "tools/converter/quantizer/quantize_util.h" | |||
| #endif | |||
| #ifdef SUPPORT_TRAIN | |||
| #include "src/ops/activation_grad.h" | |||
| #include "src/ops/apply_momentum.h" | |||
| #include "src/ops/bias_grad.h" | |||
| #include "src/ops/pooling_grad.h" | |||
| #include "src/ops/conv2d_grad_filter.h" | |||
| #include "src/ops/conv2d_grad_input.h" | |||
| #include "src/ops/power_grad.h" | |||
| #include "src/ops/softmax_cross_entropy.h" | |||
| #include "src/ops/bn_grad.h" | |||
| #include "src/ops/arithmetic_grad.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| @@ -353,6 +368,22 @@ std::shared_ptr<PrimitiveC> PrimitiveC::UnPackFromPrimitive(const Primitive &pri | |||
| return NewPrimitiveC<TupleGetItem>(prim, inputs, quantType); | |||
| } else if (op_type == "Softmax") { | |||
| return NewPrimitiveC<SoftMax>(prim, inputs, quantType); | |||
| #ifdef SUPPORT_TRAIN | |||
| } else if ((op_type == "ReluGrad" || op_type == "Relu6Grad" || op_type == "SigmoidGrad")) { | |||
| return NewPrimitiveC<ActivationGrad>(prim, inputs, quantType); | |||
| } else if ((op_type == "MaxPoolGrad") || (op_type == "MeanPoolGrad")) { | |||
| return NewPrimitiveC<PoolingGrad>(prim, inputs, quantType); | |||
| } else if (op_type == "Conv2DBackpropFilter") { | |||
| return NewPrimitiveC<Conv2DGradFilter>(prim, inputs, quantType); | |||
| } else if (op_type == "Conv2DBackpropInput") { | |||
| return NewPrimitiveC<Conv2DGradInput>(prim, inputs, quantType); | |||
| } else if (op_type == "BiasAddGrad") { | |||
| return NewPrimitiveC<BiasGrad>(prim, inputs, quantType); | |||
| } else if (op_type == "ApplyMomentum") { | |||
| return NewPrimitiveC<ApplyMomentum>(prim, inputs, quantType); | |||
| } else if (op_type == "BatchNormGrad") { | |||
| return NewPrimitiveC<BNGrad>(prim, inputs, quantType); | |||
| #endif | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupported primitive type in UnPackFromPrimitive : " << op_type; | |||
| return nullptr; | |||
| @@ -565,6 +596,32 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitiveT(mindspore::schema::PrimitiveT | |||
| return new SparseToDense(primitive); | |||
| case schema::PrimitiveType_DetectionPostProcess: | |||
| return new DetectionPostProcess(primitive); | |||
| #ifdef SUPPORT_TRAIN | |||
| case schema::PrimitiveType_ActivationGrad: | |||
| return new ActivationGrad(primitive); | |||
| case schema::PrimitiveType_PoolingGrad: | |||
| return new PoolingGrad(primitive); | |||
| case schema::PrimitiveType_Conv2DGradFilter: | |||
| return new Conv2DGradFilter(primitive); | |||
| case schema::PrimitiveType_Conv2DGradInput: | |||
| return new Conv2DGradInput(primitive); | |||
| case schema::PrimitiveType_BiasGrad: | |||
| return new BiasGrad(primitive); | |||
| case schema::PrimitiveType_ApplyMomentum: | |||
| return new ApplyMomentum(primitive); | |||
| case schema::PrimitiveType_BNGrad: | |||
| return new BNGrad(primitive); | |||
| case schema::PrimitiveType_AddGrad: | |||
| return new ArithmeticGrad(primitive); | |||
| case schema::PrimitiveType_SubGrad: | |||
| return new ArithmeticGrad(primitive); | |||
| case schema::PrimitiveType_MulGrad: | |||
| return new ArithmeticGrad(primitive); | |||
| case schema::PrimitiveType_DivGrad: | |||
| return new ArithmeticGrad(primitive); | |||
| #endif | |||
| default: | |||
| MS_LOG(ERROR) << "Unsupported primitive type in UnPackFromSchemaPrimitiveT : " | |||
| << schema::EnumNamePrimitiveType(op_type); | |||
| @@ -779,6 +836,31 @@ PrimitiveC *PrimitiveC::UnPackFromSchemaPrimitive(const schema::Primitive *primi | |||
| return NewPrimitiveC<SparseToDense>(primitive); | |||
| case schema::PrimitiveType_DetectionPostProcess: | |||
| return NewPrimitiveC<DetectionPostProcess>(primitive); | |||
| #ifdef SUPPORT_TRAIN | |||
| case schema::PrimitiveType_ActivationGrad: | |||
| return NewPrimitiveC<ActivationGrad>(primitive); | |||
| case schema::PrimitiveType_PoolingGrad: | |||
| return NewPrimitiveC<PoolingGrad>(primitive); | |||
| case schema::PrimitiveType_Conv2DGradFilter: | |||
| return NewPrimitiveC<Conv2DGradFilter>(primitive); | |||
| case schema::PrimitiveType_Conv2DGradInput: | |||
| return NewPrimitiveC<Conv2DGradInput>(primitive); | |||
| case schema::PrimitiveType_BiasGrad: | |||
| return NewPrimitiveC<BiasGrad>(primitive); | |||
| case schema::PrimitiveType_ApplyMomentum: | |||
| return NewPrimitiveC<ApplyMomentum>(primitive); | |||
| case schema::PrimitiveType_BNGrad: | |||
| return NewPrimitiveC<BNGrad>(primitive); | |||
| case schema::PrimitiveType_AddGrad: | |||
| return NewPrimitiveC<ArithmeticGrad>(primitive); | |||
| case schema::PrimitiveType_SubGrad: | |||
| return NewPrimitiveC<ArithmeticGrad>(primitive); | |||
| case schema::PrimitiveType_MulGrad: | |||
| return NewPrimitiveC<ArithmeticGrad>(primitive); | |||
| case schema::PrimitiveType_DivGrad: | |||
| return NewPrimitiveC<ArithmeticGrad>(primitive); | |||
| #endif | |||
| default: | |||
| MS_LOG(ERROR) << "Unsupported primitive type in UnPackFromSchemaPrimitive : " | |||
| << schema::EnumNamePrimitiveType(op_type); | |||
| @@ -115,7 +115,7 @@ constexpr size_t kInputSize = 1; | |||
| constexpr size_t kOutputSize = 1; | |||
| } // namespace | |||
| int Reduce::InferShape(std::vector<tensor::Tensor *> inputs_, std::vector<tensor::Tensor *> outputs_) { | |||
| if (inputs_.size() != kInputSize || outputs_.size() != kOutputSize) { | |||
| if (inputs_.size() < kInputSize || outputs_.size() != kOutputSize) { | |||
| return RET_ERROR; | |||
| } | |||
| auto input = inputs_.front(); | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_RESHAPE_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_RESHAPE_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_RESHAPE_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_RESHAPE_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -50,4 +50,4 @@ class Reshape : public PrimitiveC { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_RESHAPE_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_RESHAPE_H_ | |||
| @@ -51,5 +51,31 @@ int SoftmaxCrossEntropy::UnPackToFlatBuilder(const schema::Primitive *primitive, | |||
| return RET_OK; | |||
| } | |||
| #endif | |||
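| // Output 0 is the scalar loss (shape {1}); the optional second output receives the gradient | |||
| // w.r.t. the logits and mirrors the first input's shape, data type and format. | |||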
| int SoftmaxCrossEntropy::InferShape(std::vector<tensor::Tensor *> inputs, std::vector<tensor::Tensor *> outputs) { | |||
| if (1 > outputs.size()) { | |||
| MS_LOG(ERROR) << "SoftmaxCrossEntropy should have at least one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs.front(); | |||
| MS_ASSERT(in0 != nullptr); | |||
| auto *out = outputs.front(); | |||
| MS_ASSERT(out != nullptr); | |||
| std::vector<int> outshape; | |||
| outshape.push_back(1); | |||
| out->set_shape(outshape); | |||
| out->set_data_type(in0->data_type()); | |||
| if (1 < outputs.size()) { | |||
| auto *grads = outputs.at(1); | |||
| MS_ASSERT(grads != nullptr); | |||
| grads->set_shape(in0->shape()); | |||
| grads->set_data_type(in0->data_type()); | |||
| grads->SetFormat(in0->GetFormat()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef LITE_MINDSPORE_LITE_C_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #define LITE_MINDSPORE_LITE_C_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| @@ -39,9 +39,11 @@ class SoftmaxCrossEntropy : public PrimitiveC { | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::tensor::Tensor *> inputs_, std::vector<lite::tensor::Tensor *> outputs_) override; | |||
| std::vector<int> GetAxis() const; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // LITE_MINDSPORE_LITE_C_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #endif // MINDSPORE_LITE_SRC_OPS_SOFTMAX_CROSS_ENTROPY_H_ | |||
| @@ -1678,6 +1678,13 @@ PopulateParameterFunc PopulateParameterRegistry::GetParameterFunc(int type) { | |||
| return populate_parameter_funcs_[schema::PrimitiveType(type)]; | |||
| } | |||
| int PopulateParameterRegistry::AddPopulateParameterFunc(const schema::PrimitiveType &type, PopulateParameterFunc func) { | |||
| if ((type < schema::PrimitiveType_MIN) || (type > schema::PrimitiveType_MAX)) { | |||
| return -1; | |||
| } | |||
| populate_parameter_funcs_[type] = func; | |||
| return 0; | |||
| } | |||
| OpParameter *PopulateParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| @@ -30,12 +30,16 @@ class PopulateParameterRegistry { | |||
| ~PopulateParameterRegistry() = default; | |||
| static PopulateParameterRegistry *GetInstance(); | |||
| int AddPopulateParameterFunc(const schema::PrimitiveType &type, PopulateParameterFunc func); | |||
| PopulateParameterFunc GetParameterFunc(int type); | |||
| protected: | |||
| PopulateParameterFunc populate_parameter_funcs_[schema::PrimitiveType_MAX + 1]; | |||
| }; | |||
| OpParameter *PopulateActivationParameter(const lite::PrimitiveC *primitive); | |||
| OpParameter *PopulateArithmetic(const lite::PrimitiveC *primitive); | |||
| OpParameter *PopulateParameter(const mindspore::lite::PrimitiveC *primitive); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_POPULATE_PARAMETER_H_ | |||
| @@ -37,8 +37,8 @@ constexpr size_t kOutputNum = 1; | |||
| } // namespace | |||
| int ReduceBaseCPUKernel::CheckInputsOutputs() { | |||
| if (in_tensors_.size() != kInputNum) { | |||
| MS_LOG(ERROR) << "Reduce inputs size should be " << kInputNum << " but got " << in_tensors_.size(); | |||
| if (in_tensors_.size() < kInputNum) { | |||
| MS_LOG(ERROR) << "Reduce inputs size should be at least " << kInputNum << " but got " << in_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (out_tensors_.size() != kOutputNum) { | |||
| @@ -99,7 +99,15 @@ int ReduceBaseCPUKernel::Init() { | |||
| if (reduce_param == nullptr) { | |||
| return RET_NULL_PTR; | |||
| } | |||
| num_axes_ = reduce_param->num_axes_; | |||
| if (in_tensors_.size() > 1) { | |||
| auto axes_ptr = in_tensors_.at(1); | |||
| num_axes_ = axes_ptr->ElementsNum(); | |||
| memcpy(axes_, axes_ptr->Data(), axes_ptr->Size()); | |||
| } else { | |||
| num_axes_ = reduce_param->num_axes_; | |||
| memcpy(axes_, reduce_param->axes_, sizeof(reduce_param->axes_)); | |||
| } | |||
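| // Note: the branch above lets a second input tensor supply the reduction axes at runtime; | |||
| // without it, the axes recorded in ReduceParameter are used. | |||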
| mode_ = reduce_param->mode_; | |||
| reduce_to_end_ = reduce_param->reduce_to_end_; | |||
| @@ -15,6 +15,7 @@ | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32_grad/activation_grad.h" | |||
| #include "nnacl/fp32_grad/activation_grad.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| @@ -24,41 +25,38 @@ using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::ActivationGradType_HSWISH; | |||
| using mindspore::schema::ActivationGradType_LEAKY_RELU; | |||
| using mindspore::schema::ActivationGradType_RELU; | |||
| using mindspore::schema::ActivationGradType_RELU6; | |||
| using mindspore::schema::ActivationType_HSWISH; | |||
| using mindspore::schema::ActivationType_LEAKY_RELU; | |||
| using mindspore::schema::ActivationType_RELU; | |||
| using mindspore::schema::ActivationType_RELU6; | |||
| using mindspore::schema::PrimitiveType_ActivationGrad; | |||
| namespace mindspore::kernel { | |||
| int ActivationGradCPUKernel::Init() { | |||
| outputs_[0]->set_shape(inputs_[0]->shape()); | |||
| return RET_OK; | |||
| } | |||
| int ActivationGradCPUKernel::Init() { return RET_OK; } | |||
| int ActivationGradCPUKernel::ReSize() { return RET_OK; } | |||
| int ActivationGradCPUKernel::DoActivation(int task_id) { | |||
| auto yt_addr = reinterpret_cast<float *>(inputs_.at(0)->Data()); | |||
| auto input_addr = reinterpret_cast<float *>(inputs_.at(1)->Data()); | |||
| auto output_addr = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| auto length = inputs_.at(0)->ElementsNum(); | |||
| auto yt_addr = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto input_addr = reinterpret_cast<float *>(in_tensors_.at(1)->Data()); | |||
| auto output_addr = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| int length = in_tensors_.at(0)->ElementsNum(); | |||
| auto error_code = RET_OK; | |||
| if (type_ == schema::ActivationGradType_RELU) { | |||
| if (param_act_grad_->type_ == schema::ActivationType_RELU) { | |||
| error_code = ReluGrad(yt_addr, input_addr, length, output_addr); | |||
| } else if (type_ == schema::ActivationGradType_RELU6) { | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_RELU6) { | |||
| error_code = Relu6Grad(yt_addr, input_addr, length, output_addr); | |||
| } else if (type_ == schema::ActivationGradType_LEAKY_RELU) { | |||
| error_code = LReluGrad(yt_addr, input_addr, length, output_addr, alpha_); | |||
| } else if (type_ == schema::ActivationGradType_SIGMOID) { | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_LEAKY_RELU) { | |||
| error_code = LReluGrad(yt_addr, input_addr, length, output_addr, param_act_grad_->alpha_); | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_SIGMOID) { | |||
| error_code = SigmoidGrad(yt_addr, input_addr, length, output_addr); | |||
| } else if (type_ == schema::ActivationGradType_TANH) { | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_TANH) { | |||
| error_code = TanhGrad(yt_addr, input_addr, length, output_addr); | |||
| } else if (type_ == schema::ActivationGradType_HSWISH) { | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_HSWISH) { | |||
| error_code = HSwishGrad(yt_addr, input_addr, length, output_addr); | |||
| } else if (type_ == schema::ActivationGradType_HSIGMOID) { | |||
| } else if (param_act_grad_->type_ == schema::ActivationType_HSIGMOID) { | |||
| error_code = HSigmoidGrad(yt_addr, input_addr, length, output_addr); | |||
| } else { | |||
| MS_LOG(ERROR) << "Activation type error"; | |||
| @@ -81,6 +79,12 @@ int ActivationGradRun(void *cdata, int task_id) { | |||
| } | |||
| int ActivationGradCPUKernel::Run() { | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return ret; | |||
| } | |||
| int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationGradRun, this, thread_count_); | |||
| if (error_code != RET_OK) { | |||
| MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; | |||
| @@ -20,8 +20,7 @@ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "nnacl/activation_grad.h" | |||
| #include "nnacl/fp32/activation.h" | |||
| namespace mindspore::kernel { | |||
| class ActivationGradCPUKernel : public LiteKernel { | |||
| @@ -30,9 +29,7 @@ class ActivationGradCPUKernel : public LiteKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(param, inputs, outputs, ctx, primitive) { | |||
| ActivationGradParameter *param_act_grad = reinterpret_cast<ActivationGradParameter *>(param); | |||
| type_ = param_act_grad->type_; | |||
| alpha_ = param_act_grad->alpha_; | |||
| param_act_grad_ = reinterpret_cast<ActivationParameter *>(param); | |||
| } | |||
| ~ActivationGradCPUKernel() override = default; | |||
| @@ -43,9 +40,9 @@ class ActivationGradCPUKernel : public LiteKernel { | |||
| private: | |||
| int thread_count_; | |||
| int type_; | |||
| float alpha_; | |||
| ActivationParameter *param_act_grad_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_ACTIVATION_GRAD_H_ | |||
| @@ -0,0 +1,105 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32_grad/apply_momentum.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/kernel/arm/fp32/nchw2nhwc.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_ApplyMomentum; | |||
| namespace mindspore::kernel { | |||
| int ApplyMomentumCPUKernel::ReSize() { return RET_OK; } | |||
| int ApplyMomentumCPUKernel::Run() { | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| auto weight = reinterpret_cast<float *>(in_tensors_[0]->Data()); | |||
| auto accumulate = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| float learning_rate = reinterpret_cast<float *>(in_tensors_[2]->Data())[0]; | |||
| auto gradient = reinterpret_cast<float *>(in_tensors_[3]->Data()); | |||
| float moment = reinterpret_cast<float *>(in_tensors_[4]->Data())[0]; | |||
| size_t elem_num = in_tensors_[0]->ElementsNum(); | |||
| // align format | |||
| if (in_tensors_[3]->shape().size() == 4 && | |||
| in_tensors_[3]->GetFormat() == schema::Format_NCHW && | |||
| in_tensors_[0]->GetFormat() == schema::Format_KHWC) { | |||
| PackNCHWToNHWCFp32(gradient, workspace, in_tensors_[0]->Batch(), in_tensors_[0]->Height() * in_tensors_[0]->Width(), | |||
| in_tensors_[0]->Channel()); | |||
| } else { | |||
| memcpy(workspace, gradient, in_tensors_[3]->ElementsNum() * sizeof(float)); | |||
| } | |||
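| // Standard momentum update on the (NHWC-aligned) gradient: | |||
| // accumulate = moment * accumulate + grad; weight -= learning_rate * accumulate. | |||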
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + workspace[i]; // * (1.0 - moment); | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int ApplyMomentumCPUKernel::Init() { | |||
| // Test-only workaround: zero the accumulator, whose backing data may be uninitialized. | |||
| size_t elem_num = in_tensors_[0]->ElementsNum(); | |||
| auto accumulate = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| for (size_t i = 0; i < elem_num; i++) accumulate[i] = 0.0f; | |||
| workspace = new float[elem_num]; | |||
| return RET_OK; | |||
| } | |||
| #if 0 | |||
| OpParameter *PopulateApplyMomentumParameter(const lite::Primitive *primitive) { | |||
| OpParameter *param = new (std::nothrow) OpParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for OptMomentum failed."; | |||
| return nullptr; | |||
| } | |||
| param->type_ = primitive->Type(); | |||
| return param; | |||
| } | |||
| #endif | |||
| kernel::LiteKernel *CpuApplyMomentumFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_ApplyMomentum); | |||
| auto *kernel = new (std::nothrow) ApplyMomentumCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| MS_ASSERT(kernel != nullptr); | |||
| auto ret = kernel->Init(); | |||
| if (0 != ret) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_ApplyMomentum, CpuApplyMomentumFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -14,28 +14,32 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_APPLY_MOMENTUM_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_APPLY_MOMENTUM_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| namespace mindspore::kernel { | |||
| class OptMomentumCPUKernel : public LiteKernel { | |||
| class ApplyMomentumCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit OptMomentumCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| explicit ApplyMomentumCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~OptMomentumCPUKernel() override {} | |||
| ~ApplyMomentumCPUKernel() override { delete[] workspace; } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| float *workspace = nullptr; | |||
| }; | |||
| // OpParameter *PopulateApplyMomentumParameter(const lite::Primitive *primitive); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_OPT_MOMENTUM_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_APPLY_MOMENTUM_H_ | |||
| @@ -14,11 +14,11 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32_grad/arithmetic_grad.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "nnacl/fp32_grad/reduce_grad.h" | |||
| #include "nnacl/fp32_grad/arithmetic_grad.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/arithmetic_grad.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| @@ -33,108 +33,41 @@ constexpr int kArithGradOpOutputNum = 2; | |||
| } // namespace | |||
| int ArithmeticGradCPUKernel::Init() { | |||
| auto ret = InferShape(); | |||
| return ret; | |||
| } | |||
| int ArithmeticGradCPUKernel::InferShape() { | |||
| if (inputs_.size() != kArithGradOpInputNum) { | |||
| MS_LOG(ERROR) << "The number of input must be " << kArithGradOpInputNum; | |||
| return RET_ERROR; | |||
| } | |||
| if (outputs_.size() != kArithGradOpOutputNum) { | |||
| MS_LOG(ERROR) << "The number of output must be " << kArithGradOpOutputNum; | |||
| return RET_ERROR; | |||
| } | |||
| auto dy = inputs_[0]; | |||
| auto x1 = inputs_[1]; | |||
| auto x2 = inputs_[2]; | |||
| auto dx1 = outputs_[0]; | |||
| auto dx2 = outputs_[1]; | |||
| auto dx1 = out_tensors_[0]; | |||
| auto dx2 = out_tensors_[1]; | |||
| MS_ASSERT(dy != nullptr); | |||
| MS_ASSERT(x1 != nullptr); | |||
| MS_ASSERT(x2 != nullptr); | |||
| MS_ASSERT(dx1 != nullptr); | |||
| MS_ASSERT(dx2 != nullptr); | |||
| auto inShape0 = x1->shape(); | |||
| auto inShape1 = x2->shape(); | |||
| auto outShape = dy->shape(); | |||
| if ((type() == PrimitiveType_AddGrad) || (type() == PrimitiveType_SubGrad)) { | |||
| arithmeticParameter_->ndim_ = outShape.size(); | |||
| auto fillDimNum0 = outShape.size() - inShape0.size(); | |||
| auto fillDimNum1 = outShape.size() - inShape1.size(); | |||
| int j0 = 0; | |||
| int j1 = 0; | |||
| for (unsigned int i = 0; i < outShape.size(); i++) { | |||
| arithmeticParameter_->in_shape0_[i] = (i < fillDimNum0) ? 1 : inShape0[j0++]; | |||
| arithmeticParameter_->in_shape1_[i] = (i < fillDimNum1) ? 1 : inShape1[j1++]; | |||
| arithmeticParameter_->out_shape_[i] = outShape[i]; | |||
| } | |||
| } else { | |||
| if ((Type() == PrimitiveType_MulGrad) || (Type() == PrimitiveType_DivGrad)) { | |||
| // if (inShape0.size() < inShape1.size()) | |||
| if (dx1->ElementsNum() < dx2->ElementsNum()) { | |||
| arithmeticParameter_->ndim_ = inShape1.size(); | |||
| if (type() == PrimitiveType_MulGrad) | |||
| if (Type() == PrimitiveType_MulGrad) | |||
| arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul2L; | |||
| else if (type() == PrimitiveType_DivGrad) | |||
| else if (Type() == PrimitiveType_DivGrad) | |||
| arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv2L; | |||
| auto fillDimNum = inShape1.size() - inShape0.size(); // This will not work for batch! | |||
| int j = 0; | |||
| for (unsigned int i = 0; i < inShape1.size(); i++) { | |||
| if (i < fillDimNum) { | |||
| arithmeticParameter_->in_shape1_[i] = 1; | |||
| } else { | |||
| arithmeticParameter_->in_shape1_[i] = inShape0[j++]; | |||
| } | |||
| arithmeticParameter_->in_shape0_[i] = inShape1[i]; | |||
| arithmeticParameter_->out_shape_[i] = outShape[i]; | |||
| } | |||
| } else if (dx2->ElementsNum() < dx1->ElementsNum()) { // if (inShape0.size() > inShape1.size()) | |||
| arithmeticParameter_->ndim_ = inShape0.size(); | |||
| if (type() == PrimitiveType_MulGrad) | |||
| if (Type() == PrimitiveType_MulGrad) | |||
| arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul1L; | |||
| else if (type() == PrimitiveType_DivGrad) | |||
| else if (Type() == PrimitiveType_DivGrad) | |||
| arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradDiv1L; | |||
| arithmeticParameter_->broadcasting_ = true; | |||
| arithmeticParameter_->ndim_ = inShape0.size(); | |||
| int j = 0; | |||
| auto fillDimNum = inShape0.size() - inShape1.size(); | |||
| for (unsigned int i = 0; i < inShape0.size(); i++) { | |||
| if (i < fillDimNum) { | |||
| arithmeticParameter_->in_shape1_[i] = 1; | |||
| } else { | |||
| arithmeticParameter_->in_shape1_[i] = inShape1[j++]; | |||
| } | |||
| arithmeticParameter_->in_shape0_[i] = inShape0[i]; | |||
| arithmeticParameter_->out_shape_[i] = outShape[i]; | |||
| } | |||
| } else { | |||
| arithmeticParameter_->broadcasting_ = false; | |||
| for (unsigned int i = 0; i < inShape0.size(); i++) { | |||
| arithmeticParameter_->in_shape1_[i] = inShape1[i]; | |||
| arithmeticParameter_->in_shape0_[i] = inShape0[i]; | |||
| arithmeticParameter_->out_shape_[i] = outShape[i]; | |||
| } | |||
| } | |||
| tile_data0 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()]; | |||
| tile_data0 = new (std::nothrow) float[in_tensors_.at(0)->ElementsNum()]; | |||
| if (tile_data0 == nullptr) { | |||
| MS_LOG(ERROR) << "new data0 fail!"; | |||
| return RET_ERROR; | |||
| } | |||
| tile_data1 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()]; | |||
| tile_data1 = new (std::nothrow) float[in_tensors_.at(0)->ElementsNum()]; | |||
| if (tile_data1 == nullptr) { | |||
| MS_LOG(ERROR) << "new data1 fail!"; | |||
| delete tile_data0; | |||
| return RET_ERROR; | |||
| } | |||
| if (type() == PrimitiveType_DivGrad) { | |||
| tile_data2 = new (std::nothrow) float[inputs_.at(0)->ElementsNum()]; | |||
| if (Type() == PrimitiveType_DivGrad) { | |||
| tile_data2 = new (std::nothrow) float[in_tensors_.at(0)->ElementsNum()]; | |||
| if (tile_data2 == nullptr) { | |||
| MS_LOG(ERROR) << "new data2 fail!"; | |||
| delete tile_data0; | |||
| @@ -144,10 +77,6 @@ int ArithmeticGradCPUKernel::InferShape() { | |||
| } | |||
| } | |||
| dx1->set_shape(x1->shape()); | |||
| dx2->set_shape(x2->shape()); | |||
| dx1->set_data_type(dy->data_type()); | |||
| dx2->set_data_type(dy->data_type()); | |||
| return RET_OK; | |||
| } | |||
| @@ -187,16 +116,16 @@ void ArithmeticGradCPUKernel::ArithmeticGradSub(float *dy, int dy_size, float *d | |||
| void ArithmeticGradCPUKernel::ArithmeticGradMul(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1_data = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
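| // Product rule: dL/dx1 = dy * x2 and dL/dx2 = dy * x1, all elementwise over dy_size elements. | |||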
| ElementMul(dy, x1_data, dx2, dy_size); | |||
| ElementMul(dy, x2_data, dx1, dy_size); | |||
| } | |||
| void ArithmeticGradCPUKernel::ArithmeticGradMul1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1_data = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
| ElementMul(dy, x1_data, tile_data0, dy_size); | |||
| ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx2, arithmeticParameter_->in_shape1_, | |||
| arithmeticParameter_->ndim_); | |||
| @@ -206,8 +135,8 @@ void ArithmeticGradCPUKernel::ArithmeticGradMul1L(float *dy, int dy_size, float | |||
| void ArithmeticGradCPUKernel::ArithmeticGradMul2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1_data = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
| ElementMul(dy, x2_data, tile_data0, dy_size); | |||
| ReduceSumByAxes(tile_data0, arithmeticParameter_->in_shape0_, dx1, arithmeticParameter_->in_shape1_, | |||
| arithmeticParameter_->ndim_); | |||
| @@ -217,16 +146,16 @@ void ArithmeticGradCPUKernel::ArithmeticGradMul2L(float *dy, int dy_size, float | |||
| void ArithmeticGradCPUKernel::ArithmeticGradDiv(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1 = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2 = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1 = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2 = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
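| // Quotient rule: dx1 = dy / x2 and dx2 = -dy * x1 / (x2 * x2), the latter via ElementMulAndDivNegSquare. | |||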
| ElementDiv(dy, x2, dx1, dy_size); | |||
| ElementMulAndDivNegSquare(dy, x1, x2, dx2, dy_size); | |||
| } | |||
| void ArithmeticGradCPUKernel::ArithmeticGradDiv1L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1_data = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
| ElementMul(x2_data, x2_data, dx2, dx2_size); | |||
| ElementMul(x1_data, dy, dx1, dy_size); // use dx1 buffer | |||
| @@ -243,8 +172,8 @@ void ArithmeticGradCPUKernel::ArithmeticGradDiv1L(float *dy, int dy_size, float | |||
| void ArithmeticGradCPUKernel::ArithmeticGradDiv2L(float *dy, int dy_size, float *dx1, int dx1_size, float *dx2, | |||
| int dx2_size) { | |||
| auto x1_data = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(inputs_[2]->Data()); | |||
| auto x1_data = reinterpret_cast<float *>(in_tensors_[1]->Data()); | |||
| auto x2_data = reinterpret_cast<float *>(in_tensors_[2]->Data()); | |||
| // dx1 = dy/x2 | |||
| ElementDiv(dy, x2_data, tile_data0, dy_size); // first multiply into temp | |||
| @@ -259,13 +188,13 @@ void ArithmeticGradCPUKernel::ArithmeticGradDiv2L(float *dy, int dy_size, float | |||
| int ArithmeticGradCPUKernel::ReSize() { return RET_OK; } | |||
| int ArithmeticGradCPUKernel::Run() { | |||
| auto dy = reinterpret_cast<float *>(inputs_[0]->Data()); | |||
| auto dx1 = reinterpret_cast<float *>(outputs_[0]->Data()); | |||
| auto dx2 = reinterpret_cast<float *>(outputs_[1]->Data()); | |||
| auto dy = reinterpret_cast<float *>(in_tensors_[0]->Data()); | |||
| auto dx1 = reinterpret_cast<float *>(out_tensors_[0]->Data()); | |||
| auto dx2 = reinterpret_cast<float *>(out_tensors_[1]->Data()); | |||
| size_t dy_size = inputs_.at(0)->ElementsNum(); | |||
| size_t dx1_size = outputs_.at(0)->ElementsNum(); | |||
| size_t dx2_size = outputs_[1]->ElementsNum(); | |||
| size_t dy_size = in_tensors_.at(0)->ElementsNum(); | |||
| size_t dx1_size = out_tensors_.at(0)->ElementsNum(); | |||
| size_t dx2_size = out_tensors_[1]->ElementsNum(); | |||
| (this->*arithmetic_grad_)(dy, dy_size, dx1, dx1_size, dx2, dx2_size); | |||
| return RET_OK; | |||
| } | |||
| @@ -40,7 +40,7 @@ class ArithmeticGradCPUKernel : public LiteKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive), tile_data0(NULL), tile_data1(NULL), tile_data2(NULL) { | |||
| switch (type()) { | |||
| switch (Type()) { | |||
| case PrimitiveType_MulGrad: | |||
| arithmetic_grad_ = &ArithmeticGradCPUKernel::ArithmeticGradMul; // this will be adjusted in InferShape | |||
| break; | |||
| @@ -27,33 +27,9 @@ using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_BiasGrad; | |||
| namespace mindspore::kernel { | |||
| int BiasGradCPUKernel::InferShape() { | |||
| if (1 != this->inputs_.size()) { | |||
| MS_LOG(ERROR) << "BiasGrad should have one input"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != this->outputs_.size()) { | |||
| MS_LOG(ERROR) << "BiasGrad should have one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs_.front(); | |||
| auto *out = outputs_.front(); | |||
| MS_ASSERT(in0 != nullptr); | |||
| MS_ASSERT(out != nullptr); | |||
| auto inshape = in0->shape(); | |||
| int ndim = inshape.size(); | |||
| for (int i = 0; i < ndim - 1; i++) { | |||
| inshape[i] = 1; | |||
| } | |||
| out->set_shape(inshape); | |||
| out->set_data_type(in0->data_type()); | |||
| return RET_OK; | |||
| } | |||
| int BiasGradCPUKernel::Init() { | |||
| MS_ASSERT(InferShape() == RET_OK); | |||
| auto dims = inputs_[0]->shape(); | |||
| auto dims = in_tensors_[0]->shape(); | |||
| bias_param->ndim_ = dims.size(); | |||
| for (unsigned int i = 0; i < bias_param->ndim_; i++) { | |||
| bias_param->in_shape0_[i] = dims[i]; | |||
| @@ -75,8 +51,8 @@ int BiasGradCPUKernel::Run() { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto in = reinterpret_cast<float *>(inputs_.at(0)->Data()); | |||
| auto out = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| auto in = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto out = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| size_t nhw_size = 1; | |||
| size_t channels = bias_param->in_shape0_[bias_param->ndim_ - 1]; // C in NHWC | |||
| @@ -14,8 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BIAS_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BIAS_GRAD_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| @@ -35,7 +35,6 @@ class BiasGradCPUKernel : public LiteKernel { | |||
| ~BiasGradCPUKernel() override = default; | |||
| int Init() override; | |||
| int InferShape(); | |||
| int ReSize() override; | |||
| int Run() override; | |||
| @@ -44,4 +43,4 @@ class BiasGradCPUKernel : public LiteKernel { | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BIAS_GRAD_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BIAS_GRAD_H_ | |||
| @@ -14,11 +14,11 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32_grad/bn_grad.h" | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/bn_grad.h" | |||
| #include "nnacl/fp32_grad/batch_norm.h" | |||
| #include "include/errorcode.h" | |||
| @@ -27,79 +27,103 @@ using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| // using mindspore::lite::REG_OP; | |||
| using mindspore::schema::PrimitiveType_BNGradInput; | |||
| using mindspore::schema::PrimitiveType_BNGrad; | |||
| /* Input tensor order: | |||
| * 0: dy, 1: x, 2: scale, 3: save_mean, 4: save_inv_variance | |||
| */ | |||
| namespace mindspore::kernel { | |||
| int BNGradInputCPUKernel::Init() { | |||
| auto bn_param = reinterpret_cast<bnParameter *>(opParameter); | |||
| workspace_size = 5 * bn_param->channels; | |||
| workspace = new (std::nothrow) float[workspace_size]; | |||
| if (workspace == nullptr) { | |||
| MS_LOG(ERROR) << "new workspace fail!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (2 != this->inputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs"; | |||
| return RET_ERROR; | |||
| #if 0 | |||
| OpParameter *PopulateBNGradParameter(const lite::Primitive *primitive) { | |||
| BNGradParameter *param = new (std::nothrow) BNGradParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for conv grad filter failed."; | |||
| return nullptr; | |||
| } | |||
| if (1 != this->outputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has one output"; | |||
| param->op_parameter_.type_ = primitive->Type(); | |||
| auto bngrad_primitive = primitive->Value()->value_as_BNGrad(); | |||
| param->epsilon_ = bngrad_primitive->eps(); | |||
| param->momentum_ = bngrad_primitive->momentum(); | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| #endif | |||
| int BNGradCPUKernel::Init() { | |||
| auto *input_x = in_tensors_.at(1); | |||
| int channels = input_x->shape().at(kNHWC_C); | |||
| workspace_size = 5 * channels; | |||
| workspace = new (std::nothrow) float[workspace_size]; | |||
| if (workspace == nullptr) { | |||
| MS_LOG(ERROR) << "new workspace fail!"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *input_tensor = inputs_.at(0); | |||
| auto *out_tensor = outputs_.at(0); | |||
| auto in_shape = input_tensor->shape(); | |||
| out_tensor->set_shape(in_shape); | |||
| out_tensor->set_data_type(input_tensor->data_type()); | |||
| return RET_OK; | |||
| } | |||
| int BNGradInputCPUKernel::ReSize() { return RET_OK; } | |||
| int BNGradCPUKernel::ReSize() { return RET_OK; } | |||
| int BNGradInputCPUKernel::Run() { | |||
| auto *input_x = inputs_.at(0); | |||
| auto *input_yt = inputs_.at(1); | |||
| auto *input_scale = inputs_.at(2); | |||
| auto *output_grad = outputs_.at(0); | |||
| auto bn_param = reinterpret_cast<bnParameter *>(opParameter); | |||
| int batch = bn_param->batch; | |||
| int channels = bn_param->channels; | |||
| int spatial = bn_param->spatial; | |||
| float eps = bn_param->eps; | |||
| int BNGradCPUKernel::Run() { | |||
| // std::cout << "run succ" << std::endl; | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| auto bn_param = reinterpret_cast<BNGradParameter *>(op_parameter_); | |||
| auto *input_yt = in_tensors_.at(0); | |||
| auto *input_x = in_tensors_.at(1); | |||
| auto *input_scale = in_tensors_.at(2); | |||
| auto *output_dx = out_tensors_.at(0); | |||
| auto *output_scale = out_tensors_.at(1); | |||
| auto *output_bias = out_tensors_.at(2); | |||
| // Tensor *bias = input[5]; | |||
| int batch = input_x->Batch(); | |||
| int channels = input_x->Channel(); | |||
| int spatial = input_x->Height() * input_x->Width(); | |||
| float eps = bn_param->epsilon_; | |||
| std::fill(workspace, workspace + workspace_size, 0.f); | |||
| float *mean = workspace; | |||
| float *variance = mean + channels; | |||
| float *mean_delta = variance + channels; | |||
| float *invar = mean + channels; | |||
| float *mean_delta = invar + channels; | |||
| float *variance_delta = mean_delta + channels; | |||
| float *mean_add_delta = variance_delta + channels; | |||
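| // The workspace holds five channel-sized buffers laid out back to back: mean, invar, | |||
| // mean_delta, variance_delta and mean_add_delta (cf. workspace_size = 5 * channels in Init()). | |||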
| float *x = reinterpret_cast<float *>(input_x->Data()); | |||
| float *yt = reinterpret_cast<float *>(input_yt->Data()); | |||
| float *scale = reinterpret_cast<float *>(input_scale->Data()); | |||
| float *out = reinterpret_cast<float *>(output_grad->Data()); | |||
| float *dx = reinterpret_cast<float *>(output_dx->Data()); | |||
| float *dscale = reinterpret_cast<float *>(output_scale->Data()); | |||
| float *dbias = reinterpret_cast<float *>(output_bias->Data()); | |||
| std::copy(yt, yt + batch * channels * spatial, out); | |||
| meanVar(x, batch, spatial, channels, mean, variance); | |||
| scaleBias(scale, batch, channels, spatial, out); | |||
| meanDelta(out, spatial, channels, eps, variance, mean_delta); | |||
| varianceDelta(x, out, mean, variance, batch, channels, spatial, eps, variance_delta); | |||
| std::copy(yt, yt + batch * channels * spatial, dx); | |||
| meanVar(x, batch, spatial, channels, eps, mean, invar); | |||
| scaleBias(scale, batch, channels, spatial, dx); | |||
| meanDelta(dx, spatial, channels, invar, mean_delta); | |||
| varianceDelta(x, dx, mean, invar, batch, channels, spatial, variance_delta); | |||
| meanAdd(x, mean, variance_delta, batch, channels, spatial, mean_add_delta, mean_delta); | |||
| NormalizeDelta(x, mean, variance, mean_delta, variance_delta, batch, channels, eps, spatial, out); | |||
| NormalizeDelta(x, mean, invar, mean_delta, variance_delta, batch, channels, spatial, dx); | |||
| // dbias | |||
| sumSpatialBatch(yt, batch * spatial, channels, dbias); | |||
| // dscale | |||
| backwardScale(x, mean, invar, yt, batch, channels, spatial, dscale); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuBNGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| kernel::LiteKernel *CpuBNGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_BNGradInput); | |||
| auto *kernel = new (std::nothrow) BNGradInputCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_BNGrad); | |||
| auto *kernel = new (std::nothrow) BNGradCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "new BNGradInputCPUKernel fail!"; | |||
| MS_LOG(ERROR) << "new BNGradCPUKernel fail!"; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| @@ -112,5 +136,5 @@ kernel::LiteKernel *CpuBNGradInputFp32KernelCreator(const std::vector<lite::tens | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BNGradInput, CpuBNGradInputFp32KernelCreator) | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BNGrad, CpuBNGradFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -14,21 +14,25 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BNGRAD_INPUT_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BNGRAD_INPUT_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BN_GRAD_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BN_GRAD_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| namespace mindspore::kernel { | |||
| class BNGradInputCPUKernel : public LiteKernel { | |||
| class BNGradCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit BNGradInputCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| explicit BNGradCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~BNGradInputCPUKernel() override { delete workspace; } | |||
| ~BNGradCPUKernel() override { delete[] workspace; } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -38,5 +42,8 @@ class BNGradInputCPUKernel : public LiteKernel { | |||
| float *workspace; | |||
| int workspace_size; | |||
| }; | |||
| // OpParameter *PopulateBNGradParameter(const lite::Primitive *primitive); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BNGRAD_INPUT_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_BN_GRAD_H_ | |||
| @@ -0,0 +1,121 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/runtime/kernel/arm/fp32_grad/convolution.h" | |||
| #include "nnacl/fp32_grad/pack_ext.h" | |||
| #include "nnacl/fp32_grad/gemm.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| int ConvolutionTrainCPUKernel::Init() { | |||
| auto conv_param_ = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| auto *input_x = in_tensors_.at(kInputIndex); | |||
| auto *input_weight = in_tensors_.at(kWeightIndex); | |||
| auto *out_y = out_tensors_.at(kOutputIndex); | |||
| conv_param_->output_batch_ = out_y->shape().at(kNHWC_N); | |||
| conv_param_->input_batch_ = input_x->shape().at(kNHWC_N); | |||
| conv_param_->input_h_ = input_x->shape().at(kNHWC_H); | |||
| conv_param_->input_w_ = input_x->shape().at(kNHWC_W); | |||
| conv_param_->output_h_ = out_y->shape().at(kNHWC_H); | |||
| conv_param_->output_w_ = out_y->shape().at(kNHWC_W); | |||
| conv_param_->input_channel_ = input_x->shape().at(kNHWC_C); | |||
| conv_param_->output_channel_ = input_weight->shape().at(kNHWC_N); | |||
| conv_param_->kernel_h_ = input_weight->shape().at(kNHWC_H); | |||
| conv_param_->kernel_w_ = input_weight->shape().at(kNHWC_W); | |||
| int ws_size = conv_param_->output_h_ * conv_param_->output_w_ * conv_param_->kernel_h_ * conv_param_->kernel_w_ * | |||
| conv_param_->input_channel_ / conv_param_->group_; | |||
| workspace = new float[ws_size]; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionTrainCPUKernel::ReSize() { return RET_OK; } | |||
| int ConvolutionTrainCPUKernel::Run() { | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| auto conv_param_ = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| auto *input_x = in_tensors_.at(kInputIndex); | |||
| auto *input_w = in_tensors_.at(kWeightIndex); | |||
| auto *out_y = out_tensors_.at(kOutputIndex); | |||
| auto x_addr = reinterpret_cast<float *>(input_x->Data()); | |||
| auto y_addr = reinterpret_cast<float *>(out_y->Data()); | |||
| auto w_addr = reinterpret_cast<float *>(input_w->Data()); | |||
| int i, j; | |||
| int nweights = input_w->ElementsNum(); | |||
| int in_ch = conv_param_->input_channel_; | |||
| int in_h = conv_param_->input_h_; | |||
| int in_w = conv_param_->input_w_; | |||
| int k_h = conv_param_->kernel_h_; | |||
| int k_w = conv_param_->kernel_w_; | |||
| int batch = conv_param_->output_batch_; | |||
| int out_ch = conv_param_->output_channel_; // out_y->shape()[3]; | |||
| int groups = conv_param_->group_; | |||
| int out_h = conv_param_->output_h_; | |||
| int out_w = conv_param_->output_w_; | |||
| int m = out_h * out_w; | |||
| int n = out_ch / groups; | |||
| int k = k_h * k_w * in_ch / groups; | |||
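| // Per image and group: im2col is assumed to lay out one k-element patch per output position | |||
| // (m = out_h * out_w rows), and the GEMM multiplies it by the transposed n x k weight slice, | |||
| // accumulating the m x n result into the NHWC output with row stride out_ch. | |||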
| memset(y_addr, 0, out_y->Size()); | |||
| for (i = 0; i < batch; ++i) { | |||
| for (j = 0; j < groups; ++j) { | |||
| float *mat_a = workspace; | |||
| float *mat_b = w_addr + j * nweights / groups; | |||
| float *mat_c = y_addr + (i * groups) * n * m + j * (out_ch / groups); | |||
| float *im = x_addr + (i * groups) * (in_ch / groups) * in_h * in_w + j * (in_ch / groups); | |||
| im2col_hwc(im, mat_a, conv_param_); | |||
| gemm(0, 1, m, n, k, 1, mat_a, k, mat_b, k, 1, mat_c, out_ch); | |||
| } | |||
| } | |||
| // std::cout << "run succ" << std::endl; | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuConvTrainFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_Conv2D); | |||
| auto *kernel = new (std::nothrow) ConvolutionTrainCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| MS_ASSERT(kernel != nullptr); | |||
| auto ret = kernel->Init(); | |||
| if (RET_OK != ret) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,47 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_CONVOLUTION_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_CONVOLUTION_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| namespace mindspore::kernel { | |||
| class ConvolutionTrainCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit ConvolutionTrainCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionTrainCPUKernel() override { delete [] workspace; } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| float *workspace = nullptr; | |||
| }; | |||
| kernel::LiteKernel *CpuConvTrainFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::PrimitiveC *primitive); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_CONVOLUTION_H_ | |||
| @@ -33,30 +33,24 @@ int ConvolutionGradFilterCPUKernel::Init() { | |||
| // x is in input 1 | |||
| // dw is output 0 | |||
| if (2 != this->inputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != this->outputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *input_tensor = inputs_.at(1); | |||
| MS_ASSERT(input_tensor != nullptr); | |||
| auto *dy = inputs_.at(0); | |||
| MS_ASSERT(dy != nullptr); | |||
| auto *weight_tensor = outputs_.at(0); | |||
| auto *x_tensor = in_tensors_.at(1); | |||
| MS_ASSERT(x_tensor != nullptr); | |||
| auto *dy_tensor = in_tensors_.at(0); | |||
| MS_ASSERT(dy_tensor != nullptr); | |||
| auto *weight_tensor = out_tensors_.at(0); | |||
| MS_ASSERT(weight_tensor != nullptr); | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| conv_param->output_batch_ = this->inputs_.at(0)->shape().at(kNHWC_N); | |||
| conv_param->input_batch_ = this->inputs_.at(1)->shape().at(kNHWC_N); | |||
| conv_param->input_h_ = this->inputs_.at(1)->shape().at(kNHWC_H); | |||
| conv_param->input_w_ = this->inputs_.at(1)->shape().at(kNHWC_W); | |||
| // assume OutCh|kh|kw|In | |||
| conv_param->input_channel_ = this->inputs_.at(1)->shape().at(kNHWC_C); | |||
| conv_param->output_channel_ = this->outputs_.at(0)->shape().at(kNHWC_N); | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| conv_param->output_batch_ = dy_tensor->shape().at(kNHWC_N); | |||
| conv_param->input_batch_ = x_tensor->shape().at(kNHWC_N); | |||
| conv_param->input_h_ = x_tensor->shape().at(kNHWC_H); | |||
| conv_param->input_w_ = x_tensor->shape().at(kNHWC_W); | |||
| // assume OutCh|kh|kw|InCh | |||
| conv_param->input_channel_ = x_tensor->shape().at(kNHWC_C); | |||
| conv_param->output_channel_ = dy_tensor->shape().at(kNHWC_C); | |||
| // TBD | |||
| conv_param->output_h_ = dy_tensor->shape()[kNHWC_H]; | |||
| conv_param->output_w_ = dy_tensor->shape()[kNHWC_W]; | |||
| int ws_size = conv_param->output_h_ * conv_param->output_w_ * conv_param->kernel_h_ * conv_param->kernel_w_ * | |||
| conv_param->input_channel_ / conv_param->group_; | |||
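| // Note: ws_size appears to be an im2col scratch buffer -- one | |||
| // (output_h * output_w) x (kernel_h * kernel_w * input_channel / group) float | |||
| // matrix, presumably filled per batch/group before the filter-gradient GEMM. | |||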
| @@ -67,34 +61,21 @@ int ConvolutionGradFilterCPUKernel::Init() { | |||
| return RET_ERROR; | |||
| } | |||
| int output_w = 0; | |||
| int output_h = 0; | |||
| output_h = dy->shape()[kNHWC_H]; | |||
| output_w = dy->shape()[kNHWC_W]; | |||
| std::vector<int> out_shape(4); | |||
| out_shape.at(0) = conv_param->output_channel_; | |||
| out_shape.at(1) = conv_param->kernel_h_; | |||
| out_shape.at(2) = conv_param->kernel_w_; | |||
| out_shape.at(3) = conv_param->input_channel_ / conv_param->group_; | |||
| // weight is output | |||
| weight_tensor->set_shape(out_shape); | |||
| weight_tensor->set_data_type(input_tensor->data_type()); | |||
| conv_param->output_h_ = output_h; | |||
| conv_param->output_w_ = output_w; | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionGradFilterCPUKernel::ReSize() { return 0; } | |||
| int ConvolutionGradFilterCPUKernel::ReSize() { return RET_OK; } | |||
| int ConvolutionGradFilterCPUKernel::Run() { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| auto *input_dy = inputs_.at(0); | |||
| auto *input_x = inputs_.at(1); | |||
| auto *out_dw = outputs_.at(0); | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| auto *input_dy = in_tensors_.at(0); | |||
| auto *input_x = in_tensors_.at(1); | |||
| auto *out_dw = out_tensors_.at(0); | |||
| auto x_addr = reinterpret_cast<float *>(input_x->Data()); | |||
| auto dy_addr = reinterpret_cast<float *>(input_dy->Data()); | |||
| @@ -135,7 +116,48 @@ int ConvolutionGradFilterCPUKernel::Run() { | |||
| // std::cout << "run succ" << std::endl; | |||
| return RET_OK; | |||
| } | |||
| #if 0 | |||
| OpParameter *PopulateConvolutionGradFilterParameter(const lite::Primitive *primitive) { | |||
| ConvParameter *param = new (std::nothrow) ConvParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for conv grad filter failed."; | |||
| return nullptr; | |||
| } | |||
| param->op_parameter_.type_ = primitive->Type(); | |||
| auto convg_primitive = primitive->Value()->value_as_Conv2DGradFilter(); | |||
| param->kernel_h_ = convg_primitive->kernelH(); | |||
| param->kernel_w_ = convg_primitive->kernelW(); | |||
| param->stride_h_ = convg_primitive->strideH(); | |||
| param->stride_w_ = convg_primitive->strideW(); | |||
| param->dilation_h_ = convg_primitive->dilateH(); | |||
| param->dilation_w_ = convg_primitive->dilateW(); | |||
| param->pad_h_ = convg_primitive->padUp(); | |||
| param->pad_w_ = convg_primitive->padLeft(); | |||
| param->pad_u_ = convg_primitive->padUp(); | |||
| param->pad_d_ = convg_primitive->padDown(); | |||
| param->pad_l_ = convg_primitive->padLeft(); | |||
| param->pad_r_ = convg_primitive->padRight(); | |||
| param->group_ = convg_primitive->group(); | |||
| auto act_type = convg_primitive->activationType(); | |||
| switch (act_type) { | |||
| case schema::ActivationType_RELU: | |||
| param->is_relu_ = true; | |||
| param->is_relu6_ = false; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| param->is_relu_ = false; | |||
| param->is_relu6_ = true; | |||
| break; | |||
| default: | |||
| param->is_relu_ = false; | |||
| param->is_relu6_ = false; | |||
| break; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| #endif | |||
| kernel::LiteKernel *CpuConvGradFilterFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| @@ -1,4 +1,4 @@ | |||
| /** | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| @@ -28,15 +28,17 @@ class ConvolutionGradFilterCPUKernel : public LiteKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionGradFilterCPUKernel() override { delete workspace; } | |||
| ~ConvolutionGradFilterCPUKernel() override { delete [] workspace; } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| float *workspace; | |||
| float *workspace = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_CONVOLUTION_GRAD_FILTER_H_ | |||
| @@ -29,23 +29,14 @@ using mindspore::schema::PrimitiveType_Conv2DGradInput; | |||
| namespace mindspore::kernel { | |||
| int ConvolutionGradInputCPUKernel::Init() { | |||
| if (2 != this->inputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has 2 inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != this->outputs_.size()) { | |||
| MS_LOG(ERROR) << "Conv2d Grad should has one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *dy_tensor = inputs_.at(kInputIndex); | |||
| auto *dy_tensor = in_tensors_.at(kInputIndex); | |||
| MS_ASSERT(dy_tensor != nullptr); | |||
| auto *weight_tensor = inputs_.at(kWeightIndex); | |||
| auto *weight_tensor = in_tensors_.at(kWeightIndex); | |||
| MS_ASSERT(weight_tensor != nullptr); | |||
| auto *dx_tensor = outputs_.at(kOutputIndex); | |||
| auto *dx_tensor = out_tensors_.at(kOutputIndex); | |||
| MS_ASSERT(dx_tensor != nullptr); | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| conv_param->output_batch_ = dx_tensor->shape()[(kNHWC_N)]; | |||
| conv_param->input_batch_ = dy_tensor->shape()[(kNHWC_N)]; | |||
| @@ -74,10 +65,16 @@ int ConvolutionGradInputCPUKernel::Init() { | |||
| int ConvolutionGradInputCPUKernel::ReSize() { return 0; } | |||
| int ConvolutionGradInputCPUKernel::Run() { | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(opParameter); | |||
| auto *input_dy = inputs_.at(0); | |||
| auto *input_w = inputs_.at(1); | |||
| auto *out_dx = outputs_.at(0); | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| auto conv_param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| auto *input_dy = in_tensors_.at(0); | |||
| auto *input_w = in_tensors_.at(1); | |||
| auto *out_dx = out_tensors_.at(0); | |||
| auto dy_addr = reinterpret_cast<float *>(input_dy->Data()); | |||
| auto w_addr = reinterpret_cast<float *>(input_w->Data()); | |||
| @@ -116,6 +113,49 @@ int ConvolutionGradInputCPUKernel::Run() { | |||
| return 0; | |||
| } | |||
| #if 0 | |||
| OpParameter *PopulateConvolutionGradInputParameter(const lite::Primitive *primitive) { | |||
| ConvParameter *param = new (std::nothrow) ConvParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for conv grad input failed."; | |||
| return nullptr; | |||
| } | |||
| param->op_parameter_.type_ = primitive->Type(); | |||
| auto convg_primitive = primitive->Value()->value_as_Conv2DGradInput(); | |||
| param->kernel_h_ = convg_primitive->kernelH(); | |||
| param->kernel_w_ = convg_primitive->kernelW(); | |||
| param->stride_h_ = convg_primitive->strideH(); | |||
| param->stride_w_ = convg_primitive->strideW(); | |||
| param->dilation_h_ = convg_primitive->dilateH(); | |||
| param->dilation_w_ = convg_primitive->dilateW(); | |||
| param->pad_h_ = convg_primitive->padUp(); | |||
| param->pad_w_ = convg_primitive->padLeft(); | |||
| param->pad_u_ = convg_primitive->padUp(); | |||
| param->pad_d_ = convg_primitive->padDown(); | |||
| param->pad_l_ = convg_primitive->padLeft(); | |||
| param->pad_r_ = convg_primitive->padRight(); | |||
| param->group_ = convg_primitive->group(); | |||
| auto act_type = convg_primitive->activationType(); | |||
| switch (act_type) { | |||
| case schema::ActivationType_RELU: | |||
| param->is_relu_ = true; | |||
| param->is_relu6_ = false; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| param->is_relu_ = false; | |||
| param->is_relu6_ = true; | |||
| break; | |||
| default: | |||
| param->is_relu_ = false; | |||
| param->is_relu6_ = false; | |||
| break; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| #endif | |||
| kernel::LiteKernel *CpuConvGradInputFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| @@ -28,7 +28,7 @@ class ConvolutionGradInputCPUKernel : public LiteKernel { | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~ConvolutionGradInputCPUKernel() override { delete workspace; } | |||
| ~ConvolutionGradInputCPUKernel() override { delete [] workspace; } | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -37,6 +37,9 @@ class ConvolutionGradInputCPUKernel : public LiteKernel { | |||
| private: | |||
| float *workspace; | |||
| }; | |||
| // OpParameter *PopulateConvolutionGradInputParameter(const lite::Primitive *primitive); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_CONVOLUTION_GRAD_INPUT_H | |||
| @@ -0,0 +1,73 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <vector> | |||
| #include "src/runtime/kernel/arm/fp32_grad/depend.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_Depend; | |||
| namespace mindspore::kernel { | |||
| int DependCPUKernel::Init() { | |||
| return RET_OK; | |||
| } | |||
| int DependCPUKernel::ReSize() { return 0; } | |||
| int DependCPUKernel::Run() { | |||
| #if 0 | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto in = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto out = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| memcpy(out, in, in_tensors_.at(0)->Size()); | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
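| // Depend only expresses an execution-order (control) dependency between nodes, | |||
| // so the kernel body is intentionally a no-op; the #if 0 block above sketches a | |||
| // pass-through copy should one ever be needed. | |||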
| kernel::LiteKernel *CpuDependFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_Depend); | |||
| auto *kernel = | |||
| new (std::nothrow) DependCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| MS_ASSERT(kernel != nullptr); | |||
| auto ret = kernel->Init(); | |||
| if (RET_OK != ret) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeBool, PrimitiveType_Depend, CpuDependFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_DEPEND_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_DEPEND_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "nnacl/fp32/arithmetic.h" | |||
| namespace mindspore::kernel { | |||
| class DependCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit DependCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| param = parameter; | |||
| } | |||
| ~DependCPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| OpParameter *param; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_DEPEND_H_ | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_MAKE_TUPLE_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_MAKE_TUPLE_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "src/runtime/kernel/arm/nnacl/fp32/arithmetic.h" | |||
| namespace mindspore::kernel { | |||
| class MakeTupleCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit MakeTupleCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::Primitive *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| param = parameter; | |||
| } | |||
| ~MakeTupleCPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| OpParameter *param; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_MAKE_TUPLE_H_ | |||
| @@ -1,87 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/opt_momentum.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_OptMomentum; | |||
| namespace mindspore::kernel { | |||
| int OptMomentumCPUKernel::ReSize() { return 0; } | |||
| int OptMomentumCPUKernel::Run() { | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| if (inputs_.size() != 5 || !outputs_.empty()) { | |||
| MS_LOG(ERROR) << "OptMomentumCPUKernel error input output size!"; | |||
| return RET_ERROR; | |||
| } | |||
| if (inputs_[0]->ElementsNum() != inputs_[1]->ElementsNum() || | |||
| inputs_[0]->ElementsNum() != inputs_[3]->ElementsNum()) { | |||
| MS_LOG(ERROR) << "error input data size!"; | |||
| return RET_ERROR; | |||
| } | |||
| auto weight = reinterpret_cast<float *>(inputs_[0]->Data()); | |||
| auto accumulate = reinterpret_cast<float *>(inputs_[1]->Data()); | |||
| float learning_rate = reinterpret_cast<float *>(inputs_[2]->Data())[0]; | |||
| auto gradient = reinterpret_cast<float *>(inputs_[3]->Data()); | |||
| float moment = reinterpret_cast<float *>(inputs_[4]->Data())[0]; | |||
| size_t elem_num = inputs_[0]->ElementsNum(); | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i]; | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int OptMomentumCPUKernel::Init() { return 0; } | |||
| kernel::LiteKernel *CpuOptMomentumFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_OptMomentum); | |||
| auto *kernel = new (std::nothrow) OptMomentumCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "new OptMomentumCPUKernel fail!"; | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (0 != ret) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_OptMomentum, CpuOptMomentumFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -20,6 +20,7 @@ | |||
| #include "nnacl/fp32/pooling.h" | |||
| #include "nnacl/fp32_grad/pooling_grad.h" | |||
| #include "include/errorcode.h" | |||
| // #include "src/train/ops/train_ops.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| @@ -29,9 +30,15 @@ using mindspore::schema::PrimitiveType_PoolingGrad; | |||
| namespace mindspore::kernel { | |||
| int PoolingGradCPUKernel::Init() { | |||
| PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter); | |||
| PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_); | |||
| auto in_shape = inputs_.at(0)->shape(); | |||
| auto in_shape = in_tensors_.at(0)->shape(); | |||
| auto out_shape = in_tensors_.at(1)->shape(); | |||
| if (pool_param->pool_mode_ == PoolMode_AvgPool) { | |||
| in_shape = in_tensors_.at(1)->shape(); | |||
| out_shape = in_tensors_.at(0)->shape(); | |||
| } | |||
| int input_h = in_shape.at(1); | |||
| int input_w = in_shape.at(2); | |||
| @@ -40,25 +47,39 @@ int PoolingGradCPUKernel::Init() { | |||
| pool_param->window_h_ = input_h; | |||
| } | |||
| pool_param->input_h_ = in_shape[kNHWC_H]; | |||
| pool_param->input_w_ = in_shape[kNHWC_W]; | |||
| pool_param->input_batch_ = in_shape[kNHWC_N]; | |||
| pool_param->input_channel_ = in_shape[kNHWC_C]; | |||
| // Emir -- here I assume we get the outputshape in the output tensor | |||
| auto *out_tensor = outputs_.front(); | |||
| auto out_shape = out_tensor->shape(); | |||
| // auto *out_tensor = out_tensors_.front(); | |||
| // auto out_shape = in_tensors_.at(1)->shape(); | |||
| pool_param->output_h_ = out_shape[kNHWC_H]; | |||
| pool_param->output_w_ = out_shape[kNHWC_W]; | |||
| pool_param->output_batch_ = out_shape[kNHWC_N]; | |||
| pool_param->output_channel_ = out_shape[kNHWC_C]; | |||
| out_tensor->set_shape(out_shape); | |||
| out_tensor->set_data_type(inputs_.at(0)->data_type()); | |||
| return RET_OK; | |||
| } | |||
| int PoolingGradCPUKernel::ReSize() { return RET_OK; } | |||
| int PoolingGradCPUKernel::Run() { | |||
| PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(opParameter); | |||
| auto input_ptr = reinterpret_cast<float *>(inputs_.at(0)->Data()); | |||
| auto output_ptr = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| auto prepare_ret = Prepare(); | |||
| if (prepare_ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; | |||
| return prepare_ret; | |||
| } | |||
| PoolingParameter *pool_param = reinterpret_cast<PoolingParameter *>(op_parameter_); | |||
| auto input_ptr = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto output_ptr = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| if (pool_param->pool_mode_ == PoolMode_MaxPool) { | |||
| auto ind = reinterpret_cast<int *>(inputs_.at(1)->Data()); | |||
| MaxPoolingGrad(input_ptr, ind, output_ptr, pool_param); | |||
| auto dx_ptr = reinterpret_cast<float *>(in_tensors_.at(1)->Data()); | |||
| auto dy_ptr = reinterpret_cast<float *>(in_tensors_.at(2)->Data()); | |||
| MaxPoolingGrad(input_ptr, dx_ptr, dy_ptr, output_ptr, pool_param); | |||
| } else { | |||
| AvgPoolingGrad(input_ptr, output_ptr, pool_param); | |||
| } | |||
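| // Rough meaning of the two branches (based on the call sites above): | |||
| //  - MaxPoolingGrad: each dy element is routed back to the input position that | |||
| //    won the max in its pooling window (recomputed from the x/dx tensors). | |||
| //  - AvgPoolingGrad: each dy element is spread uniformly over its window, i.e. | |||
| //    every contributing input receives dy / (window_h * window_w). | |||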
| @@ -43,6 +43,7 @@ class PoolingGradCPUKernel : public LiteKernel { | |||
| private: | |||
| uint8_t data_shape_{0}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_POOLING_GRAD_H_ | |||
| @@ -31,10 +31,10 @@ int PowerGradCPUKernel::Init() { return RET_OK; } | |||
| int PowerGradCPUKernel::ReSize() { return RET_OK; } | |||
| int PowerGradCPUKernel::Run() { | |||
| auto dy_addr = reinterpret_cast<float *>(inputs_.at(0)->Data()); | |||
| auto x_addr = reinterpret_cast<float *>(inputs_.at(1)->Data()); | |||
| auto dx_addr = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| auto size = inputs_.at(0)->ElementsNum(); | |||
| auto dy_addr = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto x_addr = reinterpret_cast<float *>(in_tensors_.at(1)->Data()); | |||
| auto dx_addr = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| auto size = in_tensors_.at(0)->ElementsNum(); | |||
| float exp = power_ - 1; | |||
| Power(x_addr, &exp, dx_addr, size, scale_, shift_, true); | |||
| @@ -47,6 +47,7 @@ int PowerGradCPUKernel::Run() { | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *CpuPowerGradFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| @@ -45,6 +45,7 @@ class PowerGradCPUKernel : public LiteKernel { | |||
| float scale_; | |||
| float shift_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_POWER_GRAD_H_ | |||
| @@ -14,6 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include <math.h> | |||
| #include "src/kernel_registry.h" | |||
| #include "nnacl/softmax_parameter.h" | |||
| #include "nnacl/fp32/softmax.h" | |||
| @@ -46,9 +47,10 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const int | |||
| output[0] = total_loss / param->batch_size_; | |||
| } | |||
| void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses, | |||
| void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses, float *grads, | |||
| float *output) const { | |||
| size_t row_start = 0; | |||
| float total_loss = 0; | |||
| for (int i = 0; i < param->batch_size_; ++i) { | |||
| if (labels[i] < 0) { | |||
| MS_LOG(EXCEPTION) << "label value must >= 0"; | |||
| @@ -56,78 +58,88 @@ void SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *la | |||
| size_t label = labels[i]; | |||
| if (label > param->number_of_classes_) { | |||
| MS_LOG(EXCEPTION) << "error label input!"; | |||
| } | |||
| for (size_t j = 0; j < param->number_of_classes_; ++j) { | |||
| size_t index = row_start + j; | |||
| if (j == label) { | |||
| output[index] = (losses[index] - 1) / param->batch_size_; | |||
| } else { | |||
| output[index] = losses[index] / param->batch_size_; | |||
| } else { | |||
| total_loss -= logf(losses[i * param->number_of_classes_ + label]); | |||
| for (size_t j = 0; j < param->number_of_classes_; ++j) { | |||
| size_t index = row_start + j; | |||
| if (j == label) { | |||
| grads[index] = (losses[index] - 1) / param->batch_size_; | |||
| } else { | |||
| grads[index] = losses[index] / param->batch_size_; | |||
| } | |||
| } | |||
| } | |||
| row_start += param->number_of_classes_; | |||
| } | |||
| output[0] = total_loss / param->batch_size_; | |||
| } | |||
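| // Illustrative, self-contained sketch of the gradient computed above, assuming | |||
| // `probs` already holds softmax(logits) row by row (which is what Run() passes | |||
| // in via losses_). Kept under #if 0 as a reference only: | |||
| //   loss    = -(1/B) * sum_i log(probs[i][label_i]) | |||
| //   dlogits =  (1/B) * (probs - one_hot(labels)) | |||
| #if 0 | |||
| static void SparseSoftmaxCeGradSketch(const float *probs, const int *labels, | |||
| int batch, int classes, float *grads) { | |||
| for (int i = 0; i < batch; ++i) { | |||
| for (int j = 0; j < classes; ++j) { | |||
| float g = probs[i * classes + j]; | |||
| if (j == labels[i]) g -= 1.0f;  // subtract one-hot target at the label index | |||
| grads[i * classes + j] = g / batch; | |||
| } | |||
| } | |||
| } | |||
| #endif | |||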
| int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Run() { | |||
| auto ins = reinterpret_cast<float *>(inputs_.at(0)->Data()); | |||
| auto labels = reinterpret_cast<int *>(inputs_.at(1)->Data()); | |||
| auto out = reinterpret_cast<float *>(outputs_.at(1)->Data()); | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return ret; | |||
| } | |||
| auto ins = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto labels = reinterpret_cast<int *>(in_tensors_.at(1)->Data()); | |||
| float *out = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| float *grads = NULL; | |||
| if (is_train()) { // outputs_.size() > 1) | |||
| grads = reinterpret_cast<float *>(outputs_.at(0)->Data()); | |||
| if (is_train() && out_tensors_.size() > 1) { | |||
| grads = reinterpret_cast<float *>(out_tensors_.at(1)->Data()); | |||
| } | |||
| size_t data_size = inputs_.at(0)->ElementsNum(); | |||
| size_t data_size = in_tensors_.at(0)->ElementsNum(); | |||
| float *losses = new (std::nothrow) float[data_size]; | |||
| if (losses == nullptr) { | |||
| MS_LOG(ERROR) << "losses is null"; | |||
| return nullptr; | |||
| return RET_ERROR; | |||
| } | |||
| std::fill(losses, losses + data_size, 0); | |||
| MS_ASSERT(out != nullptr); | |||
| MS_ASSERT(labels != nullptr); | |||
| MS_ASSERT(ins != nullptr); | |||
| SoftmaxParameter sm_params; | |||
| sm_params.n_dim_ = param->n_dim_; | |||
| sm_params.element_size_ = data_size; | |||
| sm_params.axis_ = 0; | |||
| for (int i = 0; i < 4; i++) // softmax has only 4 params in shape | |||
| sm_params.input_shape_[i] = param->input_shape_[i]; | |||
| float sum_data[sm_params.input_shape_[sm_params.axis_]] = {0}; | |||
| std::fill(sum_data, sum_data + sm_params.input_shape_[sm_params.axis_], 0); | |||
| Softmax(ins, losses, sum_data, &sm_params); | |||
| std::fill(losses_, losses_ + data_size, 0); | |||
| std::fill(sum_data_, sum_data_ + sm_params_.input_shape_[0], 0); | |||
| Softmax(ins, losses_, sum_data_, &sm_params_); | |||
| if (is_train()) { | |||
| GradPostExecute(labels, losses, grads); | |||
| } else { | |||
| ForwardPostExecute(labels, losses, out); | |||
| GradPostExecute(labels, losses_, grads, out); | |||
| } else if (out != nullptr) { | |||
| ForwardPostExecute(labels, losses_, out); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Init() { | |||
| if (context_->infer_shape_interrupt_ && !context_->running_) { | |||
| SetNeedReInit(); | |||
| return RET_OK; | |||
| } | |||
| auto dims = inputs_[0]->shape(); | |||
| // if (context_ && context_->infer_shape_interrupt_ && !context_->running_) { | |||
| // set_need_reinit(); | |||
| // return RET_OK; | |||
| // } | |||
| auto dims = in_tensors_[0]->shape(); | |||
| param->n_dim_ = 2; | |||
| param->number_of_classes_ = dims[1]; | |||
| param->batch_size_ = dims[0]; | |||
| for (unsigned int i = 0; i < dims.size(); i++) param->input_shape_[i] = dims[i]; | |||
| if (2 != this->inputs_.size()) { | |||
| if (2 != this->in_tensors_.size()) { | |||
| MS_LOG(ERROR) << "softmax entropy loss should have two inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs_.front(); | |||
| auto *in0 = in_tensors_.front(); | |||
| if (in0 == nullptr) { | |||
| MS_LOG(ERROR) << "softmax etropy loss in0 have no data"; | |||
| return RET_ERROR; | |||
| } | |||
| size_t data_size = in_tensors_.at(0)->ElementsNum(); | |||
| losses_ = new (std::nothrow) float[data_size]; | |||
| sum_data_ = new (std::nothrow) float[dims[0]]; | |||
| MS_ASSERT(losses_ != nullptr); | |||
| MS_ASSERT(sum_data_ != nullptr); | |||
| sm_params_.n_dim_ = 2; | |||
| sm_params_.element_size_ = data_size; | |||
| sm_params_.axis_ = 1; | |||
| for (size_t i = 0; i < dims.size(); i++) sm_params_.input_shape_[i] = dims[i]; | |||
| return RET_OK; | |||
| } | |||
| @@ -14,31 +14,32 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "src/train/loss_kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "nnacl/fp32_grad/softmax_grad.h" | |||
| #include "nnacl/fp32/arithmetic.h" | |||
| #include "nnacl/softmax_parameter.h" | |||
| namespace mindspore::kernel { | |||
| class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LiteKernel { | |||
| class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LossKernel { | |||
| public: | |||
| explicit SparseSoftmaxCrossEntropyWithLogitsCPUKernel(OpParameter *parameter, | |||
| const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| const lite::Context *ctx, | |||
| const mindspore::lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| : LossKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| param = reinterpret_cast<SoftmaxCrossEntropyParameter *>(parameter); | |||
| } | |||
| ~SparseSoftmaxCrossEntropyWithLogitsCPUKernel() override = default; | |||
| ~SparseSoftmaxCrossEntropyWithLogitsCPUKernel() override { delete[] losses_; delete[] sum_data_; } | |||
| void ForwardPostExecute(const int *labels, const float *losses, float *output) const; | |||
| void GradPostExecute(const int *labels, const float *losses, float *output) const; | |||
| void GradPostExecute(const int *labels, const float *losses, float* grads, float *output) const; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -46,7 +47,11 @@ class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LiteKernel { | |||
| private: | |||
| SoftmaxCrossEntropyParameter *param; | |||
| SoftmaxParameter sm_params_; | |||
| float *losses_ = nullptr; | |||
| float *sum_data_ = nullptr; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_SPARSE_SOFTMAX_CROSS_ENTROPY_WITH_LOGITS_H_ | |||
| @@ -0,0 +1,72 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <vector> | |||
| #include "src/runtime/kernel/arm/fp32_grad/tuple_getitem.h" | |||
| #include "schema/model_generated.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "include/errorcode.h" | |||
| using mindspore::kernel::KERNEL_ARCH::kCPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_TupleGetItem; | |||
| namespace mindspore::kernel { | |||
| int TupleGetItemCPUKernel::Init() { | |||
| return RET_OK; | |||
| } | |||
| int TupleGetItemCPUKernel::ReSize() { return 0; } | |||
| int TupleGetItemCPUKernel::Run() { | |||
| auto ret = Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Prepare failed."; | |||
| return RET_ERROR; | |||
| } | |||
| auto in = reinterpret_cast<float *>(in_tensors_.at(0)->Data()); | |||
| auto out = reinterpret_cast<float *>(out_tensors_.at(0)->Data()); | |||
| memcpy(out, in, in_tensors_.at(0)->Size()); | |||
| return RET_OK; | |||
| } | |||
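| // As implemented, TupleGetItem simply forwards input 0 to output 0 via memcpy, | |||
| // i.e. it acts as an identity on the first element of the tuple. | |||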
| kernel::LiteKernel *CpuTupleGetItemFp32KernelCreator(const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::Context *ctx, | |||
| const kernel::KernelKey &desc, const lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_TupleGetItem); | |||
| auto *kernel = | |||
| new (std::nothrow) TupleGetItemCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| MS_ASSERT(kernel != nullptr); | |||
| auto ret = kernel->Init(); | |||
| if (RET_OK != ret) { | |||
| MS_LOG(ERROR) << "Init kernel failed, name: " << opParameter->name_ << ", type: " | |||
| << schema::EnumNamePrimitiveType(static_cast<schema::PrimitiveType>(opParameter->type_)); | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_TupleGetItem, CpuTupleGetItemFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,46 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_TUPLE_GETITEM_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_TUPLE_GETITEM_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| #include "ir/anf.h" | |||
| #include "nnacl/fp32/arithmetic.h" | |||
| namespace mindspore::kernel { | |||
| class TupleGetItemCPUKernel : public LiteKernel { | |||
| public: | |||
| explicit TupleGetItemCPUKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx, | |||
| const lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) { | |||
| param = parameter; | |||
| } | |||
| ~TupleGetItemCPUKernel() override = default; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| int Run() override; | |||
| private: | |||
| OpParameter *param; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_GRAD_TUPLE_GETITEM_H_ | |||
| @@ -94,8 +94,10 @@ int Scheduler::InferShape(const lite::Model *model, std::vector<tensor::Tensor * | |||
| inputs.emplace_back(tensors->at(size_t(inIndexes->GetAs<uint32_t>(j)))); | |||
| } | |||
| auto outIndexes = cNode->outputIndex(); | |||
| for (size_t j = 0; j < outIndexes->size(); j++) { | |||
| outputs.emplace_back(tensors->at(size_t(outIndexes->GetAs<uint32_t>(j)))); | |||
| if (outIndexes != nullptr) { | |||
| for (size_t j = 0; j < outIndexes->size(); j++) { | |||
| outputs.emplace_back(tensors->at(size_t(outIndexes->GetAs<uint32_t>(j)))); | |||
| } | |||
| } | |||
| auto *primitive = model->GetOp(cNode->name()->str()); | |||
| if (primitive == nullptr) { | |||
| @@ -0,0 +1,34 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_TRAIN_LOSS_KERNEL_H_ | |||
| #define MINDSPORE_LITE_SRC_TRAIN_LOSS_KERNEL_H_ | |||
| #include <vector> | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore::kernel { | |||
| class LossKernel : public LiteKernel { | |||
| public: | |||
| LossKernel() = default; | |||
| explicit LossKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs, | |||
| const std::vector<lite::tensor::Tensor *> &outputs, | |||
| const lite::Context *ctx, | |||
| const lite::PrimitiveC *primitive) | |||
| : LiteKernel(parameter, inputs, outputs, ctx, primitive) {} | |||
| ~LossKernel() = default; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_TRAIN_LOSS_KERNEL_H_ | |||
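| // LossKernel is an empty marker base class: TrainSession uses | |||
| // dynamic_cast<const kernel::LossKernel *>(kernel) to locate the loss ops, | |||
| // both to cut the kernel list down to its inference-only prefix in RunGraph() | |||
| // and to expose loss tensors as outputs in train()/eval() (see train_session.cc | |||
| // below). | |||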
| @@ -0,0 +1,250 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/populate_parameter.h" | |||
| #include "src/train/train_populate_parameter.h" | |||
| #include "src/ops/pooling_grad.h" | |||
| #include "nnacl/pooling_parameter.h" | |||
| #include "src/ops/softmax_cross_entropy.h" | |||
| #include "nnacl/fp32_grad/softmax_grad.h" | |||
| #include "src/ops/activation_grad.h" | |||
| #include "nnacl/fp32/activation.h" | |||
| #include "src/ops/conv2d_grad_filter.h" | |||
| #include "src/ops/conv2d_grad_input.h" | |||
| #include "nnacl/conv_parameter.h" | |||
| #include "src/ops/power_grad.h" | |||
| #include "nnacl/power_parameter.h" | |||
| namespace mindspore::kernel { | |||
| OpParameter *DefaultPopulateParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| OpParameter *param = new (std::nothrow) OpParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for primitive failed."; | |||
| return nullptr; | |||
| } | |||
| param->type_ = primitive->Type(); | |||
| return param; | |||
| } | |||
| OpParameter *PopulateSoftmaxCrossEntropyParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| SoftmaxCrossEntropyParameter *sce_param = new (std::nothrow) SoftmaxCrossEntropyParameter(); | |||
| if (sce_param == nullptr) { | |||
| MS_LOG(ERROR) << "new SoftmaxCrossEntropyParameter failed."; | |||
| return nullptr; | |||
| } | |||
| sce_param->op_parameter_.type_ = primitive->Type(); | |||
| return reinterpret_cast<OpParameter *>(sce_param); | |||
| } | |||
| OpParameter *PopulatePoolingGradParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| PoolingParameter *pooling_param = new (std::nothrow) PoolingParameter(); | |||
| if (pooling_param == nullptr) { | |||
| MS_LOG(ERROR) << "new PoolingParameter failed."; | |||
| return nullptr; | |||
| } | |||
| pooling_param->op_parameter_.type_ = primitive->Type(); | |||
| auto pooling_primitive = | |||
| reinterpret_cast<mindspore::lite::PoolingGrad *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| pooling_param->global_ = pooling_primitive->GetGlobal(); | |||
| pooling_param->window_w_ = pooling_primitive->GetWindowW(); | |||
| pooling_param->window_h_ = pooling_primitive->GetWindowH(); | |||
| pooling_param->pad_u_ = pooling_primitive->GetPadUp(); | |||
| pooling_param->pad_d_ = pooling_primitive->GetPadDown(); | |||
| pooling_param->pad_l_ = pooling_primitive->GetPadLeft(); | |||
| pooling_param->pad_r_ = pooling_primitive->GetPadRight(); | |||
| pooling_param->stride_w_ = pooling_primitive->GetStrideW(); | |||
| pooling_param->stride_h_ = pooling_primitive->GetStrideH(); | |||
| pooling_param->pool_mode_ = PoolMode_No; | |||
| pooling_param->round_mode_ = RoundMode_No; | |||
| switch (pooling_primitive->GetPoolingMode()) { | |||
| case schema::PoolMode_MAX_POOLING: | |||
| pooling_param->pool_mode_ = PoolMode_MaxPool; | |||
| break; | |||
| case schema::PoolMode_MEAN_POOLING: | |||
| pooling_param->pool_mode_ = PoolMode_AvgPool; | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| switch (pooling_primitive->GetRoundMode()) { | |||
| case schema::RoundMode_FLOOR: | |||
| pooling_param->round_mode_ = RoundMode_Floor; | |||
| break; | |||
| case schema::RoundMode_CEIL: | |||
| pooling_param->round_mode_ = RoundMode_Ceil; | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(pooling_param); | |||
| } | |||
| OpParameter *PopulateActivationGradParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| ActivationParameter *act_param = new (std::nothrow) ActivationParameter(); | |||
| if (act_param == nullptr) { | |||
| MS_LOG(ERROR) << "new ActivationParameter failed."; | |||
| return nullptr; | |||
| } | |||
| act_param->op_parameter_.type_ = primitive->Type(); | |||
| auto activation = | |||
| reinterpret_cast<mindspore::lite::ActivationGrad *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| act_param->type_ = static_cast<int>(activation->GetType()); | |||
| act_param->alpha_ = activation->GetAlpha(); | |||
| return reinterpret_cast<OpParameter *>(act_param); | |||
| } | |||
| OpParameter *PopulateConvolutionGradFilterParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| ConvParameter *param = new (std::nothrow) ConvParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for conv grad filter failed."; | |||
| return nullptr; | |||
| } | |||
| param->op_parameter_.type_ = primitive->Type(); | |||
| auto convg_primitive = | |||
| reinterpret_cast<mindspore::lite::Conv2DGradFilter *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| param->kernel_h_ = convg_primitive->GetKernelH(); | |||
| param->kernel_w_ = convg_primitive->GetKernelW(); | |||
| param->stride_h_ = convg_primitive->GetStrideH(); | |||
| param->stride_w_ = convg_primitive->GetStrideW(); | |||
| param->dilation_h_ = convg_primitive->GetDilateH(); | |||
| param->dilation_w_ = convg_primitive->GetDilateW(); | |||
| param->pad_u_ = convg_primitive->GetPadUp(); | |||
| param->pad_d_ = convg_primitive->GetPadDown(); | |||
| param->pad_l_ = convg_primitive->GetPadLeft(); | |||
| param->pad_r_ = convg_primitive->GetPadRight(); | |||
| param->group_ = convg_primitive->GetGroup(); | |||
| param->act_type_ = ActType_No; | |||
| switch (convg_primitive->GetActivationType()) { | |||
| case schema::ActivationType_RELU: | |||
| param->act_type_ = ActType_Relu; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| param->act_type_ = ActType_Relu6; | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| OpParameter *PopulateConvolutionGradInputParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| ConvParameter *param = new (std::nothrow) ConvParameter(); | |||
| if (param == nullptr) { | |||
| MS_LOG(ERROR) << "new Param for conv grad filter failed."; | |||
| return nullptr; | |||
| } | |||
| param->op_parameter_.type_ = primitive->Type(); | |||
| auto convg_primitive = | |||
| reinterpret_cast<mindspore::lite::Conv2DGradInput *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| param->kernel_h_ = convg_primitive->GetKernelH(); | |||
| param->kernel_w_ = convg_primitive->GetKernelW(); | |||
| param->stride_h_ = convg_primitive->GetStrideH(); | |||
| param->stride_w_ = convg_primitive->GetStrideW(); | |||
| param->dilation_h_ = convg_primitive->GetDilateH(); | |||
| param->dilation_w_ = convg_primitive->GetDilateW(); | |||
| param->pad_u_ = convg_primitive->GetPadUp(); | |||
| param->pad_d_ = convg_primitive->GetPadDown(); | |||
| param->pad_l_ = convg_primitive->GetPadLeft(); | |||
| param->pad_r_ = convg_primitive->GetPadRight(); | |||
| param->group_ = convg_primitive->GetGroup(); | |||
| param->act_type_ = ActType_No; | |||
| switch (convg_primitive->GetActivationType()) { | |||
| case schema::ActivationType_RELU: | |||
| param->act_type_ = ActType_Relu; | |||
| break; | |||
| case schema::ActivationType_RELU6: | |||
| param->act_type_ = ActType_Relu6; | |||
| break; | |||
| default: | |||
| break; | |||
| } | |||
| return reinterpret_cast<OpParameter *>(param); | |||
| } | |||
| OpParameter *PopulatePowerGradParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| PowerParameter *power_param = new (std::nothrow) PowerParameter(); | |||
| if (power_param == nullptr) { | |||
| MS_LOG(ERROR) << "new PowerParameter failed."; | |||
| return nullptr; | |||
| } | |||
| power_param->op_parameter_.type_ = primitive->Type(); | |||
| auto power = reinterpret_cast<mindspore::lite::PowerGrad *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| power_param->power_ = power->GetPower(); | |||
| power_param->scale_ = power->GetScale(); | |||
| power_param->shift_ = power->GetShift(); | |||
| return reinterpret_cast<OpParameter *>(power_param); | |||
| } | |||
| void PopulateTrainParameters() { | |||
| auto ppr = PopulateParameterRegistry::GetInstance(); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_ApplyMomentum, DefaultPopulateParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_BiasGrad, PopulateArithmetic); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_SoftmaxCrossEntropy, PopulateSoftmaxCrossEntropyParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_ActivationGrad, PopulateActivationGradParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_TupleGetItem, DefaultPopulateParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_Depend, DefaultPopulateParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_BNGrad, DefaultPopulateParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_Conv2DGradFilter, PopulateConvolutionGradFilterParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_Conv2DGradInput, PopulateConvolutionGradInputParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_PoolingGrad, PopulatePoolingGradParameter); | |||
| ppr->AddPopulateParameterFunc(schema::PrimitiveType_PowerGrad, PopulatePowerGradParameter); | |||
| } | |||
| } // namespace mindspore::kernel | |||
| @@ -0,0 +1,28 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_POPULATE_PARAMETER_H_ | |||
| #define MINDSPORE_LITE_SRC_TRAIN_TRAIN_POPULATE_PARAMETER_H_ | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore::kernel { | |||
| void PopulateTrainParameters(); | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_TRAIN_TRAIN_POPULATE_PARAMETER_H_ | |||
| @@ -0,0 +1,136 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "include/train_session.h" | |||
| #include <algorithm> | |||
| #include "utils/log_adapter.h" | |||
| #include "include/context.h" | |||
| #include "src/common/utils.h" | |||
| #include "mindspore/lite/src/ir/tensor.h" | |||
| #include "src/train/loss_kernel.h" | |||
| #include "src/train/train_populate_parameter.h" | |||
| #include "src/runtime/runtime_api.h" | |||
| #include "src/executor.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/convolution.h" | |||
| namespace mindspore::session { | |||
| TrainSession::TrainSession() { kernel::PopulateTrainParameters(); } | |||
| void TrainSession::ReplaceOps() { | |||
| mindspore::lite::KernelRegistrar tmp(mindspore::kernel::KERNEL_ARCH::kCPU, kNumberTypeFloat32, | |||
| mindspore::schema::PrimitiveType_Conv2D, | |||
| mindspore::kernel::CpuConvTrainFp32KernelCreator); | |||
| } | |||
| int TrainSession::CompileGraph(lite::Model *model) { | |||
| model_ = model; | |||
| ReplaceOps(); | |||
| return LiteSession::CompileGraph(model); | |||
| } | |||
| void* TrainSession::ExportToBuf(void* buf, size_t *len) const { | |||
| // auto train_model_impl = (dynamic_cast<lite::train::TrainModelImpl*>(model_->model_impl())); | |||
| // return train_model_impl->ExportToBuf(buf, len); | |||
| return nullptr; | |||
| } | |||
| int TrainSession::RunGraph(const session::KernelCallBack &before, const session::KernelCallBack &after) { | |||
| auto ms_output_tensors = GetOutputs(); | |||
| this->outputs_.clear(); | |||
| for (auto ms_tensors : ms_output_tensors) | |||
| for (auto ms_tensor : ms_tensors.second) | |||
| this->outputs_.push_back((dynamic_cast<lite::tensor::LiteTensor*>(ms_tensor))->tensor()); | |||
| if (train_mode_) | |||
| return LiteSession::RunGraph(before, after); | |||
| // this object is expected to run only the inference part of the graph | |||
| // prepare a list of kernels up to the loss function -- temporary solution | |||
| std::vector<kernel::LiteKernel *> inference_kernels; | |||
| for (auto kernel : this->kernels_) { | |||
| if (dynamic_cast<const kernel::LossKernel*>(kernel) != nullptr) | |||
| break; | |||
| inference_kernels.push_back(kernel); | |||
| } | |||
| MS_EXCEPTION_IF_NULL(this->context_); | |||
| // TODO(Emir) | |||
| // SetMaxWokerNum(context_->thread_num_); | |||
| // context_->running_ = true; | |||
| lite::Executor executor; | |||
| if (before == nullptr && after == nullptr) { | |||
| return executor.Run(this->inputs_, this->outputs_, inference_kernels, this->context_->allocator.get()); | |||
| } else { | |||
| return executor.Run(this->inputs_, this->outputs_, inference_kernels, this->context_->allocator.get(), | |||
| before, after); | |||
| } | |||
| } | |||
| void TrainSession::train() { | |||
| for (auto *kernel : kernels_) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| kernel->train(); | |||
| } | |||
| train_mode_ = true; | |||
| ext_output_map_.clear(); | |||
| for (auto kernel : this->kernels_) { | |||
| if (dynamic_cast<const kernel::LossKernel*>(kernel) != nullptr) { | |||
| auto *ms_tensor = new lite::tensor::LiteTensor(kernel->out_tensors().at(0)); | |||
| ext_output_map_[kernel->name()].emplace_back(ms_tensor); | |||
| } | |||
| } | |||
| } | |||
| void TrainSession::eval() { | |||
| for (auto *kernel : kernels_) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| kernel->eval(); | |||
| } | |||
| train_mode_ = false; | |||
| kernel::LiteKernel* last_kernel = nullptr; | |||
| // We should walk in_kernels and collect all of the last (pre-loss) kernels | |||
| ext_output_map_ = output_node_map_; | |||
| for (auto kernel : this->kernels_) { | |||
| if ((dynamic_cast<const kernel::LossKernel*>(kernel) != nullptr) && | |||
| (last_kernel != nullptr)) { | |||
| auto *ms_tensor = new lite::tensor::LiteTensor(last_kernel->out_tensors().at(0)); | |||
| ext_output_map_[last_kernel->name()].emplace_back(ms_tensor); | |||
| } | |||
| last_kernel = kernel; | |||
| } | |||
| } | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> TrainSession::GetOutputs() const { | |||
| return ext_output_map_; | |||
| } | |||
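| // Looks a node up first among the regular graph outputs, then falls back to the train/eval output map filled above. | |||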
| std::vector<tensor::MSTensor *> TrainSession::GetOutputsByName(const std::string &name) const { | |||
| auto ret_vect = LiteSession::GetOutputsByNodeName(name); // TODO(emir): GetOutputsByTensorName? | |||
| if (ret_vect.size() > 0) | |||
| return ret_vect; | |||
| auto ret = ext_output_map_.find(name); | |||
| if (ret == ext_output_map_.end()) { | |||
| MS_LOG(WARNING) << "Node " << name << " is not an output node"; | |||
| std::vector<mindspore::tensor::MSTensor *> empty_ret; | |||
| return empty_ret; | |||
| } | |||
| return ret->second; | |||
| } | |||
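| // Typical usage, sketched from the unit tests below (the `context` and `model` names are illustrative): | |||
| //   auto session = new session::TrainSession(); | |||
| //   session->Init(context); | |||
| //   session->CompileGraph(model); | |||
| //   session->train();                     // or session->eval() for inference-only runs | |||
| //   session->RunGraph(); | |||
| //   auto outputs = session->GetOutputs(); // loss tensors in train mode, predictions in eval mode | |||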
| } // namespace mindspore::session | |||
| @@ -259,6 +259,10 @@ endif() | |||
| if (SUPPORT_TRAIN) | |||
| set(TEST_LITE_SRC | |||
| ${TEST_LITE_SRC} | |||
| # ${LITE_DIR}/src/train/ops/train_ops.cc | |||
| ${LITE_DIR}/src/train/train_populate_parameter.cc | |||
| ${LITE_DIR}/src/train/train_session.cc | |||
| ${LITE_DIR}/src/lite_session.cc | |||
| # ${SRC_DIR}/common/trans.cc | |||
| # ${SRC_DIR}/common/lite/trans_extends.cc | |||
| # ${SRC_DIR}/kernel/kernel_build_info.cc | |||
| @@ -25,9 +25,10 @@ | |||
| #include "mindspore/lite/src/ir/tensor.h" | |||
| #include "mindspore/lite/src/lite_kernel.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.h" | |||
| #include "nnacl/fp32_grad/activation_grad.h" | |||
| namespace mindspore { | |||
| class TestActGradFp32 : public mindspore::CommonTest { | |||
| public: | |||
| TestActGradFp32() {} | |||
| }; | |||
| @@ -41,13 +42,14 @@ TEST_F(TestActGradFp32, ReluGradFp32) { | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/activationGrad/relu_y_50.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| EXPECT_EQ(input_size, output_data_size * sizeof(float)); | |||
| std::string yt_path = "./test_data/activationGrad/relu_yt_50.bin"; | |||
| auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size)); | |||
| EXPECT_EQ(input_size, output_data_size * sizeof(float)); | |||
| auto output_data = new float[output_data_size]; | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| ReluGrad(yt_data, input_data, 50, output_data); | |||
| ReluGrad(yt_data, input_data, output_data_size, output_data); | |||
| } | |||
| int loop_count = 100; | |||
| @@ -72,9 +74,9 @@ TEST_F(TestActGradFp32, ReluGradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "ReluGradFp32 passed"; | |||
| } | |||
| @@ -118,9 +120,9 @@ TEST_F(TestActGradFp32, Relu6GradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "Relu6GradFp32 passed"; | |||
| } | |||
| @@ -164,9 +166,9 @@ TEST_F(TestActGradFp32, LReluGradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "LReluGradFp32 passed"; | |||
| } | |||
| @@ -211,9 +213,9 @@ TEST_F(TestActGradFp32, SigmoidGradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| // lite::CompareOutput(output_data, output_path); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "SigmoidGradFp32 passed"; | |||
| } | |||
| @@ -257,9 +259,9 @@ TEST_F(TestActGradFp32, tanhGradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "TanhGradFp32 passed"; | |||
| } | |||
| @@ -267,24 +269,25 @@ TEST_F(TestActGradFp32, hswishGradFp32) { | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| uint64_t time_avg = 0; | |||
| size_t output_data_size = 50; | |||
| const size_t output_data_size = 10; | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/activationGrad/hswish_x_50.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| EXPECT_EQ(input_size, output_data_size * sizeof(float)); | |||
| std::string yt_path = "./test_data/activationGrad/hswish_yt_50.bin"; | |||
| auto yt_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(yt_path.c_str(), &input_size)); | |||
| EXPECT_EQ(input_size, output_data_size * sizeof(float)); | |||
| auto output_data = new float[output_data_size]; | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| HSwishGrad(yt_data, input_data, 50, output_data); | |||
| HSwishGrad(yt_data, input_data, static_cast<int>(output_data_size), output_data); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| HSwishGrad(yt_data, input_data, 50, output_data); | |||
| HSwishGrad(yt_data, input_data, output_data_size, output_data); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| @@ -292,7 +295,7 @@ TEST_F(TestActGradFp32, hswishGradFp32) { | |||
| printf("single thread running time : %f ms\n", time_avg / 1000.0f); | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| for (size_t i = 0; i < std::min(output_data_size, static_cast<size_t>(20)); i++) { | |||
| std::cout << output_data[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| @@ -302,9 +305,9 @@ TEST_F(TestActGradFp32, hswishGradFp32) { | |||
| EXPECT_EQ(res, 0); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete yt_data; | |||
| delete[] yt_data; | |||
| MS_LOG(INFO) << "hswishGradFp32 passed"; | |||
| } | |||
| @@ -106,9 +106,14 @@ TEST_F(TestArithmeticGradFp32, TestAddGradFp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // delete all_tensors; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestAddGradFp32 passed"; | |||
| } | |||
| @@ -137,9 +142,14 @@ TEST_F(TestArithmeticGradFp32, TestAddGrad2Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_1_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; //TODO tensor data is unique pointer | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestAddGrad2Fp32 passed"; | |||
| } | |||
| @@ -169,8 +179,14 @@ TEST_F(TestArithmeticGradFp32, TestAddGrad3Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_8_dx1_5_4_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestAddGrad3Fp32 passed"; | |||
| } | |||
| @@ -200,8 +216,14 @@ TEST_F(TestArithmeticGradFp32, TestSubGradFp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_2_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestSubGradFp32 passed"; | |||
| } | |||
| @@ -231,8 +253,12 @@ TEST_F(TestArithmeticGradFp32, TestSubGrad2Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_3_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestSubGrad2Fp32 passed"; | |||
| } | |||
| @@ -271,9 +297,13 @@ TEST_F(TestArithmeticGradFp32, TestMulGradFp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete kernel_obj; | |||
| // delete param; | |||
| MS_LOG(INFO) << "TestMulGradFp32 passed"; | |||
| } | |||
| @@ -302,9 +332,14 @@ TEST_F(TestArithmeticGradFp32, TestMulGrad2Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_4_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestMulGrad2Fp32 passed"; | |||
| } | |||
| @@ -333,9 +368,14 @@ TEST_F(TestArithmeticGradFp32, TestMulGrad3Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestMulGrad3Fp32 passed"; | |||
| } | |||
| @@ -364,9 +404,14 @@ TEST_F(TestArithmeticGradFp32, TestMulGrad4Fp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_9_dx2_5_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestMulGrad4Fp32 passed"; | |||
| } | |||
| @@ -395,9 +440,14 @@ TEST_F(TestArithmeticGradFp32, TestDivGradFp32) { | |||
| std::string dx2_path = "./test_data/operators/arithmetic_fp32_5_dx2_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, dx2_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete kernel_obj; | |||
| // delete param; | |||
| MS_LOG(INFO) << "TestDivGradFp32 passed"; | |||
| } | |||
| @@ -427,8 +477,14 @@ TEST_F(TestArithmeticGradFp32, TestDivGrad2Fp32) { | |||
| std::string output_path = "./test_data/operators/arithmetic_fp32_6_dx1_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestDivGrad2Fp32 passed"; | |||
| } | |||
| @@ -457,9 +513,14 @@ TEST_F(TestArithmeticGradFp32, TestDivGrad3Fp32) { | |||
| std::string output_path = "./test_data/operators/arithmetic_fp32_10_dx2_5_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| // for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| // delete param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestDivGrad3Fp32 passed"; | |||
| } | |||
| @@ -488,9 +549,12 @@ TEST_F(TestArithmeticGradFp32, Test3DDivGrad2Fp32) { | |||
| std::string output_path = "./test_data/operators/arithmetic_fp32_7_dx2_1_1_6.bin"; | |||
| EXPECT_EQ(0, lite::CompareRelativeOutput(output_ptr, output_path)); | |||
| for (int i = 0; i < 5; i++) delete all_tensors[i]; | |||
| delete param; | |||
| for (auto tensor : all_tensors) { | |||
| delete[] reinterpret_cast<float *>(tensor->Data()); | |||
| tensor->SetData(nullptr); | |||
| delete tensor; | |||
| } | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestDivGrad2Fp32 passed"; | |||
| } | |||
| @@ -18,8 +18,8 @@ | |||
| #include "utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp32_grad/bias_grad.h" | |||
| #include "mindspore/lite/src/kernel_registry.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/bias_grad.h" | |||
| #include "src/kernel_registry.h" | |||
| namespace mindspore { | |||
| @@ -40,9 +40,8 @@ TEST_F(TestBiasGradFp32, BiasGradFp32) { | |||
| dy_tensor.SetData(input_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor}; | |||
| auto output_data = new float[7]; | |||
| std::vector<int> dim_dw({7}); | |||
| std::vector<int> dim_dw = {7}; | |||
| lite::tensor::Tensor dw_tensor(TypeId::kNumberTypeFloat32, dim_dw); | |||
| dw_tensor.SetData(output_data); | |||
| std::vector<lite::tensor::Tensor *> outputs = {&dw_tensor}; | |||
| @@ -62,9 +61,12 @@ TEST_F(TestBiasGradFp32, BiasGradFp32) { | |||
| std::string output_path = "./test_data/operators/biasgradfp32_1_db_7.bin"; | |||
| lite::CompareOutput(output_data, output_path); | |||
| // delete input_data; | |||
| // delete[] output_data; | |||
| delete bias_param; | |||
| delete [] input_data; | |||
| delete[] output_data; | |||
| // delete bias_param; | |||
| dy_tensor.SetData(nullptr); | |||
| dw_tensor.SetData(nullptr); | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "BiasGradFp32 passed"; | |||
| } | |||
| @@ -0,0 +1,111 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <iostream> | |||
| #include <memory> | |||
| #include "utils/log_adapter.h" | |||
| #include "common/common_test.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/file_utils_ext.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/bn_grad.h" | |||
| #include "nnacl/fp32_grad/batch_norm.h" | |||
| #include "src/kernel_registry.h" | |||
| namespace mindspore { | |||
| class TestBNGradFp32 : public mindspore::CommonTest { | |||
| public: | |||
| TestBNGradFp32() {} | |||
| lite::tensor::Tensor *CreateInTensor(std::string file_name, std::vector<int> dim); | |||
| }; | |||
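| // Reads a binary blob from file_name into a newly allocated float32 tensor with the given shape. | |||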
| lite::tensor::Tensor *TestBNGradFp32::CreateInTensor(std::string file_name, std::vector<int> dim) { | |||
| size_t input_size = 0; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_name.c_str(), &input_size)); | |||
| auto tensor = new lite::tensor::Tensor(TypeId::kNumberTypeFloat32, dim); | |||
| tensor->SetData(input_data); | |||
| EXPECT_EQ(input_size, tensor->Size()); | |||
| return tensor; | |||
| } | |||
| TEST_F(TestBNGradFp32, BNGradFp32) { | |||
| // prepare stage | |||
| auto bn_param = new BNGradParameter(); | |||
| bn_param->epsilon_ = 0.00001; | |||
| bn_param->momentum_ = 0.1; | |||
| const int batch = 2; | |||
| const int channels = 3; | |||
| const int height = 4; | |||
| const int width = 5; | |||
| auto dy_tensor = CreateInTensor("./test_data/bngrad/dy_2_4_5_3.bin", {batch, height, width, channels}); | |||
| auto x_tensor = CreateInTensor("./test_data/bngrad/input_x_2_4_5_3.bin", {batch, height, width, channels}); | |||
| auto scale_tensor = CreateInTensor("./test_data/bngrad/scale_3.bin", {1, 1, 1, channels}); | |||
| auto mean_tensor = CreateInTensor("./test_data/bngrad/save_mean_3.bin", {1, 1, 1, channels}); | |||
| auto var_tensor = CreateInTensor("./test_data/bngrad/save_var_3.bin", {1, 1, 1, channels}); | |||
| // prepare output tensors | |||
| lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, {batch, height, width, channels}); | |||
| dx_tensor.MallocData(); | |||
| lite::tensor::Tensor dscale_tensor(TypeId::kNumberTypeFloat32, {1, 1, 1, channels}); | |||
| dscale_tensor.MallocData(); | |||
| lite::tensor::Tensor dbias_tensor(TypeId::kNumberTypeFloat32, {1, 1, 1, channels}); | |||
| dbias_tensor.MallocData(); | |||
| std::vector<lite::tensor::Tensor *> inputs = {dy_tensor, x_tensor, scale_tensor, mean_tensor, var_tensor}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor, &dscale_tensor, &dbias_tensor}; | |||
| kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_BNGrad}; | |||
| auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); | |||
| auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(bn_param), NULL, desc, nullptr); | |||
| for (int i = 0; i < 3; i++) { | |||
| kernel_obj->Run(); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| kernel_obj->Run(); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| auto time_avg = cost / loop_count; | |||
| std::cout << "single thread running time : " << time_avg << "us\n"; | |||
| std::cout << "==========dx==========\n"; | |||
| auto dx = reinterpret_cast<float *>(outputs[0]->Data()); | |||
| for (int i = 0; i < 7; i++) std::cout << dx[i] << " "; | |||
| std::cout << "\n=======dscale=======\n"; | |||
| auto dscale = reinterpret_cast<float *>(outputs[1]->Data()); | |||
| for (int i = 0; i < channels; i++) std::cout << dscale[i] << " "; | |||
| std::cout << "\n"; | |||
| int res = mindspore::lite::CompareRelativeOutput(dscale, "./test_data/bngrad/output_dscale_3.bin"); | |||
| EXPECT_EQ(res, 0); | |||
| std::cout << "==========dbias==========\n"; | |||
| auto dbias = reinterpret_cast<float *>(outputs[2]->Data()); | |||
| for (int i = 0; i < 3; i++) std::cout << dbias[i] << " "; | |||
| std::cout << "\n"; | |||
| res = mindspore::lite::CompareRelativeOutput(dbias, "./test_data/bngrad/output_dbias_3.bin");  // was comparing dscale twice; dbias reference file assumed | |||
| EXPECT_EQ(res, 0); | |||
| for (auto v : inputs) { | |||
| delete[] reinterpret_cast<float *>(v->Data()); | |||
| v->SetData(nullptr); | |||
| // delete v; | |||
| } | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "BNGradFp32 passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -21,6 +21,7 @@ | |||
| #include "common/common_test.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/file_utils_ext.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp32_grad/convolution.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp32_grad/convolution_grad_filter.h" | |||
| #include "mindspore/lite/src/runtime/kernel/arm/fp32_grad/convolution_grad_input.h" | |||
| #include "mindspore/lite/nnacl/conv_parameter.h" | |||
| @@ -130,11 +131,14 @@ TEST_F(TestConvolutionGradFp32, ConvFp32FilterGrad) { | |||
| EXPECT_EQ(res, 0); | |||
| // delete input_data; | |||
| // delete dy_data; | |||
| // delete [] dw_data; | |||
| delete [] input_data; | |||
| delete [] dy_data; | |||
| delete [] dw_data; | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| dw_tensor.SetData(nullptr); | |||
| x_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Filter Grad passed"; | |||
| } | |||
| @@ -193,9 +197,15 @@ TEST_F(TestConvolutionGradFp32, ConvFp32InputGrad) { | |||
| std::string output_path = "./test_data/conv/convfp32_dx_1_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(dx_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete [] dx_data; | |||
| delete [] w_data; | |||
| delete [] dy_data; | |||
| w_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| dx_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Input Grad passed"; | |||
| } | |||
| @@ -254,11 +264,14 @@ TEST_F(TestConvolutionGradFp32, ConvFp32GroupFilterGrad) { | |||
| auto res = lite::CompareRelativeOutput(dw_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| // delete input_data; | |||
| // delete dy_data; | |||
| // delete [] dw_data; | |||
| delete [] input_data; | |||
| delete [] dy_data; | |||
| delete [] dw_data; | |||
| dw_tensor.SetData(nullptr); | |||
| x_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Group Filter Grad passed"; | |||
| } | |||
| @@ -317,9 +330,15 @@ TEST_F(TestConvolutionGradFp32, ConvFp32GroupInputGrad) { | |||
| std::string output_path = "./test_data/conv/convfp32_dx_g3_1_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(dx_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete [] dx_data; | |||
| delete [] w_data; | |||
| delete [] dy_data; | |||
| dx_tensor.SetData(nullptr); | |||
| w_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Group Input Grad passed"; | |||
| } | |||
| @@ -378,11 +397,14 @@ TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationFilterGrad) { | |||
| std::string output_path = "./test_data/conv/convfp32_dw_g3_d2_18_3_3_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(dw_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| // delete input_data; | |||
| // delete dy_data; | |||
| // delete [] dw_data; | |||
| delete [] input_data; | |||
| delete [] dy_data; | |||
| delete [] dw_data; | |||
| dw_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| x_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Group Dilation Filter Grad passed"; | |||
| } | |||
| @@ -441,80 +463,93 @@ TEST_F(TestConvolutionGradFp32, ConvFp32GroupDilationInputGrad) { | |||
| std::string output_path = "./test_data/conv/convfp32_dx_g3_d2_1_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(dx_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete [] dx_data; | |||
| delete [] w_data; | |||
| delete [] dy_data; | |||
| dx_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| w_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| delete conv_param; | |||
| // delete conv_param; | |||
| MS_LOG(INFO) << "TestConvolutionGradFp32 Group Dilation Input Grad passed"; | |||
| } | |||
| // TEST_F(TestConvolutionGradFp32, ConvGroupDilation) { | |||
| // // prepare stage | |||
| // auto conv_param = new ConvParameter(); | |||
| // InitConvParamGroup3Dilation2FP32(conv_param); | |||
| // size_t x_size; | |||
| // std::string x_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin"; | |||
| // auto x_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(x_path.c_str(), &x_size)); | |||
| // std::vector<int> dim_x({1, 28, 28, 3}); | |||
| // tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| // x_tensor.SetData(x_data); | |||
| // size_t w_size; | |||
| // std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin"; | |||
| // auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size)); | |||
| // std::vector<int> dim_w({18, 3, 3, 1}); | |||
| // tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w); | |||
| // w_tensor.SetData(w_data); | |||
| // size_t output_data_size = | |||
| // conv_param->output_batch_ * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; | |||
| // auto y_data = new float[output_data_size]; | |||
| // std::vector<int> dim_y({1, 26, 26, 18}); | |||
| // tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| // y_tensor.SetData(y_data); | |||
| // std::vector<tensor::Tensor *> inputs = {&x_tensor, &w_tensor}; | |||
| // std::vector<tensor::Tensor *> outputs = {&y_tensor}; | |||
| // // runtime part | |||
| // printf("Calculating runtime cost...\n"); | |||
| // uint64_t time_avg = 0; | |||
| // lite::Context context; | |||
| // ; | |||
| // context.deviceCtx.type = lite::DT_CPU; | |||
| // context.threadNum = 1; | |||
| // kernel::KernelKey desc = {kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Conv2D}; | |||
| // auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc); | |||
| // auto kernel = creator(inputs, outputs, (OpParameter *)conv_param, &context, desc); | |||
| // kernel->train(); | |||
| // EXPECT_EQ(kernel->is_train(), 1); | |||
| // // warm up loop | |||
| // for (int i = 0; i < 3; i++) { | |||
| // kernel->Run(); | |||
| // } | |||
| // int loop_count = 100; | |||
| // auto time_start = mindspore::lite::GetTimeUs(); | |||
| // for (int i = 0; i < loop_count; i++) { | |||
| // kernel->Run(); | |||
| // } | |||
| // auto time_end = mindspore::lite::GetTimeUs(); | |||
| // auto cost = time_end - time_start; | |||
| // time_avg = cost / loop_count; | |||
| // printf("single thread running time : %f ms\n", time_avg / 1000.0f); | |||
| // std::string output_path = "./test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin"; | |||
| // auto res = lite::CompareRelativeOutput(y_data, output_path); | |||
| // EXPECT_EQ(res, 0); | |||
| // delete kernel; | |||
| // delete conv_param; | |||
| // MS_LOG(INFO) << "TestConvolutionFp32 Filter Grad passed"; | |||
| // } | |||
| TEST_F(TestConvolutionGradFp32, ConvGroupDilation) { | |||
| // prepare stage | |||
| auto conv_param = new ConvParameter(); | |||
| InitConvParamGroup3Dilation2FP32(conv_param); | |||
| size_t x_size; | |||
| std::string x_path = "./test_data/conv/convfp32_x_g3_d2_1_28_28_3.bin"; | |||
| auto x_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(x_path.c_str(), &x_size)); | |||
| std::vector<int> dim_x({1, 28, 28, 3}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| size_t w_size; | |||
| std::string w_path = "./test_data/conv/convfp32_w_g3_d2_18_3_3_3.bin"; | |||
| auto w_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(w_path.c_str(), &w_size)); | |||
| std::vector<int> dim_w({18, 3, 3, 1}); | |||
| lite::tensor::Tensor w_tensor(TypeId::kNumberTypeFloat32, dim_w); | |||
| w_tensor.SetData(w_data); | |||
| size_t output_data_size = | |||
| conv_param->output_batch_ * conv_param->output_h_ * conv_param->output_w_ * conv_param->output_channel_; | |||
| auto y_data = new float[output_data_size]; | |||
| std::vector<int> dim_y({1, 26, 26, 18}); | |||
| lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| y_tensor.SetData(y_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&x_tensor, &w_tensor}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&y_tensor}; | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| uint64_t time_avg = 0; | |||
| lite::Context context; | |||
| context.device_ctx_.type = lite::DT_CPU; | |||
| context.thread_num_ = 1; | |||
| auto *kernel = new mindspore::kernel::ConvolutionTrainCPUKernel(reinterpret_cast<OpParameter *>(conv_param), | |||
| inputs, outputs, &context, 0); | |||
| kernel->Init(); | |||
| // kernel::KernelKey desc = {kernel::kCPU, kNumberTypeFloat32, schema::PrimitiveType_Conv2D}; | |||
| // auto creator = lite::KernelRegistry::GetInstance()->GetKernelCreator(desc); | |||
| // auto kernel = creator(inputs, outputs, (OpParameter *)conv_param, &context, desc); | |||
| kernel->train(); | |||
| EXPECT_EQ(kernel->is_train(), 1); | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| kernel->Run(); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| kernel->Run(); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| time_avg = cost / loop_count; | |||
| printf("single thread running time : %f ms\n", time_avg / 1000.0f); | |||
| std::string output_path = "./test_data/conv/convfp32_y_g3_d2_1_26_26_18.bin"; | |||
| auto res = lite::CompareRelativeOutput(y_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete [] y_data; | |||
| delete [] x_data; | |||
| delete [] w_data; | |||
| x_tensor.SetData(nullptr); | |||
| y_tensor.SetData(nullptr); | |||
| w_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "TestConvolutionFp32 ConvGroupDilation passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,564 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <dirent.h> | |||
| #include <climits> | |||
| #include <cmath> | |||
| #include <iostream> | |||
| #include <fstream> | |||
| #include <memory> | |||
| #include <string> | |||
| #include <functional> | |||
| #include "mindspore/lite/schema/inner/model_generated.h" | |||
| #include "mindspore/lite/include/model.h" | |||
| #include "common/common_test.h" | |||
| #include "include/train_session.h" | |||
| // #include "include/lite_session.h" | |||
| #include "include/context.h" | |||
| #include "include/errorcode.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/file_utils_ext.h" | |||
| namespace mindspore { | |||
| class NetworkTest : public mindspore::CommonTest { | |||
| public: | |||
| NetworkTest() {} | |||
| }; | |||
| // INPUT(0) | |||
| // V | |||
| // +-------------+ | |||
| // | ReLU | | |||
| // +-------------+ | |||
| // +---output(1) V | |||
| // | V V weights(2) <----+ | |||
| // | +-------------+ | | |||
| // | | MatMul | | | |||
| // | +-------------+ | | |||
| // | output(3) V | | |||
| // | V V weights(4)<-+ | | |||
| // | +-------------+ | | | |||
| // | | Bias | | | | |||
| // | +-------------+ | | | |||
| // | output(5) V | | | |||
| // | V V LABELS(6) | | | |||
| // | +-------------+ | | | |||
| // | | CrossEntropy| | | | |||
| // | +-------------+ | | | |||
| // | +-dy(7) V V------------------------->Loss (14) | |||
| // | | V | | | |||
| // | | +-------------+ | | | |||
| // | | | BiasGrad | | | | |||
| // | | +-------------+ | | | |||
| // | | V db(8) | | | |||
| // | | +--------Update---+ | | |||
| // | +-------+ | | |||
| // +------V V | | |||
| // +-------------+ | | |||
| // | MatMul | | | |||
| // +-------------+ | | |||
| // V dw(9) | | |||
| // +-----------Update-----+ | |||
| TEST_F(NetworkTest, tuning_layer) { | |||
| const int BATCH_SIZE = 32; | |||
| const int NUM_CLASSES = 10; | |||
| const int FEATURE_SIZE = 1000; | |||
| auto meta_graph = std::make_shared<schema::MetaGraphT>(); | |||
| meta_graph->name = "graph"; | |||
| // define nodes | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {0}; | |||
| node->outputIndex = {1}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_Activation; | |||
| auto primitive = new schema::ActivationT; | |||
| primitive->type = schema::ActivationType_RELU; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "ReLU"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {1, 2}; | |||
| node->outputIndex = {3}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_MatMul; | |||
| auto primitive = new schema::MatMulT; | |||
| primitive->transposeA = false; | |||
| primitive->transposeB = true; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "MatMul1"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {3, 4}; | |||
| node->outputIndex = {5}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_BiasAdd; | |||
| auto primitive = new schema::BiasAddT; | |||
| primitive->axis.push_back(0); | |||
| node->primitive->value.value = primitive; | |||
| node->name = "BiasAdd"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {5, 6}; | |||
| node->outputIndex = {14, 7}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_SoftmaxCrossEntropy; | |||
| auto primitive = new schema::SoftmaxCrossEntropyT; | |||
| primitive->axis.push_back(0); | |||
| node->primitive->value.value = primitive; | |||
| node->name = "SoftmaxCrossEntropy"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {7}; | |||
| node->outputIndex = {8}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_BiasGrad; | |||
| auto primitive = new schema::BiasGradT; | |||
| primitive->axis.push_back(0); | |||
| node->primitive->value.value = primitive; | |||
| node->name = "BiasGrad"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {7, 1}; | |||
| node->outputIndex = {9}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_MatMul; | |||
| auto primitive = new schema::MatMulT; | |||
| primitive->transposeA = true; | |||
| primitive->transposeB = false; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "MatMul2"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {2, 10, 11, 9, 12}; | |||
| node->outputIndex = {}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_ApplyMomentum; | |||
| auto primitive = new schema::ApplyMomentumT; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "Momentum"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| { | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| node->inputIndex = {4, 13, 11, 8, 12}; | |||
| node->outputIndex = {}; | |||
| node->primitive = std::make_unique<schema::PrimitiveT>(); | |||
| node->primitive->value.type = schema::PrimitiveType_ApplyMomentum; | |||
| auto primitive = new schema::ApplyMomentumT; | |||
| node->primitive->value.value = primitive; | |||
| node->name = "Momentum"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| } | |||
| meta_graph->inputIndex = {6, 0}; // XXX TODO why is it reverse? | |||
| meta_graph->outputIndex = {5, 14}; | |||
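| // per the diagram above: tensor 5 is the BiasAdd output (predictions), tensor 14 is the scalar loss | |||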
| const int NUM_OF_OUTPUTS = 2; | |||
| auto input0 = std::make_unique<schema::TensorT>(); | |||
| input0->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| input0->format = schema::Format_NHWC; | |||
| input0->dataType = TypeId::kNumberTypeFloat32; | |||
| input0->dims = {BATCH_SIZE, FEATURE_SIZE}; | |||
| input0->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input0)); | |||
| // tensor 1 - relu | |||
| auto relu_out = std::make_unique<schema::TensorT>(); | |||
| relu_out->nodeType = schema::NodeType::NodeType_Parameter; | |||
| relu_out->format = schema::Format_NHWC; | |||
| relu_out->dataType = TypeId::kNumberTypeFloat32; | |||
| relu_out->dims = {BATCH_SIZE, FEATURE_SIZE}; | |||
| relu_out->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(relu_out)); | |||
| // tensor 2 - matmul weights | |||
| auto weight = std::make_unique<schema::TensorT>(); | |||
| weight->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| weight->format = schema::Format_KHWC; | |||
| weight->dataType = TypeId::kNumberTypeFloat32; | |||
| weight->dims = {NUM_CLASSES, FEATURE_SIZE}; | |||
| size_t weight_size; | |||
| char *buf; | |||
| std::string weight_path = "./test_data/train/train_weight_10_1000.bin"; | |||
| ReadFile(weight_path.c_str(), &weight_size, &buf); | |||
| ASSERT_NE(nullptr, buf); | |||
| weight->data.resize(weight_size); | |||
| std::copy(buf, buf + weight_size, weight->data.data()); | |||
| meta_graph->allTensors.emplace_back(std::move(weight)); | |||
| // tensor 3 - matmul | |||
| auto input3 = std::make_unique<schema::TensorT>(); | |||
| input3->nodeType = schema::NodeType::NodeType_Parameter; | |||
| input3->format = schema::Format_NHWC; | |||
| input3->dataType = TypeId::kNumberTypeFloat32; | |||
| input3->dims = {BATCH_SIZE, NUM_CLASSES}; | |||
| input3->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input3)); | |||
| // tensor 4 - fc bias | |||
| auto bias = std::make_unique<schema::TensorT>(); | |||
| bias->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| bias->format = schema::Format_NHWC; | |||
| bias->dataType = TypeId::kNumberTypeFloat32; | |||
| bias->dims = {NUM_CLASSES}; | |||
| bias->offset = -1; | |||
| std::string bias_path = "./test_data/train/train_bias_10.bin"; | |||
| size_t bias_size; | |||
| ReadFile(bias_path.c_str(), &bias_size, &buf); | |||
| ASSERT_NE(nullptr, buf); | |||
| bias->data.resize(bias_size); | |||
| std::copy(buf, buf + bias_size, bias->data.data()); | |||
| meta_graph->allTensors.emplace_back(std::move(bias)); | |||
| // tensor 5 - bias_add | |||
| auto input5 = std::make_unique<schema::TensorT>(); | |||
| input5->nodeType = schema::NodeType::NodeType_Parameter; | |||
| input5->format = schema::Format_NHWC; | |||
| input5->dataType = TypeId::kNumberTypeFloat32; | |||
| input5->dims = {BATCH_SIZE, NUM_CLASSES}; | |||
| input5->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input5)); | |||
| // tensor 6 - Label | |||
| { | |||
| auto label = std::make_unique<schema::TensorT>(); | |||
| label->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| label->format = schema::Format_NHWC; | |||
| label->dataType = TypeId::kNumberTypeInt32; | |||
| label->dims = {BATCH_SIZE}; | |||
| label->offset = -1; | |||
| label->data.resize(BATCH_SIZE * NUM_CLASSES * sizeof(float)); | |||
| int *data = reinterpret_cast<int *>(label->data.data()); | |||
| for (int i = 0; i < BATCH_SIZE; i++) | |||
| for (int j = 0; j < NUM_CLASSES; j++) *(data + i * NUM_CLASSES + j) = j; | |||
| meta_graph->allTensors.emplace_back(std::move(label)); | |||
| } | |||
| // tensor 7 - Softmaxentropy | |||
| auto input7 = std::make_unique<schema::TensorT>(); | |||
| input7->nodeType = schema::NodeType::NodeType_Parameter; | |||
| input7->format = schema::Format_NHWC; | |||
| input7->dataType = TypeId::kNumberTypeFloat32; | |||
| input7->dims = {BATCH_SIZE, NUM_CLASSES}; | |||
| input7->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input7)); | |||
| // tensor 8 - biasGrad | |||
| auto input8 = std::make_unique<schema::TensorT>(); | |||
| input8->nodeType = schema::NodeType::NodeType_Parameter; | |||
| input8->format = schema::Format_NHWC; | |||
| input8->dataType = TypeId::kNumberTypeFloat32; | |||
| input8->dims = {NUM_CLASSES}; | |||
| input8->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input8)); | |||
| // tensor 9 - matmul2 | |||
| auto input9 = std::make_unique<schema::TensorT>(); | |||
| input9->nodeType = schema::NodeType::NodeType_Parameter; | |||
| input9->format = schema::Format_NHWC; | |||
| input9->dataType = TypeId::kNumberTypeFloat32; | |||
| input9->dims = {NUM_CLASSES, FEATURE_SIZE}; | |||
| input9->offset = -1; | |||
| meta_graph->allTensors.emplace_back(std::move(input9)); | |||
| // tensor 10 weights accumulate | |||
| auto input10 = std::make_unique<schema::TensorT>(); | |||
| input10->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| input10->format = schema::Format_NHWC; | |||
| input10->dataType = TypeId::kNumberTypeFloat32; | |||
| input10->dims = {NUM_CLASSES, FEATURE_SIZE}; | |||
| input10->offset = -1; | |||
| size_t input10_size = NUM_CLASSES * FEATURE_SIZE * sizeof(float); | |||
| input10->data.resize(input10_size); | |||
| std::fill(input10->data.data(), input10->data.data() + input10_size, 0.f); | |||
| meta_graph->allTensors.emplace_back(std::move(input10)); | |||
| // tensor 11 - lr | |||
| { | |||
| auto lr = std::make_unique<schema::TensorT>(); | |||
| lr->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| lr->format = schema::Format_NHWC; | |||
| lr->dataType = TypeId::kNumberTypeFloat32; | |||
| lr->dims = {1}; | |||
| lr->offset = -1; | |||
| lr->data.resize(sizeof(float)); | |||
| float *data = reinterpret_cast<float *>(lr->data.data()); | |||
| *data = 0.01f; | |||
| meta_graph->allTensors.emplace_back(std::move(lr)); | |||
| } | |||
| // tensor 12 - momentum | |||
| { | |||
| auto input12 = std::make_unique<schema::TensorT>(); | |||
| input12->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| input12->format = schema::Format_NHWC; | |||
| input12->dataType = TypeId::kNumberTypeFloat32; | |||
| input12->dims = {1}; | |||
| input12->offset = -1; | |||
| input12->data.resize(sizeof(float)); | |||
| float *data = reinterpret_cast<float *>(input12->data.data()); | |||
| *data = 0.f; | |||
| meta_graph->allTensors.emplace_back(std::move(input12)); | |||
| } | |||
| // tensor 13 - bias accumulate | |||
| auto input13 = std::make_unique<schema::TensorT>(); | |||
| input13->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| input13->format = schema::Format_NHWC; | |||
| input13->dataType = TypeId::kNumberTypeFloat32; | |||
| input13->dims = {NUM_CLASSES}; | |||
| input13->offset = -1; | |||
| size_t input13_size = NUM_CLASSES * sizeof(float); | |||
| input13->data.resize(input13_size); | |||
| std::fill(input13->data.data(), input13->data.data() + input13_size, 0.f); | |||
| meta_graph->allTensors.emplace_back(std::move(input13)); | |||
| // tensor 14 - loss | |||
| { | |||
| auto loss14 = std::make_unique<schema::TensorT>(); | |||
| loss14->nodeType = schema::NodeType::NodeType_ValueNode; | |||
| loss14->format = schema::Format_NHWC; | |||
| loss14->dataType = TypeId::kNumberTypeFloat32; | |||
| loss14->dims = {1}; | |||
| loss14->offset = -1; | |||
| loss14->data.resize(sizeof(float)); | |||
| float *data = reinterpret_cast<float *>(loss14->data.data()); | |||
| *data = 0.0f; | |||
| meta_graph->allTensors.emplace_back(std::move(loss14)); | |||
| } | |||
| //================================================================ | |||
| buf = nullptr; | |||
| flatbuffers::FlatBufferBuilder builder(1024); | |||
| auto offset = schema::MetaGraph::Pack(builder, meta_graph.get()); | |||
| builder.Finish(offset); | |||
| size_t size = builder.GetSize(); | |||
| const char *content = reinterpret_cast<char *>(builder.GetBufferPointer()); | |||
| std::cout << "build fb size= " << size << "\n"; | |||
| #if 0 // EXPORT_FILE | |||
| std::string path = std::string("hcdemo_train.fb"); | |||
| std::ofstream ofs(path); | |||
| ASSERT_EQ(true, ofs.good()); | |||
| ASSERT_EQ(true, ofs.is_open()); | |||
| ofs.seekp(0, std::ios::beg); | |||
| ofs.write(content, size); | |||
| ofs.close(); | |||
| #endif | |||
| auto model = lite::Model::Import(content, size); | |||
| ASSERT_NE(nullptr, model); | |||
| meta_graph.reset(); | |||
| content = nullptr; | |||
| auto context = new lite::Context; | |||
| context->device_ctx_.type = lite::DT_CPU; | |||
| context->cpu_bind_mode_ = lite::NO_BIND; | |||
| context->thread_num_ = 1; | |||
| auto session = new session::TrainSession(); | |||
| ASSERT_NE(nullptr, session); | |||
| session->Init(context); | |||
| auto ret = session->CompileGraph(model); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| session->train(); | |||
| auto inputs = session->GetInputs(); | |||
| ASSERT_EQ(inputs.size(), 2); | |||
| auto inTensor = inputs.at(0); | |||
| ASSERT_NE(nullptr, inTensor); | |||
| auto data = inTensor->MutableData(); | |||
| //=================================================== | |||
| size_t input_size; | |||
| std::string input_path = "./test_data/train/train_input_32_1000.bin"; | |||
| ReadFile(input_path.c_str(), &input_size, &buf); | |||
| ASSERT_NE(nullptr, buf); | |||
| auto input_data = reinterpret_cast<float *>(buf); | |||
| ASSERT_NE(nullptr, input_data); | |||
| //=================================================== | |||
| ASSERT_EQ(input_size, inTensor->Size()); | |||
| memcpy(data, input_data, input_size); | |||
| auto labelTensor = inputs.at(1); | |||
| ASSERT_NE(nullptr, labelTensor); | |||
| ASSERT_EQ(BATCH_SIZE, labelTensor->ElementsNum()); | |||
| auto labels = reinterpret_cast<int *>(labelTensor->MutableData()); | |||
| for (int i = 0; i < BATCH_SIZE; i++) labels[i] = (i * 97) % NUM_CLASSES; | |||
| ret = session->RunGraph(); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| auto outputs = session->GetOutputsByName("BiasAdd"); | |||
| ASSERT_EQ(outputs.size(), 1); | |||
| auto outTensor = (outputs.at(0)); | |||
| ASSERT_NE(nullptr, outTensor); | |||
| ASSERT_EQ(TypeId::kNumberTypeFloat32, outTensor->data_type()); | |||
| auto *outData = reinterpret_cast<float *>(outTensor->MutableData()); | |||
| ASSERT_NE(nullptr, outData); | |||
| std::cout << "========================dW=====================" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << outData[i] << ", "; | |||
| } | |||
| std::cout << std::endl; | |||
| ret = session->RunGraph(); | |||
| outputs = session->GetOutputsByName("BiasAdd"); | |||
| ASSERT_EQ(outputs.size(), 1); | |||
| outTensor = (outputs.at(0)); | |||
| ASSERT_NE(nullptr, outTensor); | |||
| // ASSERT_EQ(28 * 28 * 32, outTensor->ElementsNum()); | |||
| ASSERT_EQ(TypeId::kNumberTypeFloat32, outTensor->data_type()); | |||
| outData = reinterpret_cast<float *>(outTensor->MutableData()); | |||
| ASSERT_NE(nullptr, outData); | |||
| std::cout << "========================dW=====================" << std::endl; | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << outData[i] << ", "; | |||
| } | |||
| //=================================================== | |||
| #if 0 | |||
| size_t output_size; | |||
| std::string output_path = "./convfp32_out_1_28_28_32.bin"; | |||
| buf = mindspore::lite::ReadFile(output_path.c_str(), &output_size); | |||
| ASSERT_NE(nullptr, buf); | |||
| auto output_data = reinterpret_cast<float *>(buf); | |||
| ASSERT_NE(nullptr, output_data); | |||
| //=================================================== | |||
| ASSERT_EQ(output_size, runOutput->Size()); | |||
| for (size_t i = 0; i < runOutput->ElementsNum(); i++) { | |||
| ASSERT_EQ(output_data[i], outData[i]); | |||
| } | |||
| #endif | |||
| MS_LOG(INFO) << "Passed"; | |||
| } | |||
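| // Recursively walks `path` and applies `cb` to every regular file, OR-ing the returned status codes. | |||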
| int32_t fileIterator(mindspore::session::TrainSession *session, const std::string &path, | |||
| std::function<int32_t(mindspore::session::TrainSession *session, | |||
| const std::string &)> cb) { | |||
| int32_t res = 0; | |||
| if (auto dir = opendir(path.c_str())) { | |||
| while (auto f = readdir(dir)) { | |||
| if (!f->d_name || f->d_name[0] == '.') continue; | |||
| if (f->d_type == DT_DIR) res |= fileIterator(session, path + f->d_name + "/", cb); | |||
| if (f->d_type == DT_REG) | |||
| res |= cb(session, path + f->d_name); | |||
| } | |||
| closedir(dir); | |||
| } | |||
| return res; | |||
| } | |||
| #if 0 | |||
| void replaceExt(const std::string &src, std::string *dst) { | |||
| *dst = src.substr(0, src.find_last_of('.')) + ".emb"; | |||
| } | |||
| #endif | |||
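| // Feeds one input binary into the session, runs the graph, and compares the first output against the reference file `out`. | |||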
| int32_t runEffNet(mindspore::session::TrainSession *session, const std::string &in, const std::string &out) { | |||
| // setup input | |||
| auto inputs = session->GetInputs(); | |||
| // ASSERT_EQ(inputs.size(), 1); | |||
| auto inTensor = inputs.at(0); | |||
| // ASSERT_NE(nullptr, inTensor); | |||
| float *data = reinterpret_cast<float *>(inTensor->MutableData()); | |||
| size_t input_size; | |||
| float *in_buf = reinterpret_cast<float *>(lite::ReadFile(in.c_str(), &input_size)); | |||
| // ASSERT_NE(nullptr, data); | |||
| auto input_data = reinterpret_cast<float *>(in_buf); | |||
| // ASSERT_EQ(input_size, inTensor->Size()); | |||
| std::copy(input_data, input_data + inTensor->ElementsNum(), data); | |||
| // execute network | |||
| session->RunGraph(); | |||
| // compare outputs | |||
| auto outputs = session->GetOutputs(); | |||
| auto output = ((outputs.begin())->second); | |||
| float *output_data = reinterpret_cast<float *>(output.at(0)->MutableData()); | |||
| auto res = mindspore::lite::CompareRelativeOutput(output_data, out.c_str()); | |||
| delete[] in_buf;  // ReadFile allocates this buffer; release it as the other tests do | |||
| return res; | |||
| } | |||
| TEST_F(NetworkTest, efficient_net) { | |||
| const int NUM_OF_INPUTS = 1; | |||
| char *buf = nullptr; | |||
| size_t net_size = 0; | |||
| std::string net = "./test_data/nets/efficientnet_b0_f.ms"; | |||
| ReadFile(net.c_str(), &net_size, &buf); | |||
| auto model = lite::Model::Import(buf, net_size); | |||
| auto context = new lite::Context; | |||
| context->device_ctx_.type = lite::DT_CPU; | |||
| context->cpu_bind_mode_ = lite::NO_BIND; | |||
| context->thread_num_ = 1; | |||
| auto session = new mindspore::session::TrainSession(); | |||
| ASSERT_NE(session, nullptr); | |||
| auto ret = session->Init(context); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| ret = session->CompileGraph(model); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| session->eval(); | |||
| #if 0 | |||
| std::string path = "/opt/share/MiniBinEmbDataset/"; | |||
| auto res = fileIterator(session, path, [](mindspore::session::TrainSession *session, const std::string &in) { | |||
| int32_t res = 0; | |||
| if (in.find(".bin") != std::string::npos) { | |||
| std::string out; | |||
| replaceExt(in, &out); | |||
| res = runEffNet(session, in, out); | |||
| std::cout << "input file: " << in << (res ? " Fail" : " Pass") << std::endl; | |||
| } | |||
| return res; | |||
| }); | |||
| #else | |||
| std::string in = "./test_data/nets/effNet_input_x_1_3_224_224.bin"; | |||
| std::string out = "./test_data/nets/effNet_output_y_1_1000.bin"; | |||
| auto res = runEffNet(session, in, out); | |||
| #endif | |||
| // auto inputs = session->GetInputs(); | |||
| // ASSERT_EQ(inputs.size(), NUM_OF_INPUTS); | |||
| // auto inTensor = inputs.at(0); | |||
| // ASSERT_NE(nullptr, inTensor); | |||
| // float *data = reinterpret_cast<float *>(inTensor->MutableData()); | |||
| // // fill input | |||
| // std::string input_path = "./test_data/nets/effNet_input_x_1_3_224_224.bin"; | |||
| // // std::string input_path = "/opt/share/MiniBinEmbDataset/2_pet/n02099601_3111.bin"; | |||
| // size_t input_size; | |||
| // char *in_buf = nullptr; | |||
| // ReadFile(input_path.c_str(), &input_size, &in_buf); | |||
| // ASSERT_NE(nullptr, data); | |||
| // auto input_data = reinterpret_cast<float *>(in_buf); | |||
| // ASSERT_EQ(input_size, inTensor->Size()); | |||
| // std::copy(input_data, input_data+inTensor->ElementsNum(), data); | |||
| // // execute network | |||
| // ret = session->RunGraph(); | |||
| // // compare outputs | |||
| // std::string output_path = "./test_data/nets/effNet_output_y_1_1000.bin"; | |||
| // // std::string output_path = "/opt/share/MiniBinEmbDataset/2_pet/n02099601_3111.emb"; | |||
| // auto outputs = session->GetOutputs(); | |||
| // auto output = ((outputs.begin())->second); | |||
| // float* output_data = reinterpret_cast<float *>(output.at(0)->MutableData()); | |||
| // int res = lite::CompareRelativeOutput(output_data, output_path); | |||
| ASSERT_EQ(res, 0); | |||
| } | |||
| } // namespace mindspore | |||
| @@ -22,6 +22,7 @@ | |||
| #include "mindspore/lite/src/kernel_registry.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/file_utils_ext.h" | |||
| #include "src/runtime/kernel/arm/fp32_grad/pooling_grad.h" | |||
| #include "nnacl/fp32_grad/pooling_grad.h" | |||
| @@ -60,6 +61,7 @@ TEST_F(TestPoolingGradFp32, AvgPoolingGradFp32) { | |||
| auto pooling_param = new PoolingParameter(); | |||
| InitPoolingParamFP32(pooling_param); | |||
| pooling_param->output_channel_ = 3; | |||
| pooling_param->pool_mode_ = PoolMode_AvgPool; | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| @@ -95,7 +97,7 @@ TEST_F(TestPoolingGradFp32, AvgPoolingGradFp32) { | |||
| std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin"; | |||
| lite::CompareOutput(output_data, output_path); | |||
| delete input_data; | |||
| delete[] input_data; | |||
| delete[] output_data; | |||
| delete pooling_param; | |||
| MS_LOG(INFO) << "TestAvgPoolingGradFp32 passed"; | |||
| @@ -122,10 +124,10 @@ TEST_F(TestPoolingGradFp32, AvgPoolingKernelGradFp32) { | |||
| dy_tensor.SetData(input_data); | |||
| std::string input1_path = "./test_data/pooling/avgpoolgradfp32_1_x_1_28_28_3.bin"; | |||
| input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1_path.c_str(), &input_size)); | |||
| auto input1_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1_path.c_str(), &input_size)); | |||
| std::vector<int> dim_x({1, 28, 28, 3}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(input_data); | |||
| x_tensor.SetData(input1_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor}; | |||
| @@ -150,12 +152,205 @@ TEST_F(TestPoolingGradFp32, AvgPoolingKernelGradFp32) { | |||
| std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_1_28_28_3.bin"; | |||
| lite::CompareOutput(output_data, output_path); | |||
| // delete input_data; | |||
| // delete[] output_data; | |||
| delete pooling_param; | |||
| delete[] input_data; | |||
| delete[] input1_data; | |||
| delete[] output_data; | |||
| dx_tensor.SetData(nullptr); | |||
| x_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| // delete pooling_param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestAvgPoolingGradFp32 passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, AvgPoolingBatchGradFp32) { | |||
| // prepare stage | |||
| auto pooling_param = new PoolingParameter(); | |||
| InitPoolingParamFP32(pooling_param); | |||
| pooling_param->output_channel_ = 3; | |||
| pooling_param->input_batch_ = 3; | |||
| pooling_param->output_batch_ = 3; | |||
| // runtime part | |||
| printf("Calculating runtime cost...\n"); | |||
| // uint64_t time_avg = 0; | |||
| size_t output_data_size = | |||
| pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->input_h_ * pooling_param->input_w_; | |||
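| // dx keeps the shape of the forward input x, so the buffer needs | |||
| // output_batch_ * output_channel_ * input_h_ * input_w_ floats | |||
| // (presumably 3 * 3 * 28 * 28 = 7056 here, matching the *_3_28_28_3 test files). | |||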
| size_t input_size; | |||
| std::string input_path = "./test_data/pooling/avgpoolgradfp32_1_dy_3_28_28_3.bin"; | |||
| auto input_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input_path.c_str(), &input_size)); | |||
| std::vector<int> dim_dy({3, 28, 28, 3}); | |||
| lite::tensor::Tensor dy_tensor(TypeId::kNumberTypeFloat32, dim_dy); | |||
| dy_tensor.SetData(input_data); | |||
| std::string input1_path = "./test_data/pooling/avgpoolgradfp32_1_x_3_28_28_3.bin"; | |||
| auto input1_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(input1_path.c_str(), &input_size)); | |||
| std::vector<int> dim_x({3, 28, 28, 3}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(input1_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&dy_tensor, &x_tensor}; | |||
| auto output_data = new float[output_data_size]; | |||
| std::vector<int> dim_dx({3, 28, 28, 3}); | |||
| lite::tensor::Tensor dx_tensor(TypeId::kNumberTypeFloat32, dim_dx); | |||
| dx_tensor.SetData(output_data); | |||
| std::vector<lite::tensor::Tensor *> outputs = {&dx_tensor}; | |||
| kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); | |||
| auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(pooling_param), NULL, desc, nullptr); | |||
| kernel_obj->Init(); | |||
| kernel_obj->Run(); | |||
| printf("==================output data=================\n"); | |||
| for (int i = 0; i < 20; i++) { | |||
| std::cout << output_data[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| std::string output_path = "./test_data/pooling/avgpoolgradfp32_1_dx_3_28_28_3.bin"; | |||
| lite::CompareOutput(output_data, output_path); | |||
| delete[] input_data; | |||
| delete[] input1_data; | |||
| delete[] output_data; | |||
| dx_tensor.SetData(nullptr); | |||
| x_tensor.SetData(nullptr); | |||
| dy_tensor.SetData(nullptr); | |||
| // delete pooling_param; | |||
| delete kernel_obj; | |||
| MS_LOG(INFO) << "TestAvgPoolingGradBatchFp32 passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, AvgPoolGradStride2Fp32) { | |||
| // prepare stage | |||
| // input size will be equal to the original size of x; output size will be the output size as in the forward pass | |||
| auto pool = new PoolingParameter(); | |||
| InitPoolingParamFP32(pool); | |||
| pool->output_channel_ = 3; | |||
| pool->pool_mode_ = PoolMode_AvgPool; | |||
| pool->input_batch_ = 3; | |||
| pool->output_batch_ = 3; | |||
| pool->output_h_ = 14; | |||
| pool->output_w_ = 14; | |||
| pool->stride_h_ = 2; | |||
| pool->stride_w_ = 2; | |||
| size_t input_size; | |||
| size_t y_data_size = pool->output_batch_ * pool->output_channel_ * pool->input_h_ * pool->input_w_; | |||
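| // Despite its name, y_data_size sizes the dx buffer: the gradient w.r.t. x keeps the | |||
| // forward-input shape (28x28 per the *_3_28_28_3 files), while dy uses the strided | |||
| // 14x14 output shape set above. The later stride/batch tests follow the same convention. | |||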
| auto x_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/avgpoolgradfp32_s2_x_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_x({pool->output_batch_, pool->input_h_, pool->input_w_, pool->input_channel_}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| auto yt_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/avgpoolgradfp32_s2_dy_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_y({pool->output_batch_, pool->output_h_, pool->output_w_, pool->output_channel_}); | |||
| lite::tensor::Tensor yt_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| yt_tensor.SetData(yt_data); | |||
| auto out_data = new float[y_data_size]; | |||
| lite::tensor::Tensor out_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| out_tensor.SetData(out_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&yt_tensor, &x_tensor}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&out_tensor}; | |||
| // ---------------------------------------- | |||
| kernel::KernelKey pool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto pool_creator = lite::KernelRegistry::GetInstance()->GetCreator(pool_desc); | |||
| auto kernel = pool_creator(inputs, outputs, reinterpret_cast<OpParameter *>(pool), NULL, pool_desc, nullptr); | |||
| kernel->Init(); | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| kernel->Run(); | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| printf("single thread running time : %ld ms\n", time_end - time_start); | |||
| std::string output_path = "./test_data/pooling/avgpoolgradfp32_s2_dx_3_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(out_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete[] x_data; | |||
| delete[] yt_data; | |||
| delete[] out_data; | |||
| x_tensor.SetData(nullptr); | |||
| yt_tensor.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "AvgPoolGradStride2Fp32 Filter Grad passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, AvgPoolGradStride3Fp32) { | |||
| // prepare stage | |||
| // input size will be equal to the original size of x; output size will be the output size as in the forward pass | |||
| auto pool = new PoolingParameter(); | |||
| InitPoolingParamFP32(pool); | |||
| pool->output_channel_ = 3; | |||
| pool->pool_mode_ = PoolMode_AvgPool; | |||
| pool->input_batch_ = 3; | |||
| pool->output_batch_ = 3; | |||
| pool->output_h_ = 10; | |||
| pool->output_w_ = 10; | |||
| pool->stride_h_ = 3; | |||
| pool->stride_w_ = 3; | |||
| size_t input_size; | |||
| size_t y_data_size = pool->output_batch_ * pool->output_channel_ * pool->input_h_ * pool->input_w_; | |||
| auto x_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/avgpoolgradfp32_s3_x_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_x({pool->output_batch_, pool->input_h_, pool->input_w_, pool->input_channel_}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| auto yt_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/avgpoolgradfp32_s3_dy_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_y({pool->output_batch_, pool->output_h_, pool->output_w_, pool->output_channel_}); | |||
| lite::tensor::Tensor yt_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| yt_tensor.SetData(yt_data); | |||
| auto out_data = new float[y_data_size]; | |||
| lite::tensor::Tensor out_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| out_tensor.SetData(out_data); | |||
| std::vector<lite::tensor::Tensor *> inputs = {&yt_tensor, &x_tensor}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&out_tensor}; | |||
| // ---------------------------------------- | |||
| kernel::KernelKey pool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto pool_creator = lite::KernelRegistry::GetInstance()->GetCreator(pool_desc); | |||
| auto kernel = pool_creator(inputs, outputs, reinterpret_cast<OpParameter *>(pool), NULL, pool_desc, nullptr); | |||
| kernel->Init(); | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| kernel->Run(); | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| printf("single thread running time : %ld ms\n", time_end - time_start); | |||
| std::string output_path = "./test_data/pooling/avgpoolgradfp32_s3_dx_3_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(out_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete[] x_data; | |||
| delete[] yt_data; | |||
| delete[] out_data; | |||
| x_tensor.SetData(nullptr); | |||
| yt_tensor.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "AvgPoolGradStride3Fp32 Filter Grad passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) { | |||
| // prepare stage | |||
| auto pooling_param = new PoolingParameter(); | |||
| @@ -169,26 +364,25 @@ TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) { | |||
| pooling_param->output_batch_ * pooling_param->output_channel_ * pooling_param->output_h_ * pooling_param->output_w_; | |||
| size_t input_size; | |||
| std::string i_path = "./test_data/pooling/maxpoolgradfp32_1_i_1_28_28_3.bin"; | |||
| auto ill_data = reinterpret_cast<int64_t *>(mindspore::lite::ReadFile(i_path.c_str(), &input_size)); | |||
| auto i_data = new int[output_data_size]; | |||
| for (uint32_t i = 0; i < output_data_size; i++) { | |||
| i_data[i] = static_cast<int>(ill_data[i]); | |||
| } | |||
| std::string i_path = "./test_data/pooling/maxpoolgradfp32_1_x_1_28_28_3.bin"; | |||
| auto in_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(i_path.c_str(), &input_size)); | |||
| std::string dy_path = "./test_data/pooling/maxpoolgradfp32_1_dy_1_28_28_3.bin"; | |||
| auto dy_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dy_path.c_str(), &input_size)); | |||
| std::string dx_path = "./test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin"; | |||
| auto dx_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(dx_path.c_str(), &input_size)); | |||
| auto output_data = new float[output_data_size]; | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| MaxPoolingGrad(dy_data, i_data, output_data, pooling_param); | |||
| MaxPoolingGrad(in_data, dx_data, dy_data, output_data, pooling_param); | |||
| } | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| for (int i = 0; i < loop_count; i++) { | |||
| MaxPoolingGrad(dy_data, i_data, output_data, pooling_param); | |||
| MaxPoolingGrad(in_data, dx_data, dy_data, output_data, pooling_param); | |||
| } | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| auto cost = time_end - time_start; | |||
| @@ -200,11 +394,13 @@ TEST_F(TestPoolingGradFp32, MaxPoolingGradFp32) { | |||
| std::cout << output_data[i] << " ,"; | |||
| } | |||
| std::cout << std::endl; | |||
| std::string output_path = "./test_data/pooling/maxpoolgradfp32_1_dx_1_28_28_3.bin"; | |||
| std::string output_path = "./test_data/pooling/maxpoolgradfp32_1_xgrad_1_28_28_3.bin"; | |||
| lite::CompareOutput(output_data, output_path); | |||
| // delete input_data; | |||
| delete[] in_data; | |||
| delete pooling_param; | |||
| delete[] dy_data; | |||
| delete[] dx_data; | |||
| delete[] output_data; | |||
| MS_LOG(INFO) << "TestMaxPoolingGradFp32 passed"; | |||
| } | |||
| @@ -326,4 +522,216 @@ TEST_F(TestPoolingGradFp32, MaxPoolingKernelGradFp32) { | |||
| MS_LOG(INFO) << "TestMaxPoolingKernelGradFp32 passed"; | |||
| } | |||
| #endif // if 0 before MaxPoolingKernelGradFp32 | |||
| TEST_F(TestPoolingGradFp32, MaxPoolGradBatchFp32) { | |||
| // prepare stage | |||
| // input size will be equal to the original size of x; output size will be the output size as in the forward pass | |||
| auto maxpool = new PoolingParameter(); | |||
| InitPoolingParamFP32(maxpool); | |||
| maxpool->output_channel_ = 3; | |||
| maxpool->pool_mode_ = PoolMode_MaxPool; | |||
| maxpool->input_batch_ = 3; | |||
| maxpool->output_batch_ = 3; | |||
| size_t input_size; | |||
| size_t y_data_size = maxpool->output_batch_ * maxpool->output_channel_ * maxpool->input_h_ * maxpool->input_w_; | |||
| auto x_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_1_x_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_x({3, 28, 28, 3}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| auto y_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_1_dx_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_y({3, 28, 28, 3}); | |||
| lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| y_tensor.SetData(y_data); | |||
| auto yt_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_1_dy_3_28_28_3.bin", &input_size)); | |||
| lite::tensor::Tensor yt_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| yt_tensor.SetData(yt_data); | |||
| auto out_data = new float[y_data_size]; | |||
| lite::tensor::Tensor out_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| out_tensor.SetData(out_data); | |||
| std::vector<lite::tensor::Tensor *> maxpool_inputs = {&x_tensor, &y_tensor, &yt_tensor}; | |||
| std::vector<lite::tensor::Tensor *> maxpool_outputs = {&out_tensor}; | |||
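| // Unlike the average-pool case, the max-pool gradient kernel is fed three tensors; | |||
| // presumably the extra forward-pass tensor lets it recover the arg-max positions so | |||
| // that each dy value is routed back to the winning input location. | |||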
| // ---------------------------------------- | |||
| kernel::KernelKey maxpool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto maxpool_creator = lite::KernelRegistry::GetInstance()->GetCreator(maxpool_desc); | |||
| auto kernel = maxpool_creator(maxpool_inputs, maxpool_outputs, reinterpret_cast<OpParameter *>(maxpool), NULL, | |||
| maxpool_desc, nullptr); | |||
| kernel->Init(); | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| kernel->Run(); | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| printf("single thread running time : %ld ms\n", time_end - time_start); | |||
| std::string output_path = "./test_data/pooling/maxpoolgradfp32_1_xgrad_3_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(out_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete[] x_data; | |||
| delete[] y_data; | |||
| delete[] yt_data; | |||
| delete[] out_data; | |||
| x_tensor.SetData(nullptr); | |||
| y_tensor.SetData(nullptr); | |||
| yt_tensor.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "MaxPoolGradBatchFp32 Filter Grad passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, MaxPoolGradStride2Fp32) { | |||
| // prepare stage | |||
| // input size will be equal to the original size of x; output size will be the output size as in the forward pass | |||
| auto maxpool = new PoolingParameter(); | |||
| InitPoolingParamFP32(maxpool); | |||
| maxpool->output_channel_ = 3; | |||
| maxpool->input_channel_ = 3; | |||
| maxpool->pool_mode_ = PoolMode_MaxPool; | |||
| maxpool->input_batch_ = 3; | |||
| maxpool->output_batch_ = 3; | |||
| maxpool->output_h_ = 14; | |||
| maxpool->output_w_ = 14; | |||
| maxpool->stride_h_ = 2; | |||
| maxpool->stride_w_ = 2; | |||
| size_t input_size; | |||
| size_t y_data_size = maxpool->output_batch_ * maxpool->output_channel_ * maxpool->input_h_ * maxpool->input_w_; | |||
| auto x_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s2_x_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_x({maxpool->output_batch_, maxpool->input_h_, maxpool->input_w_, maxpool->input_channel_}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| auto y_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s2_dx_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_y({maxpool->output_batch_, maxpool->output_h_, maxpool->output_w_, maxpool->output_channel_}); | |||
| lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| y_tensor.SetData(y_data); | |||
| auto yt_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s2_dy_3_28_28_3.bin", &input_size)); | |||
| lite::tensor::Tensor yt_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| yt_tensor.SetData(yt_data); | |||
| auto out_data = new float[y_data_size]; | |||
| lite::tensor::Tensor out_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| out_tensor.SetData(out_data); | |||
| std::vector<lite::tensor::Tensor *> maxpool_inputs = {&x_tensor, &y_tensor, &yt_tensor}; | |||
| std::vector<lite::tensor::Tensor *> maxpool_outputs = {&out_tensor}; | |||
| // ---------------------------------------- | |||
| kernel::KernelKey maxpool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto maxpool_creator = lite::KernelRegistry::GetInstance()->GetCreator(maxpool_desc); | |||
| auto kernel = maxpool_creator(maxpool_inputs, maxpool_outputs, reinterpret_cast<OpParameter *>(maxpool), NULL, | |||
| maxpool_desc, nullptr); | |||
| kernel->Init(); | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| kernel->Run(); | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| printf("single thread running time : %ld ms\n", time_end - time_start); | |||
| std::string output_path = "./test_data/pooling/maxpoolgradfp32_s2_xgrad_3_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(out_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete[] x_data; | |||
| delete[] y_data; | |||
| delete[] yt_data; | |||
| delete[] out_data; | |||
| x_tensor.SetData(nullptr); | |||
| y_tensor.SetData(nullptr); | |||
| yt_tensor.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "MaxPoolGradStride2Fp32 Filter Grad passed"; | |||
| } | |||
| TEST_F(TestPoolingGradFp32, MaxPoolGradStride3Fp32) { | |||
| // prepare stage | |||
| // input size will be equal to the original size of x; output size will be the output size as in the forward pass | |||
| auto maxpool = new PoolingParameter(); | |||
| InitPoolingParamFP32(maxpool); | |||
| maxpool->output_channel_ = 3; | |||
| maxpool->input_channel_ = 3; | |||
| maxpool->pool_mode_ = PoolMode_MaxPool; | |||
| maxpool->input_batch_ = 3; | |||
| maxpool->output_batch_ = 3; | |||
| maxpool->output_h_ = 10; | |||
| maxpool->output_w_ = 10; | |||
| maxpool->stride_h_ = 3; | |||
| maxpool->stride_w_ = 3; | |||
| size_t input_size; | |||
| size_t y_data_size = maxpool->output_batch_ * maxpool->output_channel_ * maxpool->input_h_ * maxpool->input_w_; | |||
| auto x_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s3_x_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_x({maxpool->output_batch_, maxpool->input_h_, maxpool->input_w_, maxpool->input_channel_}); | |||
| lite::tensor::Tensor x_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| x_tensor.SetData(x_data); | |||
| auto y_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s3_dx_3_28_28_3.bin", &input_size)); | |||
| std::vector<int> dim_y({maxpool->output_batch_, maxpool->output_h_, maxpool->output_w_, maxpool->output_channel_}); | |||
| lite::tensor::Tensor y_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| y_tensor.SetData(y_data); | |||
| auto yt_data = reinterpret_cast<float *>( | |||
| mindspore::lite::ReadFile("./test_data/pooling/maxpoolgradfp32_s3_dy_3_28_28_3.bin", &input_size)); | |||
| lite::tensor::Tensor yt_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| yt_tensor.SetData(yt_data); | |||
| auto out_data = new float[y_data_size]; | |||
| lite::tensor::Tensor out_tensor(TypeId::kNumberTypeFloat32, dim_x); | |||
| out_tensor.SetData(out_data); | |||
| std::vector<lite::tensor::Tensor *> maxpool_inputs = {&x_tensor, &y_tensor, &yt_tensor}; | |||
| std::vector<lite::tensor::Tensor *> maxpool_outputs = {&out_tensor}; | |||
| // ---------------------------------------- | |||
| kernel::KernelKey maxpool_desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_PoolingGrad}; | |||
| auto maxpool_creator = lite::KernelRegistry::GetInstance()->GetCreator(maxpool_desc); | |||
| auto kernel = maxpool_creator(maxpool_inputs, maxpool_outputs, reinterpret_cast<OpParameter *>(maxpool), NULL, | |||
| maxpool_desc, nullptr); | |||
| kernel->Init(); | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| kernel->Run(); | |||
| auto time_end = mindspore::lite::GetTimeUs(); | |||
| printf("single thread running time : %ld ms\n", time_end - time_start); | |||
| std::string output_path = "./test_data/pooling/maxpoolgradfp32_s3_xgrad_3_28_28_3.bin"; | |||
| auto res = lite::CompareRelativeOutput(out_data, output_path); | |||
| EXPECT_EQ(res, 0); | |||
| delete[] x_data; | |||
| delete[] y_data; | |||
| delete[] yt_data; | |||
| delete[] out_data; | |||
| x_tensor.SetData(nullptr); | |||
| y_tensor.SetData(nullptr); | |||
| yt_tensor.SetData(nullptr); | |||
| out_tensor.SetData(nullptr); | |||
| delete kernel; | |||
| MS_LOG(INFO) << "MaxPoolGradStride3Fp32 Filter Grad passed"; | |||
| } | |||
| } // namespace mindspore | |||
| @@ -40,7 +40,7 @@ TEST_F(TestSoftmaxCrossEntropyFp32, SoftmaxCrossEntropyFp32) { | |||
| y_tensor.SetData(input_data); | |||
| std::string label_path = "./test_data/operators/sce_fp32_1_l_6.bin"; | |||
| auto ll_labels = reinterpret_cast<int64 *>(mindspore::lite::ReadFile(label_path.c_str(), &input_size)); | |||
| auto ll_labels = reinterpret_cast<int64_t *>(mindspore::lite::ReadFile(label_path.c_str(), &input_size)); | |||
| auto labels = new int[6]; | |||
| for (int i = 0; i < 6; i++) labels[i] = static_cast<int>(ll_labels[i]); | |||
| @@ -57,7 +57,7 @@ TEST_F(TestSoftmaxCrossEntropyFp32, SoftmaxCrossEntropyFp32) { | |||
| auto grad = new float[24]; | |||
| lite::tensor::Tensor grad_tensor(TypeId::kNumberTypeFloat32, dim_y); | |||
| grad_tensor.SetData(grad); | |||
| std::vector<lite::tensor::Tensor *> outputs = {&grad_tensor, &loss_tensor}; | |||
| std::vector<lite::tensor::Tensor *> outputs = {&loss_tensor, &grad_tensor}; | |||
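| // Presumably the SoftmaxCrossEntropy kernel writes the scalar loss to output 0 and the | |||
| // gradient to output 1, hence the reordering above. | |||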
| kernel::KernelKey desc = {kernel::kCPU, TypeId::kNumberTypeFloat32, schema::PrimitiveType_SoftmaxCrossEntropy}; | |||
| auto creator = lite::KernelRegistry::GetInstance()->GetCreator(desc); | |||
| (several new binary .bin reference-data files are added for the tests above; their raw float contents are not reproduced here) | |||