| @@ -20,7 +20,6 @@ | |||
| #include <tuple> | |||
| #include <unordered_map> | |||
| #include "include/lite_session.h" | |||
| #include "include/train_model.h" | |||
| namespace mindspore { | |||
| namespace session { | |||
| @@ -33,19 +32,23 @@ class TrainSession : public session::LiteSession { | |||
| /// \brief Static method to create a TrainSession object | |||
| /// | |||
| /// \param[in] model_buf A buffer that was read from a MS model file | |||
| /// \param[in] size Length of the buffer | |||
| /// \param[in] context Defines the context of the session to be created | |||
| /// \param[in] train_mode Training mode to initialize the session with | |||
| /// | |||
| /// \return Pointer of MindSpore Lite TrainSession | |||
| static TrainSession *CreateSession(lite::Context *context); | |||
| static TrainSession *CreateSession(const char *model_buf, size_t size, lite::Context *context, | |||
| bool train_mode = false); | |||
| /// \brief Compile MindSpore Lite train model | |||
| /// | |||
| /// \note CompileTrainGraph should be called before RunGraph | |||
| /// \brief Static method to create a TrainSession object | |||
| /// | |||
| /// \param[in] model Define the model to be compiled | |||
| /// \param[in] filename Filename to read flatbuffer from | |||
| /// \param[in] context Defines the context of the session to be created | |||
| /// \param[in] train_mode Training mode to initialize the session with | |||
| /// | |||
| /// \return STATUS as an error code of compiling graph, STATUS is defined in errorcode.h | |||
| virtual int CompileTrainGraph(lite::TrainModel *model) = 0; | |||
| /// \return Pointer of MindSpore Lite TrainSession | |||
| static TrainSession *CreateSession(const std::string &filename, lite::Context *context, bool train_mode = false); | |||
| /// \brief Export the trained model into a buffer | |||
| /// | |||
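| A minimal usage sketch of the two new factory overloads above (the model filename and the explicit train_mode flag are illustrative, not taken from this patch): | |||
| lite::Context context; | |||
| // Load, import and compile directly from a .ms file; pass true to start in training mode. | |||
| auto *session = session::TrainSession::CreateSession("lenet_tod.ms", &context, true); | |||
| if (session == nullptr) { | |||
| // nullptr covers a missing file, a bad flatbuffer, or a graph-compilation failure. | |||
| return; | |||
| } | |||
| // ... fill input tensors, call session->RunGraph(), read outputs, then delete session ... | |||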
| @@ -30,6 +30,7 @@ typedef struct SoftmaxCrossEntropyParameter { | |||
| unsigned int number_of_classes_; | |||
| int n_dim_; | |||
| int input_shape_[5]; | |||
| int is_grad; | |||
| } SoftmaxCrossEntropyParameter; | |||
| void SoftmaxGrad(const float *input_ptr, const float *yt_ptr, float *output_ptr, float *sum_data, float *sum_mul, | |||
| @@ -253,6 +253,7 @@ union PrimitiveType { | |||
| All, | |||
| Assert, | |||
| Adder, | |||
| SparseSoftmaxCrossEntropy | |||
| } | |||
| enum QuantType: int { | |||
| @@ -301,12 +301,14 @@ table BatchNorm { | |||
| } | |||
| table BiasGrad { | |||
| axis: [int]; | |||
| } | |||
| table SoftmaxCrossEntropy { | |||
| axis: [int]; | |||
| } | |||
| table SparseSoftmaxCrossEntropy { | |||
| isGrad: int; | |||
| } | |||
| table make_tuple { | |||
| @@ -23,9 +23,6 @@ | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| std::vector<int> BiasGrad::GetAxis() const { return this->primitive_->value.AsBiasGrad()->axis; } | |||
| void BiasGrad::SetAxis(const std::vector<int> &axis) { this->primitive_->value.AsBiasGrad()->axis = axis; } | |||
| int BiasGrad::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) { | |||
| if (this->primitive_ == nullptr) { | |||
| this->primitive_ = new (std::nothrow) schema::PrimitiveT; | |||
| @@ -45,11 +42,11 @@ int BiasGrad::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &i | |||
| MS_LOG(ERROR) << "new primitiveT value failed"; | |||
| return RET_ERROR; | |||
| } | |||
| if (prim.GetAttr("axis") == nullptr) { | |||
| MS_LOG(WARNING) << "get axis failed"; | |||
| attr->axis = {0}; | |||
| } else { | |||
| attr->axis = CastToInt(prim.GetAttr("axis")); | |||
| this->primitive_->value.value = attr; | |||
| if (this->primitive_->value.value == nullptr) { | |||
| MS_LOG(ERROR) << "primitive value is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| this->primitive_->value.value = attr; | |||
| } | |||
| @@ -64,21 +61,12 @@ int BiasGrad::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffer | |||
| MS_LOG(ERROR) << "value_as_BiasGrad return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| std::vector<int32_t> axis; | |||
| if (attr->axis() != nullptr) { | |||
| for (int i = 0; i < static_cast<int>(attr->axis()->size()); i++) { | |||
| axis.push_back(attr->axis()->data()[i]); | |||
| } | |||
| } | |||
| auto val_offset = schema::CreateBiasGradDirect(*fbb, &axis); | |||
| auto val_offset = schema::CreateBiasGrad(*fbb); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_BiasGrad, val_offset.o); | |||
| fbb->Finish(prim_offset); | |||
| return RET_OK; | |||
| } | |||
| std::vector<int> BiasGrad::GetAxis() const { | |||
| auto fb_vector = this->primitive_->value_as_BiasGrad()->axis(); | |||
| return std::vector<int>(fb_vector->begin(), fb_vector->end()); | |||
| } | |||
| PrimitiveC *BiasGradCreator(const schema::Primitive *primitive) { | |||
| return PrimitiveC::NewPrimitiveC<BiasGrad>(primitive); | |||
| @@ -82,7 +82,8 @@ void ConvertConvWeight(const ParameterPtr &param_node) { | |||
| auto weight = std::dynamic_pointer_cast<ParamValueLite>(param); | |||
| MS_ASSERT(weight != nullptr); | |||
| std::unique_ptr<T> buf(new (std::nothrow) T[weight->tensor_shape_size()]); | |||
| std::unique_ptr<T[]> buf(new (std::nothrow) T[weight->tensor_shape_size()]); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "new buf failed"; | |||
| return; | |||
| @@ -150,9 +151,13 @@ void Conv2D::PopulaterConv2DMultiGroup(const Primitive &prim, schema::PrimitiveT | |||
| attr->padRight = pad_list[3]; | |||
| auto dilation = CastToInt(prim.GetAttr("dilation")); | |||
| #ifdef SUPPORT_TRAIN | |||
| attr->dilateH = dilation[2]; | |||
| attr->dilateW = dilation[3]; | |||
| #else | |||
| attr->dilateH = dilation[0]; | |||
| attr->dilateW = dilation[1]; | |||
| #endif | |||
| auto kernel_size = CastToInt(prim.GetAttr("kernel_size")); | |||
| attr->kernelH = kernel_size[0]; | |||
| attr->kernelW = kernel_size[1]; | |||
| @@ -110,8 +110,8 @@ int Conv2DGradFilter::UnPackAttr(const Primitive &prim, const std::vector<AnfNod | |||
| attr->padRight = pad_list[3]; | |||
| auto dilation = CastToInt(prim.GetAttr("dilation")); | |||
| attr->dilateH = dilation[0]; | |||
| attr->dilateW = dilation[1]; | |||
| attr->dilateH = dilation[2]; | |||
| attr->dilateW = dilation[3]; | |||
| auto kernel_size = CastToInt(prim.GetAttr("kernel_size")); | |||
| attr->kernelH = kernel_size[0]; | |||
| @@ -111,8 +111,8 @@ int Conv2DGradInput::UnPackAttr(const Primitive &prim, const std::vector<AnfNode | |||
| attr->padRight = pad_list[3]; | |||
| auto dilation = CastToInt(prim.GetAttr("dilation")); | |||
| attr->dilateH = dilation[0]; | |||
| attr->dilateW = dilation[1]; | |||
| attr->dilateH = dilation[2]; | |||
| attr->dilateW = dilation[3]; | |||
| auto kernel_size = CastToInt(prim.GetAttr("kernel_size")); | |||
| attr->kernelH = kernel_size[0]; | |||
| @@ -76,7 +76,7 @@ void ConvertConvWeight(const ParameterPtr &param_node) { | |||
| auto weight = std::dynamic_pointer_cast<ParamValueLite>(param); | |||
| MS_ASSERT(weight != nullptr); | |||
| std::unique_ptr<T> buf(new (std::nothrow) T[weight->tensor_shape_size()]); | |||
| std::unique_ptr<T[]> buf(new (std::nothrow) T[weight->tensor_shape_size()]); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "new buf failed"; | |||
| return; | |||
| @@ -89,7 +89,6 @@ int Dropout::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *> out | |||
| output0->set_shape(input->shape()); | |||
| output0->set_data_type(input->data_type()); | |||
| output0->set_format(input->format()); | |||
| if (outputs_.size() > 1) { | |||
| auto output1 = outputs_[1]; | |||
| MS_ASSERT(output1 != nullptr); | |||
| @@ -97,7 +96,6 @@ int Dropout::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *> out | |||
| output1->set_data_type(input->data_type()); | |||
| output1->set_format(input->format()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -92,9 +92,7 @@ int DropoutGrad::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *> | |||
| output->set_shape(input->shape()); | |||
| output->set_data_type(input->data_type()); | |||
| output->set_format(input->format()); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -31,6 +31,13 @@ OpParameter *PopulateTileParameter(const mindspore::lite::PrimitiveC *primitive) | |||
| memset(tile_param, 0, sizeof(TileParameter)); | |||
| tile_param->op_parameter_.type_ = primitive->Type(); | |||
| auto param = reinterpret_cast<mindspore::lite::Tile *>(const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| #ifdef SUPPORT_TRAIN | |||
| auto multiples = param->GetMultiples(); | |||
| tile_param->in_dim_ = multiples.size(); | |||
| for (int i = 0; i < tile_param->in_dim_; ++i) { | |||
| tile_param->multiples_[i] = multiples[i]; | |||
| } | |||
| #else | |||
| auto dims = param->GetDims(); | |||
| auto multiples = param->GetMultiples(); | |||
| for (size_t i = 0; i < kDimension_4d; ++i) { | |||
| @@ -39,6 +46,7 @@ OpParameter *PopulateTileParameter(const mindspore::lite::PrimitiveC *primitive) | |||
| for (size_t i = 0; i < dims.size(); ++i) { | |||
| tile_param->multiples_[dims[i]] = multiples[i]; | |||
| } | |||
| #endif | |||
| return reinterpret_cast<OpParameter *>(tile_param); | |||
| } | |||
| @@ -161,6 +161,7 @@ | |||
| #include "src/ops/group_conv2d_grad_input.h" | |||
| #include "src/ops/power_grad.h" | |||
| #include "src/ops/softmax_cross_entropy.h" | |||
| #include "src/ops/sparse_softmax_cross_entropy.h" | |||
| #include "src/ops/bn_grad.h" | |||
| #include "src/ops/arithmetic_grad.h" | |||
| #include "src/ops/depend.h" | |||
| @@ -578,6 +579,8 @@ std::shared_ptr<PrimitiveC> PrimitiveC::Create(const Primitive &prim, const std: | |||
| #ifdef SUPPORT_TRAIN | |||
| } else if (op_type == "SoftmaxCrossEntropyWithLogits") { | |||
| return NewPrimitiveC<SoftmaxCrossEntropy>(prim, inputs, quantType); | |||
| } else if (op_type == "SparseSoftmaxCrossEntropyWithLogits") { | |||
| return NewPrimitiveC<SparseSoftmaxCrossEntropy>(prim, inputs, quantType); | |||
| } else if (op_type == "BiasAddGrad") { | |||
| return NewPrimitiveC<BiasGrad>(prim, inputs, quantType); | |||
| } else if (op_type == "ApplyMomentum") { | |||
| @@ -916,6 +919,8 @@ PrimitiveC *PrimitiveC::Create(mindspore::schema::PrimitiveT *primitive) { | |||
| return new (std::nothrow) ArithmeticGrad(primitive); | |||
| case schema::PrimitiveType_SoftmaxCrossEntropy: | |||
| return new (std::nothrow) SoftmaxCrossEntropy(primitive); | |||
| case schema::PrimitiveType_SparseSoftmaxCrossEntropy: | |||
| return new (std::nothrow) SparseSoftmaxCrossEntropy(primitive); | |||
| case schema::PrimitiveType_PowerGrad: | |||
| return new (std::nothrow) PowerGrad(primitive); | |||
| case schema::PrimitiveType_Depend: | |||
| @@ -23,11 +23,6 @@ | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| std::vector<int> SoftmaxCrossEntropy::GetAxis() const { return this->primitive_->value.AsSoftmaxCrossEntropy()->axis; } | |||
| void SoftmaxCrossEntropy::SetAxis(const std::vector<int> &axis) { | |||
| this->primitive_->value.AsSoftmaxCrossEntropy()->axis = axis; | |||
| } | |||
| int SoftmaxCrossEntropy::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) { | |||
| if (this->primitive_ == nullptr) { | |||
| this->primitive_ = new (std::nothrow) schema::PrimitiveT; | |||
| @@ -48,7 +43,6 @@ int SoftmaxCrossEntropy::UnPackAttr(const Primitive &prim, const std::vector<Anf | |||
| return RET_ERROR; | |||
| } | |||
| attr->axis = {0}; | |||
| this->primitive_->value.value = attr; | |||
| if (this->primitive_->value.value == nullptr) { | |||
| MS_LOG(ERROR) << "primitive value is nullptr"; | |||
| @@ -59,10 +53,6 @@ int SoftmaxCrossEntropy::UnPackAttr(const Primitive &prim, const std::vector<Anf | |||
| } | |||
| #else | |||
| std::vector<int> SoftmaxCrossEntropy::GetAxis() const { | |||
| auto fb_vector = this->primitive_->value_as_SoftmaxCrossEntropy()->axis(); | |||
| return std::vector<int>(fb_vector->begin(), fb_vector->end()); | |||
| } | |||
| int SoftmaxCrossEntropy::UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| MS_ASSERT(nullptr != fbb); | |||
| @@ -71,13 +61,8 @@ int SoftmaxCrossEntropy::UnPackToFlatBuilder(const schema::Primitive *primitive, | |||
| MS_LOG(ERROR) << "value_as_SoftmaxCrossEntropy return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| std::vector<int32_t> axis; | |||
| if (attr->axis() != nullptr) { | |||
| for (int i = 0; i < static_cast<int>(attr->axis()->size()); i++) { | |||
| axis.push_back(attr->axis()->data()[i]); | |||
| } | |||
| } | |||
| auto val_offset = schema::CreateSoftmaxCrossEntropyDirect(*fbb, &axis); | |||
| auto val_offset = schema::CreateSoftmaxCrossEntropy(*fbb); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SoftmaxCrossEntropy, val_offset.o); | |||
| fbb->Finish(prim_offset); | |||
| return RET_OK; | |||
| @@ -100,6 +85,7 @@ int SoftmaxCrossEntropy::InferShape(std::vector<Tensor *> inputs, std::vector<Te | |||
| MS_ASSERT(out != nullptr); | |||
| std::vector<int> outshape; | |||
| outshape.push_back(in0->shape()[0]); | |||
| outshape.push_back(1); | |||
| out->set_shape(outshape); | |||
| out->set_data_type(in0->data_type()); | |||
| @@ -0,0 +1,120 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/ops/sparse_softmax_cross_entropy.h" | |||
| #ifndef PRIMITIVE_WRITEABLE | |||
| #include "src/ops/ops_register.h" | |||
| #endif | |||
| namespace mindspore { | |||
| namespace lite { | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| int SparseSoftmaxCrossEntropy::GetIsGrad() const { | |||
| return this->primitive_->value.AsSparseSoftmaxCrossEntropy()->isGrad; | |||
| } | |||
| void SparseSoftmaxCrossEntropy::SetIsGrad(int isGrad) { | |||
| this->primitive_->value.AsSparseSoftmaxCrossEntropy()->isGrad = isGrad; | |||
| } | |||
| int SparseSoftmaxCrossEntropy::UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) { | |||
| if (this->primitive_ == nullptr) { | |||
| this->primitive_ = new (std::nothrow) schema::PrimitiveT; | |||
| if (this->primitive_ == nullptr) { | |||
| MS_LOG(ERROR) << "new primitiveT failed"; | |||
| return RET_ERROR; | |||
| } | |||
| this->primitive_->value.type = schema::PrimitiveType_SparseSoftmaxCrossEntropy; | |||
| } | |||
| if (this->primitive_->value.type != schema::PrimitiveType_SparseSoftmaxCrossEntropy) { | |||
| MS_LOG(ERROR) << "Primitive type is error :" << this->primitive_->value.type; | |||
| return RET_ERROR; | |||
| } | |||
| if (this->primitive_->value.value == nullptr) { | |||
| auto attr = new (std::nothrow) schema::SparseSoftmaxCrossEntropyT(); | |||
| if (attr == nullptr) { | |||
| MS_LOG(ERROR) << "new primitiveT value failed"; | |||
| return RET_ERROR; | |||
| } | |||
| attr->isGrad = GetValue<bool>(prim.GetAttr("is_grad")); | |||
| this->primitive_->value.value = attr; | |||
| if (this->primitive_->value.value == nullptr) { | |||
| MS_LOG(ERROR) << "primitive value is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| #else | |||
| int SparseSoftmaxCrossEntropy::GetIsGrad() const { | |||
| return this->primitive_->value_as_SparseSoftmaxCrossEntropy()->isGrad(); | |||
| } | |||
| int SparseSoftmaxCrossEntropy::UnPackToFlatBuilder(const schema::Primitive *primitive, | |||
| flatbuffers::FlatBufferBuilder *fbb) { | |||
| MS_ASSERT(nullptr != primitive); | |||
| MS_ASSERT(nullptr != fbb); | |||
| auto attr = primitive->value_as_SparseSoftmaxCrossEntropy(); | |||
| if (attr == nullptr) { | |||
| MS_LOG(ERROR) << "value_as_SparseSoftmaxCrossEntropy return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| auto val_offset = schema::CreateSparseSoftmaxCrossEntropy(*fbb, attr->isGrad()); | |||
| auto prim_offset = schema::CreatePrimitive(*fbb, schema::PrimitiveType_SparseSoftmaxCrossEntropy, val_offset.o); | |||
| fbb->Finish(prim_offset); | |||
| return RET_OK; | |||
| } | |||
| PrimitiveC *SparseSoftmaxCrossEntropyCreator(const schema::Primitive *primitive) { | |||
| return PrimitiveC::NewPrimitiveC<SparseSoftmaxCrossEntropy>(primitive); | |||
| } | |||
| Registry SparseSoftmaxCrossEntropyRegistry(schema::PrimitiveType_SparseSoftmaxCrossEntropy, | |||
| SparseSoftmaxCrossEntropyCreator); | |||
| #endif | |||
| int SparseSoftmaxCrossEntropy::InferShape(std::vector<Tensor *> inputs, std::vector<Tensor *> outputs) { | |||
| if (2 != inputs.size()) { | |||
| MS_LOG(ERROR) << "SparseSoftmaxCrossEntropy should have at two inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| if (1 != outputs.size()) { | |||
| MS_LOG(ERROR) << "SparseSoftmaxCrossEntropy should have one output"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = inputs.front(); | |||
| MS_ASSERT(in0 != nullptr); | |||
| auto *out = outputs.front(); | |||
| MS_ASSERT(out != nullptr); | |||
| if (GetIsGrad() != 0) { | |||
| out->set_shape(in0->shape()); | |||
| out->set_data_type(in0->data_type()); | |||
| out->set_format(in0->format()); | |||
| } else { | |||
| std::vector<int> outshape; | |||
| outshape.push_back(1); | |||
| out->set_shape(outshape); | |||
| out->set_data_type(in0->data_type()); | |||
| out->set_format(in0->format()); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,48 @@ | |||
| /** | |||
| * Copyright 2019-2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_OPS_SPARSE_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #define MINDSPORE_LITE_SRC_OPS_SPARSE_SOFTMAX_CROSS_ENTROPY_H_ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <cmath> | |||
| #include <memory> | |||
| #include "src/ops/primitive_c.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| class SparseSoftmaxCrossEntropy : public PrimitiveC { | |||
| public: | |||
| SparseSoftmaxCrossEntropy() = default; | |||
| ~SparseSoftmaxCrossEntropy() = default; | |||
| #ifdef PRIMITIVE_WRITEABLE | |||
| MS_DECLARE_PARENT(SparseSoftmaxCrossEntropy, PrimitiveC); | |||
| explicit SparseSoftmaxCrossEntropy(schema::PrimitiveT *primitive) : PrimitiveC(primitive) {} | |||
| void SetIsGrad(int isGrad); | |||
| int UnPackAttr(const Primitive &prim, const std::vector<AnfNodePtr> &inputs) override; | |||
| #else | |||
| int UnPackToFlatBuilder(const schema::Primitive *primitive, flatbuffers::FlatBufferBuilder *fbb) override; | |||
| #endif | |||
| int InferShape(std::vector<lite::Tensor *> inputs_, std::vector<lite::Tensor *> outputs_) override; | |||
| int GetIsGrad() const; | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_SRC_OPS_SPARSE_SOFTMAX_CROSS_ENTROPY_H_ | |||
| @@ -140,6 +140,21 @@ int Tile::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *> output | |||
| std::vector<int> out_shape; | |||
| std::vector<int> multiples = GetMultiples(); | |||
| #ifdef SUPPORT_TRAIN | |||
| const size_t in_dims = input->shape().size(); | |||
| const size_t delta_dims = in_dims - multiples.size(); | |||
| size_t i = 0; | |||
| for (; i < delta_dims; ++i) { | |||
| int tmp = input->shape().at(i); | |||
| out_shape.push_back(tmp); | |||
| } | |||
| for (; i < in_dims; ++i) { | |||
| int tmp = input->shape().at(i) * (multiples[i - delta_dims]); | |||
| out_shape.push_back(tmp); | |||
| } | |||
| #else | |||
| std::vector<int> dims = GetDims(); | |||
| const size_t in_dims = input->shape().size(); | |||
| @@ -150,7 +165,7 @@ int Tile::InferShape(std::vector<Tensor *> inputs_, std::vector<Tensor *> output | |||
| for (size_t i = 0; i < dims.size(); ++i) { | |||
| out_shape[dims[i]] = input->shape()[dims[i]] * (multiples[i]); | |||
| } | |||
| #endif | |||
| output->set_shape(out_shape); | |||
| return RET_OK; | |||
| } | |||
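| A standalone sketch of the train-mode shape rule introduced above (helper name and example shapes are hypothetical): when multiples has fewer entries than the input rank, the leading delta_dims dimensions pass through and only the trailing ones are scaled. | |||
| #include <vector> | |||
| std::vector<int> TileOutShapeSketch(const std::vector<int> &in_shape, const std::vector<int> &multiples) { | |||
| const size_t delta_dims = in_shape.size() - multiples.size(); | |||
| std::vector<int> out_shape; | |||
| for (size_t i = 0; i < in_shape.size(); ++i) { | |||
| // leading dims are copied; trailing dims are multiplied by the matching multiple | |||
| out_shape.push_back(i < delta_dims ? in_shape[i] : in_shape[i] * multiples[i - delta_dims]); | |||
| } | |||
| return out_shape;  // e.g. {2, 3, 4} with multiples {5, 6} -> {2, 15, 24} | |||
| } | |||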
| @@ -45,22 +45,28 @@ int AdamCPUKernel::Execute(int task_id) { | |||
| auto eps = reinterpret_cast<float *>(in_tensors_[8]->MutableData())[0]; | |||
| auto gradient = reinterpret_cast<float *>(in_tensors_[9]->MutableData()); | |||
| size_t elem_num = in_tensors_[0]->ElementsNum(); | |||
| if (fabs(1 - beta1_power) <= 0.0f) { | |||
| MS_LOG(ERROR) << "divisor cannot be 0"; | |||
| if ((1.f - beta1_power) <= 0.0f) { | |||
| MS_LOG(ERROR) << "divisor cannot be 0 or below"; | |||
| return RET_ERROR; | |||
| } | |||
| if ((1.f - beta2_power) < 0.0f) { | |||
| MS_LOG(ERROR) << "sqrt cannot be negative"; | |||
| return RET_ERROR; | |||
| } | |||
| auto update_lr = learning_rate * std::sqrt(1 - beta2_power) / (1 - beta1_power); | |||
| auto update_lr = learning_rate * std::sqrt(1.f - beta2_power) / (1.f - beta1_power); | |||
| if (adam_param_->use_nesterov_) { // Nadam | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| m[i] += (gradient[i] - m[i]) * (1 - beta1); | |||
| v[i] += (gradient[i] * gradient[i] - v[i]) * (1 - beta2); | |||
| weight[i] -= update_lr * (m[i] * beta1 + (1 - beta1) * gradient[i]) / (std::sqrt(v[i]) + eps); | |||
| m[i] += (gradient[i] - m[i]) * (1.f - beta1); | |||
| v[i] += (gradient[i] * gradient[i] - v[i]) * (1.f - beta2); | |||
| weight[i] -= update_lr * (m[i] * beta1 + (1.f - beta1) * gradient[i]) / (std::sqrt(v[i]) + eps); | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| m[i] += (gradient[i] - m[i]) * (1 - beta1); | |||
| v[i] += (gradient[i] * gradient[i] - v[i]) * (1 - beta2); | |||
| m[i] += (gradient[i] - m[i]) * (1.f - beta1); | |||
| v[i] += (gradient[i] * gradient[i] - v[i]) * (1.f - beta2); | |||
| weight[i] -= update_lr * m[i] / (std::sqrt(v[i]) + eps); | |||
| } | |||
| } | |||
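| For reference, a scalar sketch of the per-element update the loops above implement (symbols follow the tensor names in the kernel; commentary only, not the kernel code): | |||
| // update_lr = learning_rate * sqrt(1 - beta2_power) / (1 - beta1_power) | |||
| // m <- m + (g - m) * (1 - beta1)       // equivalent to beta1*m + (1 - beta1)*g | |||
| // v <- v + (g*g - v) * (1 - beta2)     // equivalent to beta2*v + (1 - beta2)*g*g | |||
| // plain Adam:     w <- w - update_lr * m / (sqrt(v) + eps) | |||
| // Nadam branch:   w <- w - update_lr * (beta1*m + (1 - beta1)*g) / (sqrt(v) + eps) | |||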
| @@ -115,7 +115,6 @@ kernel::LiteKernel *CpuDropoutFp32KernelCreator(const std::vector<lite::Tensor * | |||
| auto *kernel = new (std::nothrow) DropoutCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "Dropout new kernel failed."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| @@ -102,7 +102,6 @@ kernel::LiteKernel *CpuDropoutGradFp32KernelCreator(const std::vector<lite::Tens | |||
| auto *kernel = new (std::nothrow) DropoutGradCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "DropoutGrad new kernel failed."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| @@ -39,16 +39,37 @@ int SgdCPUKernel::Execute(int task_id) { | |||
| auto gradient = reinterpret_cast<float *>(in_tensors_[1]->MutableData()); | |||
| float moment = reinterpret_cast<float *>(in_tensors_[4]->MutableData())[0]; | |||
| size_t elem_num = in_tensors_[0]->ElementsNum(); | |||
| if (sgd_param_->use_nesterov_) { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i]; | |||
| weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate; | |||
| auto stat = reinterpret_cast<float *>(in_tensors_[5]->MutableData()); | |||
| if (stat[0] > 0) { | |||
| stat[0] = 0; | |||
| memcpy(accumulate, gradient, elem_num * sizeof(float)); | |||
| if (sgd_param_->use_nesterov_) { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - sgd_param_->dampening_); | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| if (moment > 0.f) { | |||
| if (sgd_param_->use_nesterov_) { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - sgd_param_->dampening_); | |||
| weight[i] -= (accumulate[i] * moment + gradient[i]) * learning_rate; | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| accumulate[i] = accumulate[i] * moment + gradient[i] * (1.f - sgd_param_->dampening_); | |||
| weight[i] -= accumulate[i] * learning_rate; | |||
| } | |||
| } | |||
| } else { | |||
| for (size_t i = 0; i < elem_num; ++i) { | |||
| weight[i] -= gradient[i] * learning_rate; | |||
| } | |||
| } | |||
| } | |||
| return RET_OK; | |||
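| A scalar summary of the reworked SGD step above (assuming stat[0] > 0 flags the very first update, which seeds the accumulator with the raw gradient): | |||
| // first step:           a <- g;  then w <- w - a * lr   (or w - (a*moment + g) * lr with Nesterov) | |||
| // later, moment > 0:    a <- a*moment + g*(1 - dampening) | |||
| //                       w <- w - a * lr                  (or w - (a*moment + g) * lr with Nesterov) | |||
| // later, moment == 0:   w <- w - g * lr | |||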
| @@ -34,27 +34,29 @@ int SoftmaxCrossEntropyWithLogitsCPUKernel::ReSize() { return RET_OK; } | |||
| void SoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const float *labels, const float *logits, float *grads, | |||
| float *output2) const { | |||
| float eps = 1e-6; | |||
| float total_loss = 0.0; | |||
| if (grads != nullptr) { | |||
| for (int i = 0; i < param_->batch_size_; ++i) { | |||
| float loss = 0.f; | |||
| for (size_t j = 0; j < param_->number_of_classes_; ++j) { | |||
| float logit = | |||
| -logf(logits[i * param_->number_of_classes_ + j] <= 0.0 ? eps : logits[i * param_->number_of_classes_ + j]); | |||
| grads[i * param_->number_of_classes_ + j] = | |||
| (logits[i * param_->number_of_classes_ + j] - labels[i * param_->number_of_classes_ + j]); | |||
| total_loss += labels[i * param_->number_of_classes_ + j] * logit; | |||
| loss += labels[i * param_->number_of_classes_ + j] * logit; | |||
| } | |||
| output2[i] = loss; | |||
| } | |||
| } else { | |||
| for (int i = 0; i < param_->batch_size_; ++i) { | |||
| float loss = 0.f; | |||
| for (size_t j = 0; j < param_->number_of_classes_; ++j) { | |||
| float logit = | |||
| -logf(logits[i * param_->number_of_classes_ + j] <= 0.0 ? eps : logits[i * param_->number_of_classes_ + j]); | |||
| total_loss += labels[i * param_->number_of_classes_ + j] * logit; | |||
| loss += labels[i * param_->number_of_classes_ + j] * logit; | |||
| } | |||
| output2[i] = loss; | |||
| } | |||
| } | |||
| output2[0] = total_loss / param_->batch_size_; | |||
| } | |||
| int SoftmaxCrossEntropyWithLogitsCPUKernel::Execute(int task_id) { | |||
| @@ -25,7 +25,7 @@ | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::schema::PrimitiveType_SoftmaxCrossEntropy; | |||
| using mindspore::schema::PrimitiveType_SparseSoftmaxCrossEntropy; | |||
| namespace mindspore::kernel { | |||
| @@ -51,10 +51,9 @@ int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::ForwardPostExecute(const int * | |||
| return RET_OK; | |||
| } | |||
| int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses, float *grads, | |||
| float *output) const { | |||
| int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *labels, const float *losses, | |||
| float *grads) const { | |||
| size_t row_start = 0; | |||
| float total_loss = 0; | |||
| for (int i = 0; i < param->batch_size_; ++i) { | |||
| if (labels[i] < 0) { | |||
| MS_LOG(ERROR) << "label value must >= 0"; | |||
| @@ -65,7 +64,6 @@ int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *lab | |||
| MS_LOG(ERROR) << "error label input!"; | |||
| return RET_ERROR; | |||
| } else { | |||
| total_loss -= logf(losses[i * param->number_of_classes_ + label]); | |||
| for (size_t j = 0; j < param->number_of_classes_; ++j) { | |||
| size_t index = row_start + j; | |||
| if (j == label) { | |||
| @@ -77,18 +75,14 @@ int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::GradPostExecute(const int *lab | |||
| } | |||
| row_start += param->number_of_classes_; | |||
| } | |||
| output[0] = total_loss / param->batch_size_; | |||
| return RET_OK; | |||
| } | |||
| int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Execute(int task_id) { | |||
| auto sce_param = reinterpret_cast<SoftmaxCrossEntropyParameter *>(op_parameter_); | |||
| auto ins = reinterpret_cast<float *>(in_tensors_.at(0)->data_c()); | |||
| auto labels = reinterpret_cast<int *>(in_tensors_.at(1)->data_c()); | |||
| float *out = reinterpret_cast<float *>(out_tensors_.at(0)->data_c()); | |||
| float *grads = nullptr; | |||
| if (IsTrain() && out_tensors_.size() > 1) { | |||
| grads = reinterpret_cast<float *>(out_tensors_.at(1)->MutableData()); | |||
| } | |||
| size_t data_size = in_tensors_.at(0)->ElementsNum(); | |||
| MS_ASSERT(out != nullptr); | |||
| MS_ASSERT(labels != nullptr); | |||
| @@ -99,8 +93,8 @@ int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Execute(int task_id) { | |||
| std::fill(losses_, losses_ + data_size, 0.f); | |||
| std::fill(sum_data_, sum_data_ + sm_params_.input_shape_[0], 0.f); | |||
| Softmax(ins, losses_, sum_data_, &sm_params_); | |||
| if (IsTrain()) { | |||
| GradPostExecute(labels, losses_, grads, out); | |||
| if (sce_param->is_grad) { | |||
| GradPostExecute(labels, losses_, out); | |||
| } else { | |||
| ForwardPostExecute(labels, losses_, out); | |||
| } | |||
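| For context, the forward (is_grad == 0) path above reduces to standard sparse cross-entropy: Softmax turns the logits into per-class probabilities and ForwardPostExecute is assumed to average the negative log-probability of each sample's labelled class, i.e. | |||
| // loss = (1 / batch_size) * sum_i( -log( p[i][ labels[i] ] ) ) | |||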
| @@ -133,12 +127,12 @@ int SparseSoftmaxCrossEntropyWithLogitsCPUKernel::Init() { | |||
| param->batch_size_ = dims[0]; | |||
| for (unsigned int i = 0; i < dims.size(); i++) param->input_shape_[i] = dims[i]; | |||
| if (2 != this->in_tensors_.size()) { | |||
| MS_LOG(ERROR) << "softmax entropy loss should have two inputs"; | |||
| MS_LOG(ERROR) << "sparse softmax entropy loss should have two inputs"; | |||
| return RET_ERROR; | |||
| } | |||
| auto *in0 = in_tensors_.front(); | |||
| if (in0 == nullptr) { | |||
| MS_LOG(ERROR) << "softmax etropy loss in0 have no data"; | |||
| MS_LOG(ERROR) << "sparse softmax etropy loss in0 have no data"; | |||
| return RET_ERROR; | |||
| } | |||
| size_t data_size = in_tensors_.at(0)->ElementsNum(); | |||
| @@ -155,7 +149,7 @@ kernel::LiteKernel *CpuSparseSoftmaxCrossEntropyFp32KernelCreator( | |||
| const std::vector<lite::Tensor *> &inputs, const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, const mindspore::lite::PrimitiveC *primitive) { | |||
| MS_ASSERT(opParameter != nullptr); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_SoftmaxCrossEntropy); | |||
| MS_ASSERT(desc.type == schema::PrimitiveType_SparseSoftmaxCrossEntropy); | |||
| auto *kernel = | |||
| new (std::nothrow) SparseSoftmaxCrossEntropyWithLogitsCPUKernel(opParameter, inputs, outputs, ctx, primitive); | |||
| if (kernel == nullptr) { | |||
| @@ -172,4 +166,6 @@ kernel::LiteKernel *CpuSparseSoftmaxCrossEntropyFp32KernelCreator( | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_SparseSoftmaxCrossEntropy, | |||
| CpuSparseSoftmaxCrossEntropyFp32KernelCreator) | |||
| } // namespace mindspore::kernel | |||
| @@ -38,7 +38,7 @@ class SparseSoftmaxCrossEntropyWithLogitsCPUKernel : public LossKernel { | |||
| ~SparseSoftmaxCrossEntropyWithLogitsCPUKernel() override {} | |||
| int ForwardPostExecute(const int *labels, const float *losses, float *output) const; | |||
| int GradPostExecute(const int *labels, const float *losses, float *grads, float *output) const; | |||
| int GradPostExecute(const int *labels, const float *losses, float *grads) const; | |||
| int Init() override; | |||
| int ReSize() override; | |||
| @@ -14,7 +14,7 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "src/ops/primitive_c.h" | |||
| #include "include/train_model.h" | |||
| #include "src/train/train_model.h" | |||
| #include "src/common/log_adapter.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/common/graph_util.h" | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_INCLUDE_TRAIN_MODEL_H_ | |||
| #define MINDSPORE_LITE_INCLUDE_TRAIN_MODEL_H_ | |||
| #ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_MODEL_H_ | |||
| #define MINDSPORE_LITE_SRC_TRAIN_TRAIN_MODEL_H_ | |||
| #include <vector> | |||
| #include "include/model.h" | |||
| @@ -50,4 +50,4 @@ struct TrainModel : public lite::Model { | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_INCLUDE_TRAIN_MODEL_H_ | |||
| #endif // MINDSPORE_LITE_SRC_TRAIN_TRAIN_MODEL_H_ | |||
| @@ -19,6 +19,7 @@ | |||
| #include "src/ops/pooling_grad.h" | |||
| #include "nnacl/pooling_parameter.h" | |||
| #include "src/ops/softmax_cross_entropy.h" | |||
| #include "src/ops/sparse_softmax_cross_entropy.h" | |||
| #include "nnacl/fp32_grad/softmax_grad.h" | |||
| #include "src/ops/activation_grad.h" | |||
| #include "nnacl/fp32/activation_fp32.h" | |||
| @@ -146,6 +147,26 @@ OpParameter *PopulateSgdParameter(const mindspore::lite::PrimitiveC *primitive) | |||
| return reinterpret_cast<OpParameter *>(p); | |||
| } | |||
| OpParameter *PopulateSparseSoftmaxCrossEntropyParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| return nullptr; | |||
| } | |||
| SoftmaxCrossEntropyParameter *sce_param = | |||
| reinterpret_cast<SoftmaxCrossEntropyParameter *>(malloc(sizeof(SoftmaxCrossEntropyParameter))); | |||
| if (sce_param == nullptr) { | |||
| MS_LOG(ERROR) << "malloc SoftmaxCrossEntropyParameter failed."; | |||
| return nullptr; | |||
| } | |||
| auto sce_primitive = reinterpret_cast<mindspore::lite::SparseSoftmaxCrossEntropy *>( | |||
| const_cast<mindspore::lite::PrimitiveC *>(primitive)); | |||
| sce_param->is_grad = sce_primitive->GetIsGrad(); | |||
| sce_param->op_parameter_.type_ = primitive->Type(); | |||
| return reinterpret_cast<OpParameter *>(sce_param); | |||
| } | |||
| OpParameter *PopulateSoftmaxCrossEntropyParameter(const mindspore::lite::PrimitiveC *primitive) { | |||
| if (primitive == nullptr) { | |||
| MS_LOG(ERROR) << "Primitive is nullptr when populating parameter for op."; | |||
| @@ -157,6 +178,7 @@ OpParameter *PopulateSoftmaxCrossEntropyParameter(const mindspore::lite::Primiti | |||
| MS_LOG(ERROR) << "malloc SoftmaxCrossEntropyParameter failed."; | |||
| return nullptr; | |||
| } | |||
| sce_param->is_grad = 0; | |||
| sce_param->op_parameter_.type_ = primitive->Type(); | |||
| return reinterpret_cast<OpParameter *>(sce_param); | |||
| } | |||
| @@ -468,6 +490,8 @@ void PopulateTrainParameters() { | |||
| lite::Registry BiasGradParameterRegistry(schema::PrimitiveType_BiasGrad, PopulateBiasGradParameter); | |||
| lite::Registry SoftmaxCrossEntropyParameterRegistry(schema::PrimitiveType_SoftmaxCrossEntropy, | |||
| PopulateSoftmaxCrossEntropyParameter); | |||
| lite::Registry SparseSoftmaxCrossEntropyParameterRegistry(schema::PrimitiveType_SparseSoftmaxCrossEntropy, | |||
| PopulateSparseSoftmaxCrossEntropyParameter); | |||
| lite::Registry ActivationParameterRegistry(schema::PrimitiveType_ActivationGrad, PopulateActivationGradParameter); | |||
| lite::Registry TupleGetItemParameterRegistry(schema::PrimitiveType_TupleGetItem, DefaultPopulateParameter); | |||
| lite::Registry DependParameterRegistry(schema::PrimitiveType_Depend, DefaultPopulateParameter); | |||
| @@ -21,8 +21,8 @@ | |||
| #include <vector> | |||
| #include <iostream> | |||
| #include <fstream> | |||
| #include <memory> | |||
| #include "include/errorcode.h" | |||
| #include "include/train_model.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/tensor.h" | |||
| #include "src/train/loss_kernel.h" | |||
| @@ -72,18 +72,9 @@ void TrainSession::RestoreOps(const std::vector<CreatorOp> &restore) { | |||
| void TrainSession::AllocWorkSpace() { | |||
| size_t workspace_size = 0; | |||
| for (auto ori_kernel : kernels_) { | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| if (workspace_size < ori_kernel->workspace_size()) { | |||
| workspace_size = ori_kernel->workspace_size(); | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| if (workspace_size < kernel->workspace_size()) { | |||
| workspace_size = kernel->workspace_size(); | |||
| } | |||
| } | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (workspace_size < kernel->workspace_size()) { | |||
| workspace_size = kernel->workspace_size(); | |||
| } | |||
| } | |||
| mindspore::kernel::LiteKernel::AllocWorkspace(workspace_size); | |||
| @@ -92,40 +83,27 @@ void TrainSession::AllocWorkSpace() { | |||
| int TrainSession::CompileGraph(lite::Model *model) { return lite::RET_ERROR; } | |||
| int TrainSession::CompileTrainGraph(mindspore::lite::TrainModel *model) { | |||
| if (model == nullptr) { | |||
| MS_LOG(ERROR) << "model is null"; | |||
| return RET_ERROR; | |||
| } | |||
| model_ = model; | |||
| auto restore = ReplaceOps(); | |||
| auto ret = lite::LiteSession::CompileGraph(model); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Compile train graph failed"; | |||
| MS_LOG(ERROR) << "failed to compile train model"; | |||
| return RET_ERROR; | |||
| } | |||
| orig_output_map_ = output_node_map_; | |||
| orig_output_node_map_ = output_node_map_; | |||
| orig_output_tensor_map_ = output_tensor_map_; | |||
| for (auto inTensor : inputs_) { | |||
| inTensor->MutableData(); | |||
| } | |||
| for (auto inTensor : inputs_) inTensor->MutableData(); | |||
| RestoreOps(restore); | |||
| CompileTrainKernels(); // Prepare a list of train kernels | |||
| CompileInferenceKernels(); // Prepare a list of eval kernels | |||
| CompileOptimizedKernels(); // Prepare a list of kernels which are optimized (weight update step) | |||
| CompileTrainOutputs(); // prepare outputs in train mode | |||
| CompileEvalOutputs(); // prepare outputs in eval mode | |||
| AllocWorkSpace(); | |||
| MarkOptimizedKernels(); | |||
| CompileTrainKernels(); | |||
| if (train_mode_) { | |||
| auto ret1 = Train(); | |||
| if (ret1 != RET_OK) { | |||
| MS_LOG(ERROR) << "faild to initialize network in train mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| auto ret1 = Eval(); | |||
| if (ret1 != RET_OK) { | |||
| MS_LOG(ERROR) << "faild to initialize network in eval mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| return ret; | |||
| return RET_OK; | |||
| } | |||
| TrainSession::~TrainSession() { | |||
| @@ -180,219 +158,144 @@ int TrainSession::SaveToFile(const std::string &filename) const { | |||
| } | |||
| int TrainSession::Train() { | |||
| for (auto ori_kernel : kernels_) { | |||
| MS_ASSERT(nullptr != ori_kernel); | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| auto ret = ori_kernel->Train(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << ori_kernel->name() << " failed to set train mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| MS_ASSERT(nullptr != sub_graph); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| auto ret = kernel->Train(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << kernel->name() << " failed to set train mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| } | |||
| } | |||
| output_node_map_.clear(); | |||
| output_tensor_map_.clear(); | |||
| // shift kernels to train mode | |||
| train_mode_ = true; | |||
| for (auto ori_kernel : kernels_) { | |||
| MS_ASSERT(nullptr != ori_kernel); | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| UpdateOutputMapByLossKernel(ori_kernel); | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| MS_ASSERT(nullptr != sub_graph); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| UpdateOutputMapByLossKernel(kernel); | |||
| } | |||
| for (auto kernel : this->train_kernels_) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| auto ret = kernel->Train(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << kernel->name() << " failed to set train mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| // set train outputs | |||
| output_node_map_ = train_output_node_map_; | |||
| output_tensor_map_ = train_output_tensor_map_; | |||
| return RET_OK; | |||
| } | |||
| void TrainSession::UpdateOutputMapByLossKernel(const kernel::LiteKernel *kernel) { | |||
| if (kernel != nullptr && IsLossKernel(kernel)) { | |||
| auto *ms_tensor = kernel->out_tensors().at(0); | |||
| if (ms_tensor != nullptr) { | |||
| (void)ms_tensor->MutableData(); | |||
| output_node_map_[kernel->name()].emplace_back(ms_tensor); | |||
| auto index = TSFindTensor(tensors_, ms_tensor); | |||
| if (index != tensors_.size()) { | |||
| output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor)); | |||
| } | |||
| int TrainSession::Eval() { | |||
| // shift kernels to eval mode | |||
| train_mode_ = false; | |||
| for (auto kernel : this->train_kernels_) { | |||
| MS_ASSERT(kernel != nullptr); | |||
| auto ret = kernel->Eval(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << kernel->name() << " failed to set eval mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| // set eval outputs | |||
| output_node_map_ = eval_output_node_map_; | |||
| output_tensor_map_ = eval_output_tensor_map_; | |||
| return RET_OK; | |||
| } | |||
| void TrainSession::UpdateOutputMapByInKernel(const kernel::LiteKernel *kernel) { | |||
| if (kernel != nullptr && IsLossKernel(kernel)) { | |||
| for (auto in_kernel : kernel->in_kernels()) { | |||
| if (output_node_map_.find(in_kernel->name()) == output_node_map_.end()) { | |||
| auto *ms_tensor = in_kernel->out_tensors().at(0); | |||
| if (ms_tensor != nullptr) { | |||
| output_node_map_[in_kernel->name()].emplace_back(ms_tensor); | |||
| auto index = TSFindTensor(tensors_, ms_tensor); | |||
| if (index != tensors_.size()) { | |||
| output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor)); | |||
| void TrainSession::CompileEvalOutputs() { | |||
| eval_output_node_map_.clear(); | |||
| eval_output_tensor_map_.clear(); | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (IsLossKernel(kernel)) { | |||
| for (auto in_kernel : kernel->in_kernels()) { | |||
| // insert if not already in | |||
| if (eval_output_node_map_.find(in_kernel->name()) == eval_output_node_map_.end()) { | |||
| auto *ms_tensor = in_kernel->out_tensors().at(0); | |||
| if (ms_tensor != nullptr) { | |||
| eval_output_node_map_[in_kernel->name()].emplace_back(ms_tensor); | |||
| auto index = TSFindTensor(tensors_, ms_tensor); | |||
| if (index != tensors_.size()) { | |||
| eval_output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor)); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| if (eval_output_node_map_.size() == 0) eval_output_node_map_ = orig_output_node_map_; | |||
| if (eval_output_tensor_map_.size() == 0) eval_output_tensor_map_ = orig_output_tensor_map_; | |||
| } | |||
| int TrainSession::Eval() { | |||
| for (auto ori_kernel : kernels_) { | |||
| MS_ASSERT(nullptr != ori_kernel); | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| auto ret = ori_kernel->Eval(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << ori_kernel->name() << " failed to set eval mode"; | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| MS_ASSERT(nullptr != sub_graph); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| MS_ASSERT(nullptr != kernel); | |||
| auto ret = kernel->Eval(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << kernel->name() << " failed to set eval mode"; | |||
| return RET_ERROR; | |||
| void TrainSession::CompileTrainOutputs() { | |||
| train_output_node_map_.clear(); | |||
| train_output_tensor_map_.clear(); | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (orig_output_node_map_.find(kernel->name()) == orig_output_node_map_.end()) continue; | |||
| // Mask out optimizer out tensors | |||
| if (IsMaskOutput(kernel)) continue; | |||
| // insert if not already in | |||
| if (train_output_node_map_.find(kernel->name()) == train_output_node_map_.end()) { | |||
| auto *ms_tensor = kernel->out_tensors().at(0); | |||
| if (ms_tensor != nullptr) { | |||
| train_output_node_map_[kernel->name()].emplace_back(ms_tensor); | |||
| auto index = TSFindTensor(tensors_, ms_tensor); | |||
| if (index != tensors_.size()) { | |||
| train_output_tensor_map_.insert(std::make_pair(std::to_string(index), ms_tensor)); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| output_node_map_ = orig_output_map_; | |||
| output_tensor_map_ = orig_output_tensor_map_; | |||
| train_mode_ = false; | |||
| for (auto ori_kernel : kernels_) { | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| UpdateOutputMapByInKernel(ori_kernel); | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| UpdateOutputMapByInKernel(kernel); | |||
| } | |||
| } | |||
| } | |||
| if (inference_kernels_.size() == 0) { | |||
| BuildInferenceKernelsMap(); | |||
| } | |||
| return RET_OK; | |||
| if (train_output_node_map_.size() == 0) train_output_node_map_ = orig_output_node_map_; | |||
| if (train_output_tensor_map_.size() == 0) train_output_tensor_map_ = orig_output_tensor_map_; | |||
| } | |||
| void TrainSession::BuildInferenceKernelsRecursive(kernel::LiteKernel *kernel, std::vector<kernel::LiteKernel *> *v) { | |||
| if (std::find(v->begin(), v->end(), kernel) == v->end()) { // kernel is not in vector | |||
| v->push_back(kernel); | |||
| if (std::find(v->begin(), v->end(), kernel) == v->end()) { // kernel is not already in vector | |||
| if (!IsLossKernel(kernel)) v->push_back(kernel); | |||
| for (auto in_node : kernel->in_kernels()) { | |||
| BuildInferenceKernelsRecursive(in_node, v); | |||
| } | |||
| } | |||
| } | |||
| void TrainSession::BuildInferenceKernelsMap() { | |||
| std::vector<kernel::LiteKernel *> req_kernels; | |||
| for (auto kernel : this->kernels_) { | |||
| if (kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| if (IsLossKernel(kernel)) { // For each loss in the system add backward tree | |||
| for (auto in_node : kernel->in_kernels()) { | |||
| BuildInferenceKernelsRecursive(in_node, &req_kernels); | |||
| } | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(kernel); | |||
| for (auto sub_kernel : sub_graph->nodes()) { | |||
| if (IsLossKernel(sub_kernel)) { // For each loss in the system add backward tree | |||
| for (auto in_node : sub_kernel->in_kernels()) { | |||
| BuildInferenceKernelsRecursive(in_node, &req_kernels); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| inference_kernels_.clear(); | |||
| void TrainSession::CompileTrainKernels() { | |||
| train_kernels_.clear(); | |||
| for (auto ori_kernel : kernels_) { | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| if (std::find(req_kernels.begin(), req_kernels.end(), ori_kernel) != req_kernels.end()) { | |||
| inference_kernels_.push_back(ori_kernel); | |||
| } | |||
| train_kernels_.push_back(ori_kernel); | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| if (std::find(req_kernels.begin(), req_kernels.end(), kernel) != req_kernels.end()) { | |||
| inference_kernels_.push_back(kernel); | |||
| } | |||
| train_kernels_.push_back(kernel); | |||
| } | |||
| } | |||
| } | |||
| if (inference_kernels_.size() == 0) { | |||
| inference_kernels_ = this->kernels_; | |||
| } | |||
| } | |||
| void TrainSession::CompileTrainKernels() { | |||
| train_kernels_.clear(); | |||
| for (auto ori_kernel : kernels_) { | |||
| if (ori_kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| train_kernels_.push_back(ori_kernel); | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(ori_kernel); | |||
| for (auto kernel : sub_graph->nodes()) { | |||
| train_kernels_.push_back(kernel); | |||
| void TrainSession::CompileInferenceKernels() { | |||
| std::vector<kernel::LiteKernel *> req_kernels; | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (IsLossKernel(kernel)) { // For each loss kernel, walk backwards and collect the kernels it depends on | |||
| for (auto in_node : kernel->in_kernels()) { | |||
| BuildInferenceKernelsRecursive(in_node, &req_kernels); | |||
| } | |||
| } | |||
| } | |||
| inference_kernels_.clear(); | |||
| for (auto ori_kernel : this->train_kernels_) { | |||
| if (std::find(req_kernels.begin(), req_kernels.end(), ori_kernel) != req_kernels.end()) { | |||
| inference_kernels_.push_back(ori_kernel); | |||
| } | |||
| } | |||
| if (inference_kernels_.size() == 0) { | |||
| inference_kernels_ = this->train_kernels_; | |||
| } | |||
| } | |||
| void TrainSession::MarkOptimizedKernels() { | |||
| void TrainSession::CompileOptimizedKernels() { | |||
| std::vector<lite::Tensor *> ot; | |||
| for (auto kernel : this->kernels_) { | |||
| if (kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| if (IsOptimizer(kernel)) { | |||
| std::copy(kernel->in_tensors().begin(), kernel->in_tensors().end(), std::back_inserter(ot)); | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(kernel); | |||
| for (auto sub_kernel : sub_graph->nodes()) { | |||
| if (IsOptimizer(sub_kernel)) { | |||
| std::copy(sub_kernel->in_tensors().begin(), sub_kernel->in_tensors().end(), std::back_inserter(ot)); | |||
| } | |||
| } | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (IsOptimizer(kernel)) { | |||
| std::copy(kernel->in_tensors().begin(), kernel->in_tensors().end(), std::back_inserter(ot)); | |||
| } | |||
| } | |||
| for (auto kernel : this->kernels_) { | |||
| if (kernel->subgraph_type() == kernel::kNotSubGraph) { | |||
| if (!IsOptimizer(kernel)) { | |||
| for (auto it : kernel->in_tensors()) { | |||
| if (std::find(ot.begin(), ot.end(), it) != ot.end()) { | |||
| kernel->set_trainable(true); | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| auto sub_graph = reinterpret_cast<kernel::SubGraphKernel *>(kernel); | |||
| for (auto sub_kernel : sub_graph->nodes()) { | |||
| if (!IsOptimizer(sub_kernel)) { | |||
| for (auto it : sub_kernel->in_tensors()) { | |||
| if (std::find(ot.begin(), ot.end(), it) != ot.end()) { | |||
| sub_kernel->set_trainable(true); | |||
| break; | |||
| } | |||
| } | |||
| for (auto kernel : this->train_kernels_) { | |||
| if (!IsOptimizer(kernel)) { | |||
| for (auto it : kernel->in_tensors()) { | |||
| if (std::find(ot.begin(), ot.end(), it) != ot.end()) { | |||
| kernel->set_trainable(true); | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| @@ -400,19 +303,31 @@ void TrainSession::MarkOptimizedKernels() { | |||
| } | |||
| bool TrainSession::IsLossKernel(const kernel::LiteKernel *kernel) const { | |||
| return (kernel->Type() == schema::PrimitiveType_SoftmaxCrossEntropy); | |||
| return (kernel->Type() == schema::PrimitiveType_SoftmaxCrossEntropy || | |||
| kernel->Type() == schema::PrimitiveType_SparseSoftmaxCrossEntropy); | |||
| } | |||
| bool TrainSession::IsOptimizer(kernel::LiteKernel *kernel) const { | |||
| return ((kernel->Type() == schema::PrimitiveType_Adam) || (kernel->Type() == schema::PrimitiveType_Sgd) || | |||
| (kernel->Type() == schema::PrimitiveType_ApplyMomentum)); | |||
| } | |||
| bool TrainSession::IsMaskOutput(kernel::LiteKernel *kernel) const { | |||
| return (IsOptimizer(kernel) || (kernel->Type() == schema::PrimitiveType_Assign)); | |||
| } | |||
| } // namespace lite | |||
| session::TrainSession *session::TrainSession::CreateSession(lite::Context *context) { | |||
| session::TrainSession *session::TrainSession::CreateSession(const char *model_buf, size_t size, lite::Context *context, | |||
| bool train_mode) { | |||
| auto model = mindspore::lite::TrainModel::Import(model_buf, size); | |||
| if (model == nullptr) { | |||
| MS_LOG(ERROR) << "create model for train session failed"; | |||
| return nullptr; | |||
| } | |||
| auto session = new (std::nothrow) lite::TrainSession(); | |||
| if (session == nullptr) { | |||
| delete model; | |||
| MS_LOG(ERROR) << "create session failed"; | |||
| return nullptr; | |||
| } | |||
| @@ -422,7 +337,59 @@ session::TrainSession *session::TrainSession::CreateSession(lite::Context *conte | |||
| delete session; | |||
| return nullptr; | |||
| } | |||
| ret = session->CompileTrainGraph(model); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Compiling Train Graph sesssion failed"; | |||
| delete session; | |||
| return nullptr; | |||
| } | |||
| if (train_mode) { | |||
| ret = session->Train(); | |||
| } else { | |||
| ret = session->Eval(); | |||
| } | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << "Could not switch to Train Modei " << train_mode; | |||
| delete session; | |||
| return nullptr; | |||
| } | |||
| return session; | |||
| } | |||
| session::TrainSession *session::TrainSession::CreateSession(const std::string &filename, lite::Context *context, | |||
| bool train_mode) { | |||
| std::ifstream ifs(filename); | |||
| if (!ifs.good()) { | |||
| MS_LOG(ERROR) << "File: " << filename << " does not exist"; | |||
| return nullptr; | |||
| } | |||
| if (!ifs.is_open()) { | |||
| MS_LOG(ERROR) << "File: " << filename << " open failed"; | |||
| return nullptr; | |||
| } | |||
| ifs.seekg(0, std::ios::end); | |||
| auto size = ifs.tellg(); | |||
| if (size == 0) { | |||
| MS_LOG(ERROR) << "Could not read file " << filename; | |||
| return nullptr; | |||
| } | |||
| std::unique_ptr<char[]> buf(new (std::nothrow) char[size]); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "malloc buf failed, file: " << filename; | |||
| ifs.close(); | |||
| return nullptr; | |||
| } | |||
| ifs.seekg(0, std::ios::beg); | |||
| ifs.read(buf.get(), size); | |||
| ifs.close(); | |||
| return session::TrainSession::CreateSession(buf.get(), size, context, train_mode); | |||
| } | |||
| } // namespace mindspore | |||
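| A short usage note on the mode switching implemented above (sketch; assumes a session created via the factory functions): Train() and Eval() now only flip every kernel's mode and swap the pre-compiled output maps, so toggling between modes is cheap: | |||
| session->Train();    // outputs become the loss tensors (train_output_node_map_) | |||
| session->RunGraph(); // one training step | |||
| session->Eval();     // outputs revert to the pre-loss tensors (eval_output_node_map_) | |||
| session->RunGraph(); // one inference pass | |||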
| @@ -21,7 +21,7 @@ | |||
| #include <unordered_map> | |||
| #include "src/ops/primitive_c.h" | |||
| #include "include/train_session.h" | |||
| #include "include/train_model.h" | |||
| #include "src/train/train_model.h" | |||
| #include "src/lite_session.h" | |||
| /* | |||
| @@ -52,7 +52,7 @@ class TrainSession : virtual public session::TrainSession, virtual public lite:: | |||
| int RunGraph(const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr) override; | |||
| int CompileGraph(lite::Model *model) override; | |||
| int CompileTrainGraph(lite::TrainModel *model) override; | |||
| virtual int CompileTrainGraph(lite::TrainModel *model); | |||
| void *ExportToBuf(char *buf, size_t *len) const override; | |||
| int SaveToFile(const std::string &filename) const override; | |||
| @@ -80,24 +80,34 @@ class TrainSession : virtual public session::TrainSession, virtual public lite:: | |||
| return lite::LiteSession::Resize(inputs, dims); | |||
| } | |||
| void UpdateOutputMapByInKernel(const kernel::LiteKernel *kernel); | |||
| void UpdateOutputMapByLossKernel(const kernel::LiteKernel *kernel); | |||
| protected: | |||
| void AllocWorkSpace(); | |||
| bool IsLossKernel(const kernel::LiteKernel *kernel) const; | |||
| bool IsOptimizer(kernel::LiteKernel *kernel) const; | |||
| virtual void MarkOptimizedKernels(); | |||
| bool IsMaskOutput(kernel::LiteKernel *kernel) const; | |||
| virtual std::vector<CreatorOp> ReplaceOps(); | |||
| virtual void RestoreOps(const std::vector<CreatorOp> &restore); | |||
| virtual void BuildInferenceKernelsMap(); | |||
| virtual void BuildInferenceKernelsRecursive(kernel::LiteKernel *ker, std::vector<kernel::LiteKernel *> *req_kernels); | |||
| virtual void CompileTrainKernels(); | |||
| virtual void CompileInferenceKernels(); | |||
| virtual void CompileOptimizedKernels(); | |||
| virtual void CompileTrainOutputs(); | |||
| virtual void CompileEvalOutputs(); | |||
| TrainModel *model_ = nullptr; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> orig_output_map_; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> orig_output_node_map_; | |||
| std::unordered_map<std::string, mindspore::tensor::MSTensor *> orig_output_tensor_map_; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> eval_output_node_map_; | |||
| std::unordered_map<std::string, mindspore::tensor::MSTensor *> eval_output_tensor_map_; | |||
| std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> train_output_node_map_; | |||
| std::unordered_map<std::string, mindspore::tensor::MSTensor *> train_output_tensor_map_; | |||
| std::vector<kernel::LiteKernel *> inference_kernels_; | |||
| std::vector<kernel::LiteKernel *> train_kernels_; | |||
| private: | |||
| void BuildInferenceKernelsRecursive(kernel::LiteKernel *ker, std::vector<kernel::LiteKernel *> *req_kernels); | |||
| }; | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
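The split between the train_output_*_map_ and eval_output_*_map_ members declared above implies that switching modes also switches which tensors the session reports as outputs: loss kernels in train mode, the original inference outputs in eval mode. A hedged sketch of what a caller would observe (the node name is taken from the tuning_layer test further down and is not a guaranteed contract):

    session->Train();   // outputs now resolve through the train output maps (loss kernels)
    auto loss_out = session->GetOutputsByNodeName("SoftmaxCrossEntropy");
    session->Eval();    // outputs revert to the inference graph (eval output maps)
    auto preds = session->GetOutputs();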
| @@ -1,7 +1,7 @@ | |||
| mini_alexnet | |||
| #mobilenetv1 | |||
| #mobilenetv2 | |||
| #mobilenetv3 # this model got error when RunX86 | |||
| # mobilenetv1 | |||
| mobilenetv2 | |||
| mobilenetv3 | |||
| lenet | |||
| #effnet | |||
| effnet_tune | |||
| @@ -16,7 +16,7 @@ function Run_Export(){ | |||
| echo ${model_name}'_train_export.py' >> "${export_log_file}" | |||
| echo 'exporting' ${model_name} | |||
| echo 'docker run --user $(id -u):$(id -g) --env CLOUD_MODEL_ZOO=${CLOUD_MODEL_ZOO} -w $PWD --runtime=nvidia -v /home/$USER:/home/$USER -v /opt/share:/opt/share --privileged=true mindspore_dev:5 python '${models_path}'/'${model_name}'_train_export.py' >> "${export_log_file}" | |||
| docker run --user $(id -u):$(id -g) --env CLOUD_MODEL_ZOO=${CLOUD_MODEL_ZOO} -w $PWD --runtime=nvidia -v /home/$USER:/home/$USER -v /opt/share:/opt/share --privileged=true mindspore_dev:5 python ${models_path}'/'${model_name}_train_export.py | |||
| docker run --user $(id -u):$(id -g) --env CLOUD_MODEL_ZOO=${CLOUD_MODEL_ZOO} -w $PWD --runtime=nvidia -v /home/$USER:/home/$USER -v /opt/share:/opt/share --privileged=true mindspore_dev:5 python ${models_path}'/'${model_name}_train_export.py ${epoch_num} | |||
| if [ $? = 0 ]; then | |||
| export_result='export mindspore '${model_name}'_train_export pass';echo ${export_result} >> ${export_result_file} | |||
| else | |||
| @@ -74,18 +74,19 @@ function Run_x86() { | |||
| echo ${model_name}'_train' >> "${run_x86_log_file}" | |||
| echo 'cd '${x86_path}'/mindspore-lite-'${version}'-runtime-x86-'${process_unit_x86}-train >> "${run_x86_log_file}" | |||
| cd ${x86_path}/mindspore-lite-${version}-runtime-x86-${process_unit_x86}-train || return 1 | |||
| echo 'LD_LIBRARY_PATH='${LD_LIBRARY_PATH}':./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./net_train/net_train --modelFile='${ms_models_path}'/'${model_name}'_train.ms --inDataFile='${input_path}'/'${model_name}'_input1.bin,'${train_io_path}'/'${model_name}'_input2.bin --expectedDataFile='${train_io_path}'/'${model_name}'_outputs.bin --exportFile='${ms_models_path}'/'${model_name}'_train_exported.ms' >> "${run_x86_log_file}" | |||
| echo 'LD_LIBRARY_PATH='${LD_LIBRARY_PATH}':./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib;./net_train/net_train --epochs='${epoch_num}' --modelFile='${ms_models_path}'/'${model_name}'_train.ms --inDataFile='${input_path}'/'${model_name}'_input1.bin,'${train_io_path}'/'${model_name}'_input2.bin --expectedDataFile='${train_io_path}'/'${model_name}'_outputs.bin --exportFile='${ms_models_path}'/'${model_name}'_train_exported.ms' >> "${run_x86_log_file}" | |||
| echo '-------------------------------------------------------------------------------' >> "${run_x86_log_file}" | |||
| LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:./lib:./third_party/libjpeg-turbo/lib:./third_party/opencv/lib \ | |||
| ${run_valgrind}./net_train/net_train \ | |||
| --modelFile=${ms_models_path}/${model_name}_train.ms \ | |||
| --inDataFile=${train_io_path}/${model_name}_input1.bin,${train_io_path}/${model_name}_input2.bin \ | |||
| --expectedDataFile=${train_io_path}/${model_name}_outputs.bin \ | |||
| --exportFile=${ms_models_path}/${model_name}_train_exported.ms >> "${run_x86_log_file}" | |||
| --exportFile=${ms_models_path}/${model_name}_train_exported.ms \ | |||
| --epochs=${epoch_num} >> "${run_x86_log_file}" | |||
| if [ $? = 0 ]; then | |||
| run_result='x86: '${model_name}'_train pass'; echo ${run_result} >> ${run_net_train_result_file} | |||
| else | |||
| run_result='x86: '${model_name}'_train failed'; echo ${run_result} >> ${run_net_train_result_file}; return 1 | |||
| run_result='x86: '${model_name}'_train failed'; echo ${run_result} >> ${run_net_train_result_file} | |||
| fi | |||
| done < ${models_mindspore_train_config} | |||
| } | |||
| @@ -160,12 +161,12 @@ function Run_arm() { | |||
| echo 'chmod 777 net_train' >> ${adb_cmd_run_file} | |||
| if [ "$1" == arm64 ]; then | |||
| echo 'cp /data/local/tmp/libc++_shared.so ./' >> ${adb_cmd_run_file} | |||
| echo 'export LD_LIBRARY_PATH=/data/local/tmp/net_train_test;./net_train --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin --exportFile='${model_name}'_train_exported.ms' >> "${run_arm_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=/data/local/tmp/net_train_test;./net_train --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin --exportFile='${model_name}'_train_exported.ms' >> "${adb_cmd_run_file}" | |||
| echo 'export LD_LIBRARY_PATH=./:/data/local/tmp/net_train_test;./net_train --epochs='${epoch_num}' --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin' >> "${run_arm_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=./:/data/local/tmp/net_train_test;./net_train --epochs='${epoch_num}' --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin' >> "${adb_cmd_run_file}" | |||
| elif [ "$1" == arm32 ]; then | |||
| echo 'cp /data/local/tmp/arm32/libc++_shared.so ./' >> ${adb_cmd_run_file} | |||
| echo 'export LD_LIBRARY_PATH=/data/local/tmp/net_train_test;./net_train --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin --exportFile='${model_name}'_train_exported.ms' >> "${run_arm_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=/data/local/tmp/net_train_test;./net_train --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin --exportFile='${model_name}'_train_exported.ms' >> "${adb_cmd_run_file}" | |||
| echo 'export LD_LIBRARY_PATH=./:/data/local/tmp/:/data/local/tmp/net_train_test;./net_train --epochs='${epoch_num}' --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin' >> "${run_arm_log_file}" | |||
| echo 'export LD_LIBRARY_PATH=./:/data/local/tmp/:/data/local/tmp/net_train_test;./net_train --epochs='${epoch_num}' --modelFile='${model_name}'_train.ms --inDataFile=/data/local/tmp/net_train_test/'${model_name}'_input1.bin,/data/local/tmp/net_train_test/'${model_name}'_input2.bin --expectedDataFile=/data/local/tmp/net_train_test/'${model_name}'_outputs.bin' >> "${adb_cmd_run_file}" | |||
| fi | |||
| adb -s ${device_id} shell < ${adb_cmd_run_file} >> ${run_arm_log_file} | |||
| @@ -203,16 +204,18 @@ function Print_Result() { | |||
| basepath=$(pwd) | |||
| echo ${basepath} | |||
| train_io_path="" | |||
| # Example:run_net_train.sh -r /home/emir/Work/TestingEnv/release -m /home/emir/Work/TestingEnv/train_models -i /home/emir/Work/TestingEnv/train_io -d "8KE5T19620002408" | |||
| # Use -t to set the number of training epochs (default 1) | |||
| while getopts "r:m:d:i:e:v" opt; do | |||
| epoch_num=1 | |||
| train_io_path="" | |||
| while getopts "r:m:d:i:e:vt:" opt; do | |||
| case ${opt} in | |||
| r) | |||
| release_path=${OPTARG} | |||
| echo "release_path is ${OPTARG}" | |||
| ;; | |||
| m) | |||
| models_path=${OPTARG}"/models_train" | |||
| echo "models_path is ${OPTARG}" | |||
| ;; | |||
| @@ -229,9 +232,13 @@ while getopts "r:m:d:i:e:v" opt; do | |||
| echo "enable_export = ${OPTARG}" | |||
| ;; | |||
| v) | |||
| run_valgrind="valgrind " | |||
| run_valgrind="valgrind --log-file=valgrind.log " | |||
| echo "Run x86 with valgrind" | |||
| ;; | |||
| t) | |||
| epoch_num=${OPTARG} | |||
| echo "train epoch num is ${OPTARG}" | |||
| ;; | |||
| ?) | |||
| echo "unknown para" | |||
| exit 1;; | |||
| @@ -181,7 +181,6 @@ TEST_F(TestBNGradFp32, BNTtrainFp32) { | |||
| auto kernel_obj = creator(inputs, outputs, reinterpret_cast<OpParameter *>(bn_param), &context, desc, nullptr); | |||
| ASSERT_NE(kernel_obj, nullptr); | |||
| mindspore::kernel::LiteKernel::AllocWorkspace(kernel_obj->workspace_size()); | |||
| float *save_mean = reinterpret_cast<float *>(save_mean_tensor.MutableData()); | |||
| float *save_var = reinterpret_cast<float *>(save_var_tensor.MutableData()); | |||
| for (int i = 0; i < channels; i++) { | |||
| @@ -273,10 +273,7 @@ TEST_F(TestConvolutionGradFp32, ConvFp32GroupFilterGrad) { | |||
| auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), &context, desc, nullptr); | |||
| ASSERT_NE(kernel, nullptr); | |||
| mindspore::kernel::LiteKernel::AllocWorkspace(kernel->workspace_size()); | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| kernel->Run(); | |||
| } | |||
| kernel->Run(); | |||
| int loop_count = 100; | |||
| auto time_start = mindspore::lite::GetTimeUs(); | |||
| @@ -205,10 +205,7 @@ TEST_F(TestDeConvolutionGradFp32, DeConvFp32Dilation2FilterGrad) { | |||
| auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), &context, desc, nullptr); | |||
| ASSERT_NE(kernel, nullptr); | |||
| mindspore::kernel::LiteKernel::AllocWorkspace(kernel->workspace_size()); | |||
| // warm up loop | |||
| for (int i = 0; i < 3; i++) { | |||
| kernel->Run(); | |||
| } | |||
| // runtime part | |||
| @@ -631,6 +628,7 @@ TEST_F(TestDeConvolutionGradFp32, DeConvFp32Dilation2Group12Stride2FilterGrad) { | |||
| ASSERT_NE(creator, nullptr); | |||
| auto kernel = creator(inputs, outputs, reinterpret_cast<OpParameter *>(conv_param), &context, desc, nullptr); | |||
| ASSERT_NE(kernel, nullptr); | |||
| mindspore::kernel::LiteKernel::AllocWorkspace(kernel->workspace_size()); | |||
| // warm up loop | |||
| @@ -23,7 +23,6 @@ | |||
| #include <functional> | |||
| #include "schema/inner/model_generated.h" | |||
| #include "mindspore/lite/include/train_model.h" | |||
| #include "common/common_test.h" | |||
| #include "include/train_session.h" | |||
| #include "include/context.h" | |||
| @@ -131,7 +130,6 @@ TEST_F(NetworkTest, tuning_layer) { | |||
| node->primitive->value.type = schema::PrimitiveType_SoftmaxCrossEntropy; | |||
| auto primitive = new schema::SoftmaxCrossEntropyT; | |||
| ASSERT_NE(primitive, nullptr); | |||
| primitive->axis.push_back(0); | |||
| node->primitive->value.value = primitive; | |||
| node->name = "SoftmaxCrossEntropy"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| @@ -144,7 +142,6 @@ TEST_F(NetworkTest, tuning_layer) { | |||
| node->primitive->value.type = schema::PrimitiveType_BiasGrad; | |||
| auto primitive = new schema::BiasGradT; | |||
| ASSERT_NE(primitive, nullptr); | |||
| primitive->axis.push_back(0); | |||
| node->primitive->value.value = primitive; | |||
| node->name = "BiasGrad"; | |||
| meta_graph->nodes.emplace_back(std::move(node)); | |||
| @@ -360,17 +357,13 @@ TEST_F(NetworkTest, tuning_layer) { | |||
| const char *content = reinterpret_cast<char *>(builder.GetBufferPointer()); | |||
| std::cout << "build fb size= " << size << std::endl; | |||
| auto model = lite::TrainModel::Import(content, size); | |||
| ASSERT_NE(nullptr, model); | |||
| meta_graph.reset(); | |||
| content = nullptr; | |||
| lite::Context context; | |||
| context.device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = lite::NO_BIND; | |||
| context.thread_num_ = 1; | |||
| auto session = session::TrainSession::CreateSession(&context); | |||
| auto session = session::TrainSession::CreateSession(content, size, &context); | |||
| ASSERT_NE(nullptr, session); | |||
| auto ret = session->CompileTrainGraph(model); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| session->Train(); | |||
| session->Train(); // Just double check that calling Train twice does not cause a problem | |||
| @@ -398,7 +391,7 @@ TEST_F(NetworkTest, tuning_layer) { | |||
| std::fill(labels, labels + labelTensor->ElementsNum(), 0.f); | |||
| for (int i = 0; i < BATCH_SIZE; i++) labels[i * NUM_CLASSES + (i * 97) % NUM_CLASSES] = 1.0; | |||
| ret = session->RunGraph(); | |||
| auto ret = session->RunGraph(); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| auto outputs = session->GetOutputsByNodeName("SoftmaxCrossEntropy"); | |||
| ASSERT_EQ(outputs.size(), 1); | |||
| @@ -514,23 +507,14 @@ int32_t runNet(mindspore::session::LiteSession *session, const std::string &in, | |||
| } | |||
| TEST_F(NetworkTest, efficient_net) { | |||
| char *buf = nullptr; | |||
| size_t net_size = 0; | |||
| std::string net = "./test_data/nets/effnetb0_fwd_nofuse.ms"; | |||
| ReadFile(net.c_str(), &net_size, &buf); | |||
| auto model = lite::TrainModel::Import(buf, net_size); | |||
| delete[] buf; | |||
| auto context = new lite::Context; | |||
| ASSERT_NE(context, nullptr); | |||
| context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = lite::NO_BIND; | |||
| context->thread_num_ = 1; | |||
| auto session = session::TrainSession::CreateSession(context); | |||
| std::string net = "./test_data/nets/effnetb0_fwd_nofuse.ms"; | |||
| auto session = session::TrainSession::CreateSession(net, context, false); | |||
| ASSERT_NE(session, nullptr); | |||
| auto ret = session->CompileTrainGraph(model); | |||
| ASSERT_EQ(lite::RET_OK, ret); | |||
| session->Eval(); | |||
| std::string in = "./test_data/nets/effNet_input_x_1_3_224_224.bin"; | |||
| std::string out = "./test_data/nets/effNet_output_y_1_1000.bin"; | |||
| @@ -540,58 +524,6 @@ TEST_F(NetworkTest, efficient_net) { | |||
| ASSERT_EQ(res, 0); | |||
| } | |||
| TEST_F(NetworkTest, retina_net) { | |||
| char *buf = nullptr; | |||
| size_t net_size = 0; | |||
| std::string net = "./test_data/nets/retinaface1.ms"; | |||
| ReadFile(net.c_str(), &net_size, &buf); | |||
| // auto model = lite::TrainModel::Import(buf, net_size); | |||
| auto model = lite::Model::Import(buf, net_size); | |||
| delete[] buf; | |||
| auto context = new lite::Context; | |||
| ASSERT_NE(context, nullptr); | |||
| context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = lite::NO_BIND; | |||
| context->thread_num_ = 1; | |||
| // auto session = session::TrainSession::CreateSession(context); | |||
| auto session = session::LiteSession::CreateSession(context); | |||
| ASSERT_NE(session, nullptr); | |||
| auto ret = session->CompileGraph(model); | |||
| EXPECT_EQ(lite::RET_OK, ret); | |||
| // session->Eval(); | |||
| std::string in = "./test_data/nets/test1.hwc_normalized_f32"; | |||
| std::cout << "----- Output 0 -----" << std::endl; | |||
| std::string out = "./test_data/nets/test1_loc.f32"; | |||
| int final_res = 0; | |||
| auto res = runNet(session, in, out, "448", true); | |||
| // ASSERT_EQ(res, 0); | |||
| if (res != 0) { | |||
| final_res = res; | |||
| } | |||
| std::cout << "----- Output 1 -----" << std::endl; | |||
| out = "./test_data/nets/test1_conf.f32"; | |||
| res = runNet(session, in, out, "435", true); | |||
| // ASSERT_EQ(res, 0); | |||
| if (res != 0) { | |||
| final_res |= res; | |||
| } | |||
| std::cout << "----- Output 2 -----" << std::endl; | |||
| out = "./test_data/nets/test1_landms.f32"; | |||
| res = runNet(session, in, out, "421", true); | |||
| if (res != 0) { | |||
| final_res |= res; | |||
| } | |||
| EXPECT_EQ(final_res, 0); | |||
| delete model; | |||
| delete session; | |||
| delete context; | |||
| } | |||
| TEST_F(NetworkTest, mobileface_net) { | |||
| char *buf = nullptr; | |||
| size_t net_size = 0; | |||
| @@ -55,9 +55,9 @@ TEST_F(TestSoftmaxCrossEntropyFp32, SoftmaxCrossEntropyFp32) { | |||
| std::vector<lite::Tensor *> inputs = {&y_tensor, &l_tensor}; | |||
| auto loss = new float[1]; | |||
| auto loss = new float[6]; | |||
| ASSERT_NE(loss, nullptr); | |||
| std::vector<int> dim_dw({1}); | |||
| std::vector<int> dim_dw({6, 1}); | |||
| lite::Tensor loss_tensor(TypeId::kNumberTypeFloat32, dim_dw); | |||
| loss_tensor.set_data(loss); | |||
| auto grad = new float[24]; | |||
| @@ -73,10 +73,14 @@ void AnfExporter::RemoveIfDepend(const CNodePtr &cnode) { | |||
| if (IsPrimitiveCNode(dependNode, schema::PrimitiveType_Depend) || | |||
| IsPrimitiveCNode(dependNode, schema::PrimitiveType_ControlDepend)) { | |||
| hasDepend = true; | |||
| bool maskOut = (dependNode->inputs().size() == 3) ? true : false; | |||
| for (size_t j = 1; j < dependNode->inputs().size(); ++j) { | |||
| AnfNodePtr dependInputNode = dependNode->input(j); | |||
| if (dependInputNode->isa<CNode>()) { | |||
| inputs.emplace_back(dependInputNode); | |||
| if (maskOut) { | |||
| break; | |||
| } | |||
| } | |||
| } | |||
| } else { | |||
| @@ -220,6 +224,11 @@ schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph, bool kee | |||
| ret = RET_MEMORY_FAILED; | |||
| break; | |||
| } | |||
| #ifdef SUPPORT_TRAIN | |||
| RemoveIfMakeTuple(cnode); | |||
| RemoveIfDepend(cnode); | |||
| #endif | |||
| if ((primitive_c->Type() == schema::PrimitiveType_TupleGetItem) || | |||
| #ifdef SUPPORT_TRAIN | |||
| (primitive_c->Type() == schema::PrimitiveType_Depend) || | |||
| @@ -228,9 +237,8 @@ schema::MetaGraphT *AnfExporter::Export(const FuncGraphPtr &func_graph, bool kee | |||
| (primitive_c->Type() == schema::PrimitiveType_MakeTuple)) { | |||
| continue; | |||
| } | |||
| #ifndef SUPPORT_TRAIN | |||
| RemoveIfMakeTuple(cnode); | |||
| #ifdef SUPPORT_TRAIN | |||
| RemoveIfDepend(cnode); | |||
| #endif | |||
| auto primT = primitive_c->primitiveT(); | |||
| auto node = std::make_unique<schema::CNodeT>(); | |||
| @@ -489,6 +497,11 @@ int AnfExporter::ConvertInputValueNode(const std::shared_ptr<AnfNode> &input_ano | |||
| paramTensor->format = schema::Format(valueLite->format()); | |||
| paramTensor->dataType = valueLite->tensor_type(); | |||
| paramTensor->dims = valueLite->tensor_shape(); | |||
| #ifdef SUPPORT_TRAIN | |||
| if (paramTensor->dims.size() == 0) { | |||
| paramTensor->dims = {1}; | |||
| } | |||
| #endif | |||
| auto ret = memcpy_s(paramTensor->data.data(), valueLite->tensor_size() * sizeof(uint8_t), valueLite->tensor_addr(), | |||
| valueLite->tensor_size()); | |||
| if (ret != EOK) { | |||
| @@ -733,13 +733,6 @@ bool AnfImporterFromProtobuf::BuildReturnForFuncGraph(const FuncGraphPtr &output | |||
| outputFuncGraph->set_return(return_node); | |||
| MS_LOG(INFO) << "Construct funcgraph finined, all success."; | |||
| } else { | |||
| #ifdef SUPPORT_TRAIN | |||
| auto ret_node = outputFuncGraph->get_return(); | |||
| if (ret_node) { | |||
| ret_node->add_input(cnode_ptr); | |||
| return true; | |||
| } | |||
| #endif | |||
| const onnx::ValueInfoProto &output_node = importProto.output(0); | |||
| const onnx::TypeProto &output_typeproto = output_node.type(); | |||
| int output_type = output_typeproto.tensor_type().elem_type(); | |||
| @@ -805,29 +798,13 @@ int AnfImporterFromProtobuf::ImportNodesForGraph(const FuncGraphPtr &outputFuncG | |||
| MS_LOG(ERROR) << "primitive_c is nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| #ifdef SUPPORT_TRAIN | |||
| if (primitive_c->Type() == schema::PrimitiveType_MakeTuple) { | |||
| last_cnode_ptr = cnode_ptr; | |||
| if (!BuildReturnForFuncGraph(outputFuncGraph, importProto, cnode_ptr)) { | |||
| MS_LOG(ERROR) << "Build ReturnNode for funcgraph failed"; | |||
| status = RET_ERROR; | |||
| } | |||
| } | |||
| #endif | |||
| } | |||
| if (status != RET_OK) { | |||
| return status; | |||
| } | |||
| #ifdef SUPPORT_TRAIN | |||
| if (last_cnode_ptr != cnode_ptr) { | |||
| #else | |||
| { | |||
| #endif | |||
| if (!BuildReturnForFuncGraph(outputFuncGraph, importProto, cnode_ptr)) { | |||
| MS_LOG(ERROR) << "Build ReturnNode for funcgraph failed"; | |||
| status = RET_ERROR; | |||
| } | |||
| if (!BuildReturnForFuncGraph(outputFuncGraph, importProto, cnode_ptr)) { | |||
| MS_LOG(ERROR) << "Build ReturnNode for funcgraph failed"; | |||
| status = RET_ERROR; | |||
| } | |||
| return status; | |||
| } | |||
| @@ -137,10 +137,19 @@ static const std::vector<schema::PrimitiveType> int8OpList = {schema::PrimitiveT | |||
| schema::PrimitiveType_L2Norm}; | |||
| static const std::vector<schema::PrimitiveType> needInsertOpList = { | |||
| #ifdef SUPPORT_TRAIN | |||
| schema::PrimitiveType_Eltwise, schema::PrimitiveType_Activation, | |||
| schema::PrimitiveType_Concat, schema::PrimitiveType_Power, | |||
| schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Add, | |||
| schema::PrimitiveType_Split, schema::PrimitiveType_Slice, | |||
| schema::PrimitiveType_Crop | |||
| #else | |||
| schema::PrimitiveType_Eltwise, schema::PrimitiveType_Activation, schema::PrimitiveType_Concat, | |||
| schema::PrimitiveType_Power, schema::PrimitiveType_StridedSlice, schema::PrimitiveType_Add, | |||
| schema::PrimitiveType_Split, schema::PrimitiveType_Slice, schema::PrimitiveType_Crop, | |||
| schema::PrimitiveType_Mul, schema::PrimitiveType_Maximum}; | |||
| schema::PrimitiveType_Mul, schema::PrimitiveType_Maximum | |||
| #endif | |||
| }; | |||
| static const std::unordered_map<int, int> nc2NhAxisMap = {{0, 0}, {1, -1}, {2, 1}, {3, 2}}; | |||
| @@ -109,9 +109,11 @@ FuncGraphPtr AnfTransform::Transform(const FuncGraphPtr &old_graph, const conver | |||
| fusion_pm->AddPass(remove_unused_transpose_pass); | |||
| } | |||
| auto const_fold_pm = std::make_shared<opt::PassManager>("const fold fusion pass manager", false); | |||
| auto inne_context_ptr = std::make_shared<lite::InnerContext>(); | |||
| inne_context_ptr->Init(); | |||
| const_fold_pm->AddPass(std::make_shared<opt::ConstFoldPass>(inne_context_ptr)); | |||
| if (!config->trainModel) { | |||
| auto inne_context_ptr = std::make_shared<lite::InnerContext>(); | |||
| inne_context_ptr->Init(); | |||
| const_fold_pm->AddPass(std::make_shared<opt::ConstFoldPass>(inne_context_ptr)); | |||
| } | |||
| const_fold_pm->AddPass(std::make_shared<opt::UpdateConv2DParamPass>()); | |||
| fusion_pm->AddPass(std::make_shared<opt::ConvConvFusion>()); | |||
| convert_pm->AddPass(std::make_shared<opt::ClipConvertActivationPass>()); | |||
| @@ -32,16 +32,6 @@ static const char *DELIM_COLON = ":"; | |||
| static const char *DELIM_COMMA = ","; | |||
| static const char *DELIM_SLASH = "/"; | |||
| void SaveFile(std::string path, void *buf, size_t size) { | |||
| std::ofstream ofs(path); | |||
| MS_ASSERT(ofs.good() == true); | |||
| MS_ASSERT(ofs.is_open() == true); | |||
| ofs.seekp(0, std::ios::beg); | |||
| ofs.write((const char *)buf, size); | |||
| ofs.close(); | |||
| } | |||
| int NetTrain::GenerateRandomData(size_t size, void *data) { | |||
| MS_ASSERT(data != nullptr); | |||
| char *casted_data = static_cast<char *>(data); | |||
| @@ -61,7 +51,7 @@ int NetTrain::GenerateInputData() { | |||
| } | |||
| auto tensor_byte_size = tensor->Size(); | |||
| auto status = GenerateRandomData(tensor_byte_size, input_data); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl; | |||
| MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status; | |||
| return status; | |||
| @@ -73,14 +63,14 @@ int NetTrain::GenerateInputData() { | |||
| int NetTrain::LoadInput() { | |||
| if (flags_->in_data_file_.empty()) { | |||
| auto status = GenerateInputData(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| std::cerr << "Generate input data error " << status << std::endl; | |||
| MS_LOG(ERROR) << "Generate input data error " << status; | |||
| return status; | |||
| } | |||
| } else { | |||
| auto status = ReadInputFile(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| std::cerr << "ReadInputFile error, " << status << std::endl; | |||
| MS_LOG(ERROR) << "ReadInputFile error, " << status; | |||
| return status; | |||
| @@ -331,20 +321,6 @@ int NetTrain::RunExportedNet() { | |||
| MS_LOG(INFO) << "start reading exported model file"; | |||
| std::cout << "start reading exported model file" << std::endl; | |||
| size_t size = 0; | |||
| char *graph_buf = ReadFile(flags_->export_file_.c_str(), &size); | |||
| if (graph_buf == nullptr) { | |||
| MS_LOG(ERROR) << "Read exported model file failed while running " << model_name.c_str(); | |||
| std::cerr << "Read exported model file failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| auto model = lite::TrainModel::Import(graph_buf, size); | |||
| delete[](graph_buf); | |||
| if (model == nullptr) { | |||
| MS_LOG(ERROR) << "Import exported model file failed while running " << model_name.c_str(); | |||
| std::cerr << "Import exported model file failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| auto context = std::make_shared<Context>(); | |||
| if (context == nullptr) { | |||
| MS_LOG(ERROR) << "New context failed while running " << model_name.c_str(); | |||
| @@ -362,18 +338,12 @@ int NetTrain::RunExportedNet() { | |||
| context->thread_num_ = flags_->num_threads_; | |||
| // context->enable_float16_ = flags_->enable_fp16_; | |||
| session_ = session::TrainSession::CreateSession(context.get()); | |||
| session_ = session::TrainSession::CreateSession(flags_->export_file_.c_str(), context.get()); | |||
| if (session_ == nullptr) { | |||
| MS_LOG(ERROR) << "CreateSession failed while running ", model_name.c_str(); | |||
| std::cout << "CreateSession failed while running ", model_name.c_str(); | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = session_->CompileTrainGraph(model); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "CompileGraph failed while running ", model_name.c_str(); | |||
| std::cout << "CompileGraph failed while running ", model_name.c_str(); | |||
| return ret; | |||
| } | |||
| ms_inputs_ = session_->GetInputs(); | |||
| auto end_prepare_time = GetTimeUs(); | |||
| @@ -383,13 +353,13 @@ int NetTrain::RunExportedNet() { | |||
| // Load input | |||
| MS_LOG(INFO) << "start generate input data"; | |||
| auto status = LoadInput(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Generate input data error"; | |||
| return status; | |||
| } | |||
| status = session_->RunGraph(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Inference error " << status; | |||
| std::cerr << "Inference error " << status << std::endl; | |||
| return status; | |||
| @@ -405,7 +375,7 @@ int NetTrain::RunExportedNet() { | |||
| delete data.second; | |||
| } | |||
| data_.clear(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run MarkAccuracy on exported model error: " << status; | |||
| std::cout << "Run MarkAccuracy on exported model error: " << status << std::endl; | |||
| return status; | |||
| @@ -421,20 +391,6 @@ int NetTrain::RunNetTrain() { | |||
| MS_LOG(INFO) << "start reading model file"; | |||
| std::cout << "start reading model file" << std::endl; | |||
| size_t size = 0; | |||
| char *graph_buf = ReadFile(flags_->model_file_.c_str(), &size); | |||
| if (graph_buf == nullptr) { | |||
| MS_LOG(ERROR) << "Read model file failed while running " << model_name.c_str(); | |||
| std::cerr << "Read model file failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| auto model = lite::TrainModel::Import(graph_buf, size); | |||
| delete[](graph_buf); | |||
| if (model == nullptr) { | |||
| MS_LOG(ERROR) << "Import model file failed while running " << model_name.c_str(); | |||
| std::cerr << "Import model file failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| auto context = std::make_shared<Context>(); | |||
| if (context == nullptr) { | |||
| MS_LOG(ERROR) << "New context failed while running " << model_name.c_str(); | |||
| @@ -451,18 +407,12 @@ int NetTrain::RunNetTrain() { | |||
| } | |||
| context->thread_num_ = flags_->num_threads_; | |||
| // context->enable_float16_ = flags_->enable_fp16_; | |||
| session_ = session::TrainSession::CreateSession(context.get()); | |||
| session_ = session::TrainSession::CreateSession(flags_->model_file_.c_str(), context.get()); | |||
| if (session_ == nullptr) { | |||
| MS_LOG(ERROR) << "CreateSession failed while running ", model_name.c_str(); | |||
| std::cout << "CreateSession failed while running ", model_name.c_str(); | |||
| return RET_ERROR; | |||
| } | |||
| auto ret = session_->CompileTrainGraph(model); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "CompileGraph failed while running ", model_name.c_str(); | |||
| std::cout << "CompileGraph failed while running ", model_name.c_str(); | |||
| return ret; | |||
| } | |||
| session_->Train(); | |||
| @@ -474,13 +424,13 @@ int NetTrain::RunNetTrain() { | |||
| // Load input | |||
| MS_LOG(INFO) << "start generate input data"; | |||
| auto status = LoadInput(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Generate input data error"; | |||
| return status; | |||
| } | |||
| if (flags_->epochs_ > 0) { | |||
| status = MarkPerformance(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run MarkPerformance error: " << status; | |||
| std::cout << "Run MarkPerformance error: " << status << std::endl; | |||
| return status; | |||
| @@ -494,24 +444,22 @@ int NetTrain::RunNetTrain() { | |||
| delete data.second; | |||
| } | |||
| data_.clear(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run MarkAccuracy error: " << status; | |||
| std::cout << "Run MarkAccuracy error: " << status << std::endl; | |||
| return status; | |||
| } | |||
| } | |||
| if (!flags_->export_file_.empty()) { | |||
| size_t tsize = 0; | |||
| auto buf = session_->ExportToBuf(nullptr, &tsize); | |||
| if (buf == nullptr) { | |||
| MS_LOG(ERROR) << "Run ExportToBuf error"; | |||
| std::cout << "Run ExportToBuf error"; | |||
| auto ret = session_->SaveToFile(flags_->export_file_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "SaveToFile error"; | |||
| std::cout << "Run SaveToFile error"; | |||
| return RET_ERROR; | |||
| } | |||
| SaveFile(flags_->export_file_, buf, size); | |||
| status = RunExportedNet(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run Exported model error: " << status; | |||
| std::cout << "Run Exported model error: " << status << std::endl; | |||
| return status; | |||
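The hunk above swaps the buffer-based export (ExportToBuf plus the now-removed SaveFile helper) for a single SaveToFile call. Roughly, assuming both paths serialize the same flatbuffer content (a sketch, not the patch itself):

    // before: the caller managed the buffer and the file write
    size_t len = 0;
    void *buf = session_->ExportToBuf(nullptr, &len);
    // ... write buf/len to flags_->export_file_, then release the buffer ...

    // after: the session owns serialization and the file write
    if (session_->SaveToFile(flags_->export_file_) != mindspore::lite::RET_OK) {
      MS_LOG(ERROR) << "SaveToFile error";
    }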
| @@ -754,14 +702,14 @@ int RunNetTrain(int argc, const char **argv) { | |||
| NetTrain net_trainer(&flags); | |||
| auto status = net_trainer.Init(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "NetTrain init Error : " << status; | |||
| std::cerr << "NetTrain init Error : " << status << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| status = net_trainer.RunNetTrain(); | |||
| if (status != 0) { | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run NetTrain " | |||
| << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str() | |||
| << " Failed : " << status; | |||
| @@ -29,7 +29,6 @@ | |||
| #include <memory> | |||
| #include <cfloat> | |||
| #include <utility> | |||
| #include "include/train_model.h" | |||
| #include "tools/common/flag_parser.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/utils.h" | |||
| @@ -129,8 +128,10 @@ class MS_API NetTrain { | |||
| MS_ASSERT(input != nullptr); | |||
| static int i = 0; | |||
| auto inData = reinterpret_cast<T *>(input->MutableData()); | |||
| size_t tensorSize = input->ElementsNum(); | |||
| size_t len = (tensorSize < 20) ? tensorSize : 20; | |||
| std::cout << "InData" << i++ << ": "; | |||
| for (size_t j = 0; j < 20; j++) { | |||
| for (size_t j = 0; j < len; j++) { | |||
| std::cout << inData[j] << " "; | |||
| } | |||
| std::cout << std::endl; | |||
| @@ -190,10 +191,8 @@ class MS_API NetTrain { | |||
| } | |||
| } else { | |||
| // just assume that atol = rtol | |||
| if (absoluteError > 1e-5) { | |||
| meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN); | |||
| errorCount++; | |||
| } | |||
| meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN); | |||
| errorCount++; | |||
| } | |||
| } | |||
| } | |||
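With the 1e-5 threshold removed above, every element now contributes to the relative-error accumulation rather than only the outliers. A self-contained sketch of the resulting metric, with an assumed helper name and final averaging (the surrounding NetTrain code keeps its own counters):

    #include <cmath>
    #include <cfloat>
    #include <cstddef>

    // Mean relative error with atol == rtol, matching the accumulation shown above.
    float MeanRelativeError(const float *out, const float *calib, size_t n) {
      float mean_error = 0.0f;
      for (size_t j = 0; j < n; ++j) {
        float absolute_error = std::fabs(out[j] - calib[j]);
        mean_error += absolute_error / (std::fabs(calib[j]) + FLT_MIN);  // FLT_MIN guards divide-by-zero
      }
      return (n > 0) ? mean_error / n : 0.0f;
    }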