From e0a936228692b6345b161c957e4ae8ac54fb2bfb Mon Sep 17 00:00:00 2001
From: Emir Haleva <emir.haleva@huawei.com>
Date: Thu, 29 Apr 2021 07:26:19 +0300
Subject: [PATCH] Support TrainSession::ExportInference

---
 mindspore/lite/include/train/train_session.h  |  10 +-
 mindspore/lite/schema/ops.fbs                 |  10 +-
 mindspore/lite/src/CMakeLists.txt             |   1 +
 mindspore/lite/src/train/train_export.cc      | 202 +++++++++++++++
 mindspore/lite/src/train/train_export.h       |  58 +++++
 mindspore/lite/src/train/train_session.cc     |  27 +-
 mindspore/lite/src/train/train_session.h      |   1 +
 mindspore/lite/src/train/train_utils.cc       |  16 ++
 mindspore/lite/src/train/train_utils.h        |  10 +
 mindspore/lite/test/CMakeLists.txt            |   2 +
 mindspore/lite/test/run_net_train.sh          |  23 +-
 .../lite/tools/benchmark_train/net_train.cc   | 232 +++++++++++++++++-
 .../lite/tools/benchmark_train/net_train.h    |  17 +-
 13 files changed, 572 insertions(+), 37 deletions(-)
 create mode 100644 mindspore/lite/src/train/train_export.cc
 create mode 100644 mindspore/lite/src/train/train_export.h
diff --git a/mindspore/lite/include/train/train_session.h b/mindspore/lite/include/train/train_session.h
index 2c80b65357..fe0bb65c5d 100644
--- a/mindspore/lite/include/train/train_session.h
+++ b/mindspore/lite/include/train/train_session.h
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_
-#define MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_
+#ifndef MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_
+#define MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_
 #include <vector>
 #include <string>
 #include <tuple>
@@ -115,6 +115,10 @@ class TrainSession : public session::LiteSession {
     loss_name_ = loss_name;
     return mindspore::lite::RET_OK;
   }
+  /// \brief Save model for inference (LiteSession); this default implementation only returns RET_ERROR
+  /// \param[in] fb_name Output model file name prefix. '.ms' is added as extension.
+  /// \return STATUS as an error code of the export operation, STATUS is defined in errorcode.h
+  virtual int ExportInference(std::string fb_name) { return mindspore::lite::RET_ERROR; }
 
  protected:
   bool train_mode_ = false;
@@ -125,4 +129,4 @@ class TrainSession : public session::LiteSession {
 };
 }  // namespace session
 }  // namespace mindspore
-#endif  // MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_
+#endif  // MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_
diff --git a/mindspore/lite/schema/ops.fbs b/mindspore/lite/schema/ops.fbs
index da0bd88273..a0db93b125 100644
--- a/mindspore/lite/schema/ops.fbs
+++ b/mindspore/lite/schema/ops.fbs
@@ -443,11 +443,6 @@ table Crop {
     offsets: [long];
 }
 
-table CumSum {
-    exclusive: bool = false;
-    reverse: bool = false;
-}
-
 table CustomExtractFeatures {
 }
 
@@ -1111,6 +1106,11 @@ table LogSoftmax {
 table Call {
 }
 
+table CumSum {
+    exclusive: bool;
+    reverse: bool;
+}
+
 table Custom {
     type: string;
     attr: [Attribute];
diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt
index 8fec0ab48d..c7fc328ffb 100644
--- a/mindspore/lite/src/CMakeLists.txt
+++ b/mindspore/lite/src/CMakeLists.txt
@@ -123,6 +123,7 @@ if(SUPPORT_TRAIN)
             ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_metrics.cc
             ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc
             ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc
+            ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc
             )
     if(ENABLE_V0)
       set(LITE_SRC
diff --git a/mindspore/lite/src/train/train_export.cc b/mindspore/lite/src/train/train_export.cc
new file mode 100644
index 0000000000..5f7761d705
--- /dev/null
+++ b/mindspore/lite/src/train/train_export.cc
@@ -0,0 +1,202 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#define _STUB
+#include "src/train/train_export.h"
+#include <unistd.h>
+#include <sys/stat.h>
+#include <fstream>
+#include <utility>
+#include <map>
+#include <set>
+#include "schema/inner/model_generated.h"
+#include "src/train/train_utils.h"
+
+namespace mindspore {
+namespace lite {
+
+std::vector<uint8_t> TrainExport::CreateData(const mindspore::lite::Tensor *tensor) {
+  // Copy the tensor's raw bytes; a tensor with no backing buffer yields an empty vector instead of reading null.
+  const auto *tensor_data = reinterpret_cast<const uint8_t *>(tensor->data_c());
+  if (tensor_data == nullptr) return {};
+  return std::vector<uint8_t>(tensor_data, tensor_data + tensor->Size());
+}
+
+std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(const mindspore::lite::Tensor *tensor,
+                                                           schema::Tensor *scTensor) {
+  auto tensorT = std::make_unique<schema::TensorT>();
+  tensorT->nodeType = scTensor->nodeType();  // node type is taken from the model's schema tensor
+  tensorT->dataType = tensor->data_type();   // type, shape, format and name come from the runtime tensor
+  tensorT->dims = tensor->shape();
+  tensorT->format = tensor->format();
+  tensorT->name = tensor->tensor_name();
+  tensorT->refCount = 0;
+  tensorT->offset = 0;
+  tensorT->enableHuffmanCode = false;
+  if ((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) {
+    tensorT->data = CreateData(tensor);  // only value nodes that carry data in the model get their bytes copied
+  }
+  for (const auto &quant_param : tensor->quant_params()) {  // bind by reference: no per-iteration copy
+    auto quantParamT = std::make_unique<schema::QuantParamT>();
+    quantParamT->scale = quant_param.scale;
+    quantParamT->zeroPoint = quant_param.zeroPoint;
+    quantParamT->min = 0;  // min/max/narrowRange are fixed values, not read from quant_param
+    quantParamT->max = 0;
+    quantParamT->narrowRange = true;
+    quantParamT->numBits = quant_param.bitNum;
+    quantParamT->inited = quant_param.inited;
+    quantParamT->varCorr = quant_param.var_corr;
+    quantParamT->meanCorr = quant_param.mean_corr;
+    quantParamT->dstDtype = quant_param.dstDtype;
+    quantParamT->roundType = quant_param.roundType;
+    quantParamT->multiplier = quant_param.multiplier;
+    tensorT->quantParams.emplace_back(std::move(quantParamT));
+  }
+  tensorT->quantClusters = tensor->quant_clusters();
+  return tensorT;
+}
+
+// Returns the model node whose name_ equals the kernel's name, or nullptr when model_->all_nodes_ has no
+// such node.
+mindspore::lite::Model::Node *TrainExport::FindNode(const mindspore::kernel::LiteKernel *kernel) {
+  // Bind by reference: a by-value `auto` copied the whole node list on every lookup.
+  const auto &nodes = model_->all_nodes_;
+  auto it = std::find_if(nodes.begin(), nodes.end(),
+                         [&kernel](mindspore::lite::Model::Node *n) { return (kernel->name() == n->name_); });
+  return (it == nodes.end()) ? nullptr : *it;
+}
+
+std::unique_ptr<schema::CNodeT> TrainExport::CreateCNode(const mindspore::kernel::LiteKernel *kernel,
+                                                         std::vector<uint32_t> inputIndex,
+                                                         std::vector<uint32_t> outputIndex) {
+  // Resolve the kernel's model node first so nothing is allocated when the lookup fails (nullptr is returned).
+  auto *node = FindNode(kernel);
+  if (node == nullptr) {
+    MS_LOG(ERROR) << "cannot find kernel " + kernel->name() + " in model";
+    return nullptr;
+  }
+  auto cnodeT = std::make_unique<schema::CNodeT>();
+  cnodeT->inputIndex = std::move(inputIndex);  // by-value parameters: move instead of copying a second time
+  cnodeT->outputIndex = std::move(outputIndex);
+  cnodeT->name = kernel->name();
+  cnodeT->quantType = schema::QuantType_QUANT_NONE;
+  auto primitive = reinterpret_cast<schema::Primitive *>(const_cast<void *>(node->primitive_));
+  cnodeT->primitive = std::unique_ptr<schema::PrimitiveT>(primitive->UnPack());
+  return cnodeT;
+}
+
+// Builds a MetaGraphT from `kernels` (tensor indices renumbered in order of first use) and saves it via
+// SaveToFile(). Returns RET_ERROR if a tensor or kernel cannot be matched to the model, else SaveToFile()'s status.
+int TrainExport::Export(const std::vector<mindspore::kernel::LiteKernel *> &kernels,
+                        const std::vector<mindspore::lite::Tensor *> &tensors,
+                        const std::vector<std::string> &output_names) {
+  std::map<size_t, size_t> remap;  // index in `tensors` -> index in the exported graph
+  std::vector<size_t> map_index;   // exported index -> index in `tensors`
+  std::set<size_t> out_set;        // indices in `tensors` (never remapped ones) produced by some kernel
+  int tensor_idx = 0;
+  auto meta_graph = std::make_unique<schema::MetaGraphT>();
+  meta_graph->fmkType = 3;
+  meta_graph->name = model_->name_;
+  meta_graph->version = model_->version_;
+  for (const auto kernel : kernels) {
+    std::vector<uint32_t> in_idx, out_idx;
+    for (const auto tensor : kernel->in_tensors()) {
+      size_t id = TSFindTensor(tensors, tensor);
+      if (id == tensors.size()) {
+        MS_LOG(ERROR) << "cannot find tensor " + tensor->ToString() + " in model";
+        return RET_ERROR;
+      }
+      auto it = remap.find(id);
+      if (it == remap.end()) {
+        remap[id] = tensor_idx;
+        in_idx.push_back(tensor_idx);
+        map_index.push_back(id);
+        tensor_idx++;
+      } else {
+        in_idx.push_back(it->second);
+      }
+    }
+    for (const auto tensor : kernel->out_tensors()) {
+      size_t id = TSFindTensor(tensors, tensor);
+      if (id == tensors.size()) {
+        MS_LOG(ERROR) << "cannot find tensor " + tensor->ToString() + " in model";
+        return RET_ERROR;
+      }
+      // Record outputs by their index in `tensors` only, so the graph-input test below compares like with like.
+      out_set.insert(id);
+      auto it = remap.find(id);
+      if (it == remap.end()) {
+        remap[id] = tensor_idx;
+        map_index.push_back(id);
+        out_idx.push_back(tensor_idx);
+        tensor_idx++;
+      } else {
+        out_idx.push_back(it->second);
+      }
+    }
+    auto cnode = CreateCNode(kernel, in_idx, out_idx);
+    if (cnode == nullptr) {
+      return RET_ERROR;  // CreateCNode has already logged the failure; never store a null node
+    }
+    meta_graph->nodes.emplace_back(std::move(cnode));
+  }
+  for (auto id : map_index) {
+    mindspore::lite::Tensor *tensor = tensors.at(id);
+    schema::Tensor *scTensor = model_->all_tensors_.at(id);
+    auto tensorT = CreateTensor(tensor, scTensor);
+    // a value node without data that no kernel writes is a graph input
+    if ((out_set.find(id) == out_set.end()) && (tensorT->nodeType == NodeType_ValueNode) && tensorT->data.empty()) {
+      meta_graph->inputIndex.push_back(remap[id]);
+    }
+    // find output tensor
+    if (std::find(output_names.begin(), output_names.end(), tensor->tensor_name()) != output_names.end()) {
+      meta_graph->outputIndex.push_back(remap[id]);
+    }
+    meta_graph->allTensors.emplace_back(std::move(tensorT));
+  }
+  int err = SaveToFile(meta_graph.get(), file_name_);
+  if (err != RET_OK) {
+    MS_LOG(ERROR) << "failed to save flatbuffer file " << file_name_;
+  }
+  return err;
+}
+
+// Serializes `graph` to the file "<outputPath>.ms". Returns RET_OK on success and RET_ERROR otherwise.
+int TrainExport::SaveToFile(const schema::MetaGraphT *graph, const std::string &outputPath) {
+  const std::string path = outputPath + ".ms";
+  flatbuffers::FlatBufferBuilder builder(1024);
+  auto offset = schema::MetaGraph::Pack(builder, graph);
+  builder.Finish(offset);  // NOTE(review): FinishMetaGraphBuffer presumably finishes again - confirm both are needed
+  schema::FinishMetaGraphBuffer(builder, offset);
+  auto content = builder.GetBufferPointer();
+  if (content == nullptr) {
+    MS_LOG(ERROR) << "GetBufferPointer nullptr";
+    return RET_ERROR;
+  }
+  // The file is left owner-read-only by the chmod below, so an earlier export must be made writable first.
+  if (access(path.c_str(), F_OK) == 0) chmod(path.c_str(), S_IWUSR);
+  std::ofstream output(path, std::ofstream::binary);
+  if (!output.is_open()) {
+    MS_LOG(ERROR) << "Can not open output file: " << outputPath << ".ms";
+    return RET_ERROR;
+  }
+  output.write(reinterpret_cast<const char *>(content), builder.GetSize());
+  output.close();
+  chmod(path.c_str(), S_IRUSR);
+  return output.fail() ? RET_ERROR : RET_OK;  // a failed write()/close() sets failbit; it used to be ignored
+}
+
+}  // namespace lite
+}  // namespace mindspore
diff --git a/mindspore/lite/src/train/train_export.h b/mindspore/lite/src/train/train_export.h
new file mode 100644
index 0000000000..41cac80237
--- /dev/null
+++ b/mindspore/lite/src/train/train_export.h
@@ -0,0 +1,58 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_
+#define MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_
+#include <string>
+#include <vector>
+#include <memory>
+#include "schema/inner/model_generated.h"
+#include "src/lite_kernel.h"
+#include "src/lite_model.h"
+
+namespace mindspore {
+#ifndef _STUB
+namespace schema {
+struct CNodeT;
+struct TensorT;
+struct MetaGraphT;
+}  // namespace schema
+#endif
+namespace lite {
+
+class TrainExport {  // exports kernels and their tensors, matched against `model`, to a flatbuffer file
+ public:
+  TrainExport(const std::string &file_name, const mindspore::lite::Model *model)
+      : model_(model), file_name_(file_name) {}
+  virtual ~TrainExport() = default;
+  int Export(const std::vector<mindspore::kernel::LiteKernel *> &kernels,
+             const std::vector<mindspore::lite::Tensor *> &tensors, const std::vector<std::string> &output_names);
+
+ protected:
+  virtual std::vector<uint8_t> CreateData(const mindspore::lite::Tensor *tensor);  // overridable by subclasses
+
+ private:
+  const Model *model_;     // not deleted by this class
+  std::string file_name_;  // handed to SaveToFile as the output path
+  mindspore::lite::Model::Node *FindNode(const mindspore::kernel::LiteKernel *kernel);
+  std::unique_ptr<schema::TensorT> CreateTensor(const mindspore::lite::Tensor *tensor, schema::Tensor *scTensor);
+  std::unique_ptr<schema::CNodeT> CreateCNode(const mindspore::kernel::LiteKernel *kernel,
+                                              std::vector<uint32_t> inputIndex, std::vector<uint32_t> outputIndex);
+  int SaveToFile(const schema::MetaGraphT *graph, const std::string &outputPath);
+};
+};  // namespace lite
+}  // namespace mindspore
+
+#endif  // MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_
diff --git a/mindspore/lite/src/train/train_session.cc b/mindspore/lite/src/train/train_session.cc
index f04d17cbcb..41b4736a65 100644
--- a/mindspore/lite/src/train/train_session.cc
+++ b/mindspore/lite/src/train/train_session.cc
@@ -37,25 +37,12 @@
 #include "src/runtime/kernel/arm/fp32_grad/convolution.h"
 #include "src/runtime/kernel/arm/fp32/batchnorm_fp32.h"
 #include "src/common/tensor_util.h"
+#include "src/train/train_utils.h"
+#include "src/train/train_export.h"
 
 namespace mindspore {
 namespace lite {
 
-static size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter) {
-  for (size_t i = 0; i < where.size(); i++) {
-    if (where[i] == searchParameter) {
-      return i;
-    }
-  }
-  return where.size();
-}
-
-static kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where,
-                                        const std::string &searchParameter) {
-  auto it = std::find_if(where.begin(), where.end(),
-                         [&searchParameter](const kernel::LiteKernel *k) { return (k->name() == searchParameter); });
-  return *it;
-}
 TrainSession::TrainSession() {
   is_train_session_ = true;
 #ifdef ENABLE_V0
@@ -476,6 +463,16 @@ int TrainSession::SetLossName(std::string loss_name) {
   }
   return RET_OK;
 }
+
+// Exports the inference kernels through TrainExport; the session is switched to eval mode for the export.
+int TrainSession::ExportInference(std::string file_name) {
+  const bool was_training = IsTrain();
+  Eval();
+  const int ret = TrainExport(file_name, model_).Export(inference_kernels_, tensors_, GetOutputTensorNames());
+  if (was_training) Train();  // put the session back into the mode the caller had it in
+  return ret;
+}
+
 }  // namespace lite
 
 session::TrainSession *session::TrainSession::CreateSession(mindspore::lite::Model *model, lite::Context *context,
diff --git a/mindspore/lite/src/train/train_session.h b/mindspore/lite/src/train/train_session.h
index cee89e1ff2..dfeac194db 100644
--- a/mindspore/lite/src/train/train_session.h
+++ b/mindspore/lite/src/train/train_session.h
@@ -87,6 +87,7 @@ class TrainSession : virtual public session::TrainSession, virtual public lite::
     }
     return outputs;
   }
+  int ExportInference(std::string file_name) override;
 
  protected:
   void AllocWorkSpace();
diff --git a/mindspore/lite/src/train/train_utils.cc b/mindspore/lite/src/train/train_utils.cc
index bf3ac8af84..118207d78a 100644
--- a/mindspore/lite/src/train/train_utils.cc
+++ b/mindspore/lite/src/train/train_utils.cc
@@ -19,10 +19,26 @@
 #include "include/errorcode.h"
 #include "include/ms_tensor.h"
 #include "src/common/utils.h"
+#include "src/lite_kernel.h"
 
 namespace mindspore {
 namespace lite {
 
+// Returns the position of the first element of `where` that is the same object as `searchParameter`,
+// or where.size() when no element matches.
+size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter) {
+  const auto first = where.begin();
+  const auto hit = std::find(first, where.end(), searchParameter);
+  // A miss leaves `hit` at end(), and end() - begin() is exactly where.size().
+  return static_cast<size_t>(hit - first);
+}
+
+kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where, const std::string &searchParameter) {
+  auto it = std::find_if(where.begin(), where.end(),
+                         [&searchParameter](const kernel::LiteKernel *k) { return (k->name() == searchParameter); });
+  return (it == where.end()) ? nullptr : *it;  // nullptr when no kernel has that name; end() is never dereferenced
+}
+
 float CalculateSparseClassification(tensor::MSTensor *input, tensor::MSTensor *output) {
   if ((input->shape().size() != 1) || (input->data_type() != kNumberTypeInt32) || (output->shape().size() != 2)) {
     MS_LOG(WARNING) << "SparceClassification got a " << input->shape() << "-D input tensor, " << output->shape()
diff --git a/mindspore/lite/src/train/train_utils.h b/mindspore/lite/src/train/train_utils.h
index cba2f57da3..6be9bc704b 100644
--- a/mindspore/lite/src/train/train_utils.h
+++ b/mindspore/lite/src/train/train_utils.h
@@ -16,11 +16,21 @@
 #ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_
 #define MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_
 
+#include <vector>
+#include <string>
 #include "include/ms_tensor.h"
+#include "src/tensor.h"
 
 namespace mindspore {
+namespace kernel {
+class LiteKernel;
+}
+
 namespace lite {
 
+kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where, const std::string &searchParameter);
+size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter);
+
 float CalculateSparseClassification(tensor::MSTensor *input, tensor::MSTensor *output);
 float CalculateOneHotClassification(tensor::MSTensor *input, tensor::MSTensor *output);
 
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index a3a755fa87..2c06e76b50 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -292,6 +292,8 @@ if(SUPPORT_TRAIN)
             ${LITE_DIR}/src/train/train_populate_parameter.cc
             ${LITE_DIR}/src/train/train_populate_parameter_v0.cc
             ${LITE_DIR}/src/train/train_session.cc
+            ${LITE_DIR}/src/train/train_export.cc
+            ${LITE_DIR}/src/train/train_utils.cc
             ${LITE_DIR}/src/train/transfer_session.cc
             ${LITE_DIR}/src/lite_session.cc
             )
diff --git a/mindspore/lite/test/run_net_train.sh b/mindspore/lite/test/run_net_train.sh
index 3824cf0bf2..c259b5ea38 100755
--- a/mindspore/lite/test/run_net_train.sh
+++ b/mindspore/lite/test/run_net_train.sh
@@ -89,13 +89,15 @@ function Run_x86() {
             model_name=${line_array[0]}'_train_quant'
             accuracy_limit=${line_array[2]}
         fi
-        
+        if [[ "${save_lite}" == "1" ]]; then
+          inference_file="${ms_models_path}/${model_name}_infer"
+        fi
         echo ${model_name} >> "${run_x86_log_file}"
         ${run_valgrind}./tools/benchmark_train/benchmark_train \
         --modelFile=${ms_models_path}/${model_name}.ms \
         --inDataFile=${train_io_path}/${model_prefix}_input1.bin,${train_io_path}/${model_prefix}_input2.bin \
         --expectedDataFile=${train_io_path}/${model_prefix}_output --epochs=${epoch_num} --numThreads=${threads} \
-        --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}"
+        --accuracyThreshold=${accuracy_limit} --inferenceFile=${inference_file} >> "${run_x86_log_file}"
         if [ $? = 0 ]; then
             run_result='x86: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_train_result_file}
         else
@@ -138,8 +140,8 @@ function Run_arm() {
     # If build with minddata, copy the minddata related libs
     cd ${benchmark_train_test_path} || exit 1
     if [ -f ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/lib/libminddata-lite.so ]; then
-        cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libjpeg.so ${benchmark_train_test_path}/libjpeg.so || exit 1
-        cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libturbojpeg.so ${benchmark_train_test_path}/libturbojpeg.so || exit 1
+        cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libjpeg.so* ${benchmark_train_test_path}/ || exit 1
+        cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libturbojpeg.so* ${benchmark_train_test_path}/ || exit 1
         cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/lib/libminddata-lite.so ${benchmark_train_test_path}/libminddata-lite.so || exit 1
     fi
     if [ "$1" == arm64 ]; then
@@ -178,8 +180,9 @@ function Run_arm() {
             run_result=$1': '${model_name}' irrelevant'; echo ${run_result} >> ${run_benchmark_train_result_file}
             continue
         fi
-  
-
+        if [[ "${save_lite}" == "1" ]]; then
+          inference_file="${ms_models_path}/${model_name}_infer"
+        fi
         # run benchmark_train test without clib data
         echo ${model_name} >> "${run_arm_log_file}"
         adb -s ${device_id} push ${train_io_path}/${model_prefix}_input*.bin ${train_io_path}/${model_prefix}_output*.bin  /data/local/tmp/benchmark_train_test >> ${adb_push_log_file}
@@ -198,7 +201,7 @@ function Run_arm() {
         --modelFile=${model_name}.ms \
         --inDataFile=${tmp_dir}/${model_prefix}_input1.bin,${tmp_dir}/${model_prefix}_input2.bin \
         --expectedDataFile=${tmp_dir}/${model_prefix}_output \
-        --numThreads=${threads} --accuracyThreshold=${accuracy_limit}
+        --numThreads=${threads} --accuracyThreshold=${accuracy_limit} --inferenceFile=${inference_file}
 ENDM
         )
         echo "${adb_cmd}" >> ${run_arm_log_file}
@@ -249,7 +252,7 @@ models_mindspore_train_config=${basepath}/models_ms_train.cfg
 epoch_num=1
 threads=2
 train_io_path=""
-while getopts "r:M:c:m:d:i:e:vt:q:D" opt; do
+while getopts "r:M:c:m:d:i:e:vt:q:DF" opt; do
     case ${opt} in
         r)
            release_path=${OPTARG}
@@ -291,6 +294,8 @@ while getopts "r:M:c:m:d:i:e:vt:q:D" opt; do
         t)
             epoch_num=${OPTARG}
             echo "train epoch num is ${epoch_num}"
+            ;;
+        F)  save_lite=1
             ;;                          
         ?)
             echo "unknown para"
@@ -342,7 +347,7 @@ if [[ $enable_export == 1 ]]; then
     Run_Export
     Print_Result ${export_result_file}
 
-fi    
+fi 
 
 # Write converter result to temp file
 run_converter_log_file=${logs_path}/run_converter_log.txt
diff --git a/mindspore/lite/tools/benchmark_train/net_train.cc b/mindspore/lite/tools/benchmark_train/net_train.cc
index 8fcaa0029b..2271defc52 100644
--- a/mindspore/lite/tools/benchmark_train/net_train.cc
+++ b/mindspore/lite/tools/benchmark_train/net_train.cc
@@ -20,6 +20,9 @@
 #undef __STDC_FORMAT_MACROS
 #include <algorithm>
 #include <utility>
+#ifdef ENABLE_NEON
+#include <arm_neon.h>
+#endif
 #include "src/common/common.h"
 #include "include/ms_tensor.h"
 #include "include/context.h"
@@ -178,6 +181,88 @@ int NetTrain::CompareOutput() {
       MS_LOG(ERROR) << "ReadFile return nullptr";
       return RET_ERROR;
     }
+
+    if (flags_->enable_fp16_ && tensor->data_type() == kNumberTypeFloat16) {
+      if (static_cast<int>(size / sizeof(float)) != tensor->ElementsNum()) {
+        MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size()
+                      << ", read size: " << size / sizeof(float);
+        return RET_ERROR;
+      }
+    } else {
+      if (size != tensor->Size()) {
+        MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size()
+                      << ", read size: " << size;
+        return RET_ERROR;
+      }
+    }
+    float bias = 0.f;
+    if (flags_->enable_fp16_ && tensor->data_type() == kNumberTypeFloat16) {
+#ifdef ENABLE_FP16
+      bias = CompareData<float16_t>(bin_buf, tensor->ElementsNum(), reinterpret_cast<float16_t *>(outputs));
+#endif
+    } else {
+      bias = CompareData<float>(bin_buf, tensor->ElementsNum(), reinterpret_cast<float *>(outputs));
+    }
+    if (bias >= 0) {
+      total_bias += bias;
+      total_size++;
+    } else {
+      has_error = true;
+      break;
+    }
+    i++;
+    delete[] bin_buf;
+  }
+
+  if (!has_error) {
+    float mean_bias;
+    if (total_size != 0) {
+      mean_bias = total_bias / total_size * 100;
+    } else {
+      mean_bias = 0;
+    }
+
+    std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%"
+              << " threshold is:" << this->flags_->accuracy_threshold_ << std::endl;
+    std::cout << "=======================================================" << std::endl << std::endl;
+
+    if (mean_bias > this->flags_->accuracy_threshold_) {
+      MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
+      std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
+      return RET_ERROR;
+    } else {
+      return RET_OK;
+    }
+  } else {
+    MS_LOG(ERROR) << "Error in CompareData";
+    std::cerr << "Error in CompareData" << std::endl;
+    std::cout << "=======================================================" << std::endl << std::endl;
+    return RET_ERROR;
+  }
+}
+int NetTrain::CompareOutputLite(const std::unique_ptr<session::LiteSession> &lite_session) {
+  std::cout << "================ Comparing Forward Output data ================" << std::endl;
+  float total_bias = 0;
+  int total_size = 0;
+  bool has_error = false;
+  auto tensors_list = lite_session->GetOutputs();
+  if (tensors_list.empty()) {
+    MS_LOG(ERROR) << "Cannot find output tensors, get model output failed";
+    return RET_ERROR;
+  }
+  mindspore::tensor::MSTensor *tensor = nullptr;
+  int i = 1;
+  for (auto it = tensors_list.begin(); it != tensors_list.end(); ++it) {
+    tensor = lite_session->GetOutputByTensorName(it->first);
+    std::cout << "output is tensor " << it->first << "\n";
+    auto outputs = tensor->MutableData();
+    size_t size;
+    std::string output_file = flags_->data_file_ + std::to_string(i) + ".bin";
+    auto *bin_buf = ReadFileBuf(output_file.c_str(), &size);
+    if (bin_buf == nullptr) {
+      MS_LOG(ERROR) << "ReadFile return nullptr";
+      return RET_ERROR;
+    }
     if (size != tensor->Size()) {
       MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size()
                     << ", read size: " << size;
@@ -288,7 +373,7 @@ int NetTrain::MarkAccuracy() {
   }
   session_->Eval();
 
-  auto status = session_->RunGraph();
+  auto status = session_->RunGraph(before_call_back_, after_call_back_);
   if (status != RET_OK) {
     MS_LOG(ERROR) << "Inference error " << status;
     std::cerr << "Inference error " << status << std::endl;
@@ -303,6 +388,40 @@ int NetTrain::MarkAccuracy() {
   }
   return RET_OK;
 }
+// Prints the current inputs, runs the graph of `lite_session` and checks the result with CompareOutputLite().
+// Returns RET_ERROR for an unsupported input data type, the failing call's status, or RET_OK.
+int NetTrain::MarkAccuracyLite(const std::unique_ptr<session::LiteSession> &lite_session) {
+  MS_LOG(INFO) << "MarkAccuracy";
+  std::cout << "MarkAccuracy" << std::endl;
+  for (auto &input : ms_inputs_) {
+    switch (input->data_type()) {
+      case TypeId::kNumberTypeFloat:
+      case TypeId::kNumberTypeFloat32:
+        PrintInputData<float>(input);
+        break;
+      case TypeId::kNumberTypeInt32:
+        PrintInputData<int>(input);
+        break;
+      default:
+        MS_LOG(ERROR) << "Datatype " << input->data_type() << " is not supported.";
+        return RET_ERROR;
+    }
+  }
+  const auto run_status = lite_session->RunGraph();
+  if (run_status != RET_OK) {
+    MS_LOG(ERROR) << "Inference error " << run_status;
+    std::cerr << "Inference error " << run_status << std::endl;
+    return run_status;
+  }
+
+  const auto compare_status = CompareOutputLite(lite_session);
+  if (compare_status != RET_OK) {
+    MS_LOG(ERROR) << "Compare output error " << compare_status;
+    std::cerr << "Compare output error " << compare_status << std::endl;
+    return compare_status;
+  }
+  return RET_OK;
+}
 
 int NetTrain::RunExportedNet() {
   auto start_prepare_time = GetTimeUs();
@@ -375,6 +494,80 @@ int NetTrain::RunExportedNet() {
   return RET_OK;
 }
 
+// Imports the model stored in `file_name`, compiles it in a plain LiteSession, loads the input data and, when
+// flags_->data_file_ is set, runs MarkAccuracyLite() on it. Returns RET_OK on success, an error status otherwise.
+int NetTrain::RunExportedNetLite(std::string file_name) {
+  auto start_prepare_time = GetTimeUs();
+  // Load graph
+  std::string model_name = file_name.substr(file_name.find_last_of(DELIM_SLASH) + 1);
+
+  MS_LOG(INFO) << "start reading exported model file";
+  std::cout << "reading " << file_name << std::endl;
+  auto context = std::make_shared<Context>();
+  if (context == nullptr) {
+    MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
+    std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
+    return RET_ERROR;
+  }
+
+  if (flags_->cpu_bind_mode_ == 2) {
+    context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = MID_CPU;
+  } else if (flags_->cpu_bind_mode_ == 1) {
+    context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = HIGHER_CPU;
+  } else {
+    context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
+  }
+
+  context->thread_num_ = flags_->num_threads_;
+
+  // Owning the model through unique_ptr releases it on every return path, including a failed CreateSession().
+  std::unique_ptr<mindspore::lite::Model> model(mindspore::lite::Model::Import(file_name.c_str()));
+  if (model == nullptr) {
+    MS_LOG(ERROR) << "create model for lite session failed";
+    return RET_ERROR;
+  }
+  auto lite_session = std::unique_ptr<session::LiteSession>(session::LiteSession::CreateSession(context.get()));
+  if (lite_session == nullptr) {
+    MS_LOG(ERROR) << "ExportedFile CreateSession failed while running " << model_name.c_str();
+    std::cout << "CreateSession failed while running " << model_name.c_str() << std::endl;
+    return RET_ERROR;
+  }
+  if (lite_session->CompileGraph(model.get()) != RET_OK) {
+    MS_LOG(ERROR) << "Cannot compile model";
+    return RET_ERROR;
+  }
+  ms_inputs_ = lite_session->GetInputs();
+  auto end_prepare_time = GetTimeUs();
+  MS_LOG(INFO) << "Exported model PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
+  std::cout << "Exported model PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms" << std::endl;
+
+  // Load input
+  MS_LOG(INFO) << "start generate input data";
+  auto status = LoadInput();
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "Generate input data error";
+    return status;
+  }
+  if (!flags_->data_file_.empty()) {
+    MS_LOG(INFO) << "Check accuracy for exported model";
+    std::cout << "Check accuracy for exported model " << std::endl;
+    status = MarkAccuracyLite(lite_session);
+    // The entries of data_ are released whether or not the accuracy check passed.
+    for (auto &data : data_) {
+      data.second->shape.clear();
+      data.second->data.clear();
+      delete data.second;
+    }
+    data_.clear();
+    if (status != RET_OK) {
+      MS_LOG(ERROR) << "Run MarkAccuracy on exported model error: " << status;
+      std::cout << "Run MarkAccuracy on exported model error: " << status << std::endl;
+      return status;
+    }
+  }
+  return RET_OK;
+}
+
 int NetTrain::RunNetTrain() {
   auto start_prepare_time = GetTimeUs();
   // Load graph
@@ -451,6 +644,17 @@ int NetTrain::RunNetTrain() {
       return status;
     }
   }
+  status = CheckExecute(model);
+  if (status != RET_OK) {
+    MS_LOG(ERROR) << "Run CheckExecute error: " << status;
+    std::cout << "Run CheckExecute error: " << status << std::endl;
+    return status;
+  }
+  return RET_OK;
+}
+
+int NetTrain::CheckExecute(mindspore::lite::Model *model) {  // export the model if a flag asks for it, then run the export
+  int status;  // assigned on every path before it is read
   if (!flags_->export_file_.empty()) {
     auto ret = Model::Export(model, flags_->export_file_.c_str());
     if (ret != RET_OK) {
@@ -459,12 +663,33 @@ int NetTrain::RunNetTrain() {
       return RET_ERROR;
     }
     delete session_;
+    session_ = nullptr;  // session_ was just deleted; do not keep the stale pointer
     status = RunExportedNet();
     if (status != RET_OK) {
       MS_LOG(ERROR) << "Run Exported model error: " << status;
       std::cout << "Run Exported model error: " << status << std::endl;
       return status;
     }
+  } else {  // the inference export below is only attempted when export_file_ is empty
+    if (!flags_->inference_file_.empty()) {
+      auto tick = GetTimeUs();  // start of the ExportInference() duration printed below
+      status = session_->ExportInference(flags_->inference_file_);
+      if (status != RET_OK) {
+        MS_LOG(ERROR) << "Save model error: " << status;
+        std::cout << "Save model error: " << status << std::endl;
+        return status;
+      }
+      std::cout << "ExportInference() execution time is " << GetTimeUs() - tick << "us\n";
+      delete session_;
+      session_ = nullptr;  // same reset as in the export_file_ branch
+
+      status = RunExportedNetLite(flags_->inference_file_ + ".ms");  // ExportInference adds the '.ms' extension
+      if (status != RET_OK) {
+        MS_LOG(ERROR) << "Running saved model error: " << status;
+        std::cout << "Running saved model error: " << status << std::endl;
+        return status;
+      }
+    }
   }
   return RET_OK;
 }
@@ -554,6 +779,11 @@ int NetTrain::InitCallbackParameter() {
         case kNumberTypeInt32:
           std::cout << TensorSum<int>(output, tensor_size);
           break;
+#ifdef ENABLE_FP16
+        case kNumberTypeFloat16:
+          std::cout << TensorSum<float16_t>(output, tensor_size);
+          break;
+#endif
         default:
           std::cout << "unsupported type:" << type;
           break;
diff --git a/mindspore/lite/tools/benchmark_train/net_train.h b/mindspore/lite/tools/benchmark_train/net_train.h
index 0d2839825e..252f3d31be 100644
--- a/mindspore/lite/tools/benchmark_train/net_train.h
+++ b/mindspore/lite/tools/benchmark_train/net_train.h
@@ -30,6 +30,7 @@
 #include <cfloat>
 #include <utility>
 #include <algorithm>
+
 #include "tools/common/flag_parser.h"
 #include "src/common/file_utils.h"
 #include "src/common/utils.h"
@@ -51,14 +52,15 @@ struct MS_API CheckTensor {
 };
 
 template <typename T>
-T TensorSum(void *data, int size) {
+float TensorSum(void *data, int size) {  // sums `size` elements of type T read from `data`; the result is always float
   T *typed_data = reinterpret_cast<T *>(data);
-  T sum = static_cast<T>(0);
+  float sum = 0.f;  // NOTE(review): float accumulator for every T; integer sums beyond 2^24 are no longer exact
   for (int i = 0; i < size; i++) {
-    sum += typed_data[i];
+    sum += static_cast<float>(typed_data[i]);  // each element is converted to float before it is added
   }
   return sum;
 }
+
 class MS_API NetTrainFlags : public virtual FlagParser {
  public:
   NetTrainFlags() {
@@ -77,6 +79,7 @@ class MS_API NetTrainFlags : public virtual FlagParser {
     AddFlag(&NetTrainFlags::layer_checksum_, "layerCheckSum", "layer output checksum print (debug)", false);
     AddFlag(&NetTrainFlags::enable_fp16_, "enableFp16", "Enable float16", false);
     AddFlag(&NetTrainFlags::loss_name_, "lossName", "loss layer name", "");
+    AddFlag(&NetTrainFlags::inference_file_, "inferenceFile", "MS file to export inference model", "");
   }
 
   ~NetTrainFlags() override = default;
@@ -109,6 +112,7 @@ class MS_API NetTrainFlags : public virtual FlagParser {
   bool layer_checksum_ = false;
   std::vector<std::vector<int64_t>> resize_dims_;
   std::string loss_name_ = "";
+  std::string inference_file_ = "";
 };
 
 class MS_API NetTrain {
@@ -166,6 +170,7 @@ class MS_API NetTrain {
     for (int j = 0; j < std::min(50, size); j++) {
       std::cout << refOutput[j] << " ";
     }
+    std::cout << std::endl;
     for (int j = 0; j < size; j++) {
       if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
         std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
@@ -174,7 +179,7 @@ class MS_API NetTrain {
       }
 
       auto tolerance = absoluteTolerance + relativeTolerance * fabs(refOutput[j]);
-      auto absoluteError = std::fabs(msTensorData[j] - refOutput[j]);
+      auto absoluteError = std::fabs(static_cast<float>(msTensorData[j]) - refOutput[j]);
       if (absoluteError > tolerance) {
         if (fabs(refOutput[j]) == 0) {
           if (absoluteError > 1e-5) {
@@ -208,6 +213,10 @@ class MS_API NetTrain {
   int MarkAccuracy();
 
  private:
+  int RunExportedNetLite(std::string file_name);
+  int MarkAccuracyLite(const std::unique_ptr<session::LiteSession> &lite_session);
+  int CompareOutputLite(const std::unique_ptr<session::LiteSession> &lite_session);
+  int CheckExecute(mindspore::lite::Model *model);
   NetTrainFlags *flags_;
   session::TrainSession *session_ = nullptr;
   std::vector<mindspore::tensor::MSTensor *> ms_inputs_;