From: @ehaleva Reviewed-by: @HilbertDavid,@hangangqiang Signed-off-by: @HilbertDavid pull/15860/MERGE
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
| #define MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
| #ifndef MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_ | |||
| #define MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <tuple> | |||
| @@ -115,6 +115,10 @@ class TrainSession : public session::LiteSession { | |||
| loss_name_ = loss_name; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| /// \brief Save model for inference (LiteSession) | |||
| /// \param[in] fb_name pretrained model file name prefix. '.ms' is added as extension. | |||
| /// \return STATUS as an error code of the set operation, STATUS is defined in errorcode.h | |||
| virtual int ExportInference(std::string fb_name) { return mindspore::lite::RET_ERROR; } | |||
| protected: | |||
| bool train_mode_ = false; | |||
| @@ -125,4 +129,4 @@ class TrainSession : public session::LiteSession { | |||
| }; | |||
| } // namespace session | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_INCLUDE_TRAIN_SESSION_H_ | |||
| #endif // MINDSPORE_LITE_INCLUDE_TRAIN_TRAIN_SESSION_H_ | |||
| @@ -443,11 +443,6 @@ table Crop { | |||
| offsets: [long]; | |||
| } | |||
| table CumSum { | |||
| exclusive: bool = false; | |||
| reverse: bool = false; | |||
| } | |||
| table CustomExtractFeatures { | |||
| } | |||
| @@ -1111,6 +1106,11 @@ table LogSoftmax { | |||
| table Call { | |||
| } | |||
| table CumSum { | |||
| exclusive: bool; | |||
| reverse: bool; | |||
| } | |||
| table Custom { | |||
| type: string; | |||
| attr: [Attribute]; | |||
| @@ -123,6 +123,7 @@ if(SUPPORT_TRAIN) | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_metrics.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/accuracy_monitor.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/classification_train_accuracy_monitor.cc | |||
| ${CMAKE_CURRENT_SOURCE_DIR}/train/train_export.cc | |||
| ) | |||
| if(ENABLE_V0) | |||
| set(LITE_SRC | |||
| @@ -0,0 +1,202 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #define _STUB | |||
| #include "src/train/train_export.h" | |||
| #include <unistd.h> | |||
| #include <sys/stat.h> | |||
| #include <fstream> | |||
| #include <utility> | |||
| #include <map> | |||
| #include <set> | |||
| #include "schema/inner/model_generated.h" | |||
| #include "src/train/train_utils.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| std::vector<uint8_t> TrainExport::CreateData(const mindspore::lite::Tensor *tensor) { | |||
| uint8_t *tensor_data = reinterpret_cast<uint8_t *>(tensor->data_c()); | |||
| auto size = tensor->Size(); | |||
| std::vector<uint8_t> data(tensor_data, tensor_data + size); | |||
| return data; | |||
| } | |||
| std::unique_ptr<schema::TensorT> TrainExport::CreateTensor(const mindspore::lite::Tensor *tensor, | |||
| schema::Tensor *scTensor) { | |||
| auto tensorT = std::make_unique<schema::TensorT>(); | |||
| tensorT->nodeType = scTensor->nodeType(); | |||
| tensorT->dataType = tensor->data_type(); | |||
| tensorT->dims = tensor->shape(); | |||
| tensorT->format = tensor->format(); | |||
| tensorT->name = tensor->tensor_name(); | |||
| tensorT->refCount = 0; | |||
| tensorT->offset = 0; | |||
| tensorT->enableHuffmanCode = false; | |||
| if ((tensorT->nodeType == NodeType_ValueNode) && (scTensor->data() != nullptr) && (scTensor->data()->size() > 0)) { | |||
| tensorT->data = CreateData(tensor); | |||
| } | |||
| for (auto quant_param : tensor->quant_params()) { | |||
| auto quantParamT = std::make_unique<schema::QuantParamT>(); | |||
| quantParamT->scale = quant_param.scale; | |||
| quantParamT->zeroPoint = quant_param.zeroPoint; | |||
| quantParamT->min = 0; | |||
| quantParamT->max = 0; | |||
| quantParamT->narrowRange = true; | |||
| quantParamT->numBits = quant_param.bitNum; | |||
| quantParamT->inited = quant_param.inited; | |||
| quantParamT->varCorr = quant_param.var_corr; | |||
| quantParamT->meanCorr = quant_param.mean_corr; | |||
| quantParamT->dstDtype = quant_param.dstDtype; | |||
| quantParamT->roundType = quant_param.roundType; | |||
| quantParamT->multiplier = quant_param.multiplier; | |||
| tensorT->quantParams.emplace_back(std::move(quantParamT)); | |||
| } | |||
| tensorT->quantClusters = tensor->quant_clusters(); | |||
| return tensorT; | |||
| } | |||
| mindspore::lite::Model::Node *TrainExport::FindNode(const mindspore::kernel::LiteKernel *kernel) { | |||
| auto nodes = model_->all_nodes_; | |||
| auto it = std::find_if(nodes.begin(), nodes.end(), | |||
| [&kernel](mindspore::lite::Model::Node *n) { return (kernel->name() == n->name_); }); | |||
| if (it == nodes.end()) { | |||
| return nullptr; | |||
| } | |||
| return *it; | |||
| } | |||
| std::unique_ptr<schema::CNodeT> TrainExport::CreateCNode(const mindspore::kernel::LiteKernel *kernel, | |||
| std::vector<uint32_t> inputIndex, | |||
| std::vector<uint32_t> outputIndex) { | |||
| auto cnodeT = std::make_unique<schema::CNodeT>(); | |||
| cnodeT->inputIndex = inputIndex; | |||
| cnodeT->outputIndex = outputIndex; | |||
| cnodeT->name = kernel->name(); | |||
| cnodeT->quantType = schema::QuantType_QUANT_NONE; | |||
| // find kernel in model | |||
| auto *node = FindNode(kernel); | |||
| if (node == nullptr) { | |||
| MS_LOG(ERROR) << "cannot find kernel " + kernel->name() + " in model"; | |||
| return nullptr; | |||
| } | |||
| auto primitive = reinterpret_cast<schema::Primitive *>(const_cast<void *>(node->primitive_)); | |||
| cnodeT->primitive = std::unique_ptr<schema::PrimitiveT>(primitive->UnPack()); | |||
| return cnodeT; | |||
| } | |||
| int TrainExport::Export(const std::vector<mindspore::kernel::LiteKernel *> &kernels, | |||
| const std::vector<mindspore::lite::Tensor *> &tensors, | |||
| const std::vector<std::string> &output_names) { | |||
| std::map<size_t, size_t> remap; | |||
| std::vector<size_t> map_index; | |||
| std::set<size_t> out_set; | |||
| int tensor_idx = 0; | |||
| auto meta_graph = std::make_unique<schema::MetaGraphT>(); | |||
| meta_graph->fmkType = 3; | |||
| meta_graph->name = model_->name_; | |||
| meta_graph->version = model_->version_; | |||
| for (const auto kernel : kernels) { | |||
| std::vector<uint32_t> in_idx, out_idx; | |||
| for (const auto tensor : kernel->in_tensors()) { | |||
| size_t id = TSFindTensor(tensors, tensor); | |||
| if (id == tensors.size()) { | |||
| MS_LOG(ERROR) << "cannot find tensor " + tensor->ToString() + " in model"; | |||
| return RET_ERROR; | |||
| } | |||
| auto it = remap.find(id); | |||
| if (it == remap.end()) { | |||
| remap[id] = tensor_idx; | |||
| in_idx.push_back(tensor_idx); | |||
| map_index.push_back(id); | |||
| tensor_idx++; | |||
| } else { | |||
| in_idx.push_back(it->second); | |||
| } | |||
| } | |||
| for (const auto tensor : kernel->out_tensors()) { | |||
| size_t id = TSFindTensor(tensors, tensor); | |||
| if (id == tensors.size()) { | |||
| MS_LOG(ERROR) << "cannot find tensor " + tensor->ToString() + " in model"; | |||
| return RET_ERROR; | |||
| } | |||
| out_set.insert(id); | |||
| auto it = remap.find(id); | |||
| if (it == remap.end()) { | |||
| remap[id] = tensor_idx; | |||
| map_index.push_back(id); | |||
| out_idx.push_back(tensor_idx); | |||
| out_set.insert(tensor_idx); | |||
| tensor_idx++; | |||
| } else { | |||
| out_idx.push_back(it->second); | |||
| out_set.insert(it->second); | |||
| } | |||
| } | |||
| auto cnode = CreateCNode(kernel, in_idx, out_idx); | |||
| meta_graph->nodes.emplace_back(std::move(cnode)); | |||
| } | |||
| for (auto id : map_index) { | |||
| mindspore::lite::Tensor *tensor = tensors.at(id); | |||
| schema::Tensor *scTensor = model_->all_tensors_.at(id); | |||
| auto tensorT = CreateTensor(tensor, scTensor); | |||
| // find a tensor which is not an output | |||
| if (out_set.find(id) == out_set.end()) { | |||
| if ((tensorT->nodeType == NodeType_ValueNode) && (tensorT->data.size() == 0)) { | |||
| meta_graph->inputIndex.push_back(remap[id]); | |||
| } | |||
| } | |||
| // find output tensor | |||
| if (std::find(output_names.begin(), output_names.end(), tensor->tensor_name()) != output_names.end()) { | |||
| meta_graph->outputIndex.push_back(remap[id]); | |||
| } | |||
| meta_graph->allTensors.emplace_back(std::move(tensorT)); | |||
| } | |||
| auto graph = meta_graph.release(); | |||
| int err = SaveToFile(graph, file_name_); | |||
| if (err != RET_OK) { | |||
| MS_LOG(ERROR) << "failed to save flatbuffer file " << file_name_; | |||
| } | |||
| delete graph; | |||
| return err; | |||
| } | |||
| int TrainExport::SaveToFile(const schema::MetaGraphT *graph, const std::string &outputPath) { | |||
| flatbuffers::FlatBufferBuilder builder(1024); | |||
| auto offset = schema::MetaGraph::Pack(builder, graph); | |||
| builder.Finish(offset); | |||
| schema::FinishMetaGraphBuffer(builder, offset); | |||
| int size = builder.GetSize(); | |||
| auto content = builder.GetBufferPointer(); | |||
| if (content == nullptr) { | |||
| MS_LOG(ERROR) << "GetBufferPointer nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (access((outputPath + ".ms").c_str(), F_OK) == 0) { | |||
| chmod((outputPath + ".ms").c_str(), S_IWUSR); | |||
| } | |||
| std::ofstream output(outputPath + ".ms", std::ofstream::binary); | |||
| if (!output.is_open()) { | |||
| MS_LOG(ERROR) << "Can not open output file: " << outputPath << ".ms"; | |||
| return RET_ERROR; | |||
| } | |||
| output.write((const char *)content, size); | |||
| output.close(); | |||
| chmod((outputPath + ".ms").c_str(), S_IRUSR); | |||
| return RET_OK; | |||
| } | |||
| } // namespace lite | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,58 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_ | |||
| #define MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_ | |||
| #include <string> | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "schema/inner/model_generated.h" | |||
| #include "src/lite_kernel.h" | |||
| #include "src/lite_model.h" | |||
| namespace mindspore { | |||
| #ifndef _STUB | |||
| namespace schema { | |||
| struct CNodeT; | |||
| struct TensorT; | |||
| struct MetaGraphT; | |||
| } // namespace schema | |||
| #endif | |||
| namespace lite { | |||
/// \brief Serializes a trained model's inference graph (kernels + tensors) into a
///        flatbuffer ".ms" file. Construct with the target file-name prefix and the
///        source model, then call Export().
class TrainExport {
 public:
  /// \param[in] file_name output file-name prefix; ".ms" is appended on save.
  /// \param[in] model source model the exported kernels/tensors belong to (not owned).
  TrainExport(const std::string file_name, const mindspore::lite::Model *model)
      : model_(model), file_name_(file_name) {}
  virtual ~TrainExport() {}
  /// \brief Export the given kernels and tensors; output_names selects graph outputs.
  /// \return STATUS error code as defined in errorcode.h.
  int Export(const std::vector<mindspore::kernel::LiteKernel *> &kernels,
             const std::vector<mindspore::lite::Tensor *> &tensors, const std::vector<std::string> &output_names);

 protected:
  /// \brief Copy a tensor's payload into an owned byte buffer (virtual for test stubs).
  virtual std::vector<uint8_t> CreateData(const mindspore::lite::Tensor *tensor);

 private:
  const Model *model_;      // not owned; must outlive this exporter
  std::string file_name_;   // output prefix without the ".ms" extension
  mindspore::lite::Model::Node *FindNode(const mindspore::kernel::LiteKernel *kernel);
  std::unique_ptr<schema::TensorT> CreateTensor(const mindspore::lite::Tensor *tensor, schema::Tensor *scTensor);
  std::unique_ptr<schema::CNodeT> CreateCNode(const mindspore::kernel::LiteKernel *kernel,
                                              std::vector<uint32_t> inputIndex, std::vector<uint32_t> outputIndex);
  int SaveToFile(const schema::MetaGraphT *graph, const std::string &outputPath);
};
| }; // namespace lite | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_SRC_TRAIN_TRAIN_EXPORT_H_ | |||
| @@ -37,25 +37,12 @@ | |||
| #include "src/runtime/kernel/arm/fp32_grad/convolution.h" | |||
| #include "src/runtime/kernel/arm/fp32/batchnorm_fp32.h" | |||
| #include "src/common/tensor_util.h" | |||
| #include "src/train/train_utils.h" | |||
| #include "src/train/train_export.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| static size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter) { | |||
| for (size_t i = 0; i < where.size(); i++) { | |||
| if (where[i] == searchParameter) { | |||
| return i; | |||
| } | |||
| } | |||
| return where.size(); | |||
| } | |||
| static kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where, | |||
| const std::string &searchParameter) { | |||
| auto it = std::find_if(where.begin(), where.end(), | |||
| [&searchParameter](const kernel::LiteKernel *k) { return (k->name() == searchParameter); }); | |||
| return *it; | |||
| } | |||
| TrainSession::TrainSession() { | |||
| is_train_session_ = true; | |||
| #ifdef ENABLE_V0 | |||
| @@ -476,6 +463,16 @@ int TrainSession::SetLossName(std::string loss_name) { | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int TrainSession::ExportInference(std::string file_name) { | |||
| bool orig_train_state = IsTrain(); | |||
| Eval(); | |||
| TrainExport texport(file_name, model_); | |||
| int status = texport.Export(inference_kernels_, tensors_, GetOutputTensorNames()); | |||
| if (orig_train_state) Train(); | |||
| return status; | |||
| } | |||
| } // namespace lite | |||
| session::TrainSession *session::TrainSession::CreateSession(mindspore::lite::Model *model, lite::Context *context, | |||
| @@ -87,6 +87,7 @@ class TrainSession : virtual public session::TrainSession, virtual public lite:: | |||
| } | |||
| return outputs; | |||
| } | |||
| int ExportInference(std::string file_name) override; | |||
| protected: | |||
| void AllocWorkSpace(); | |||
| @@ -19,10 +19,26 @@ | |||
| #include "include/errorcode.h" | |||
| #include "include/ms_tensor.h" | |||
| #include "src/common/utils.h" | |||
| #include "src/lite_kernel.h" | |||
| namespace mindspore { | |||
| namespace lite { | |||
| size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter) { | |||
| for (size_t i = 0; i < where.size(); i++) { | |||
| if (where[i] == searchParameter) { | |||
| return i; | |||
| } | |||
| } | |||
| return where.size(); | |||
| } | |||
| kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where, const std::string &searchParameter) { | |||
| auto it = std::find_if(where.begin(), where.end(), | |||
| [&searchParameter](const kernel::LiteKernel *k) { return (k->name() == searchParameter); }); | |||
| return *it; | |||
| } | |||
| float CalculateSparseClassification(tensor::MSTensor *input, tensor::MSTensor *output) { | |||
| if ((input->shape().size() != 1) || (input->data_type() != kNumberTypeInt32) || (output->shape().size() != 2)) { | |||
| MS_LOG(WARNING) << "SparceClassification got a " << input->shape() << "-D input tensor, " << output->shape() | |||
| @@ -16,11 +16,21 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_ | |||
| #define MINDSPORE_LITE_SRC_TRAIN_TRAIN_UTILS_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include "include/ms_tensor.h" | |||
| #include "src/tensor.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| class LiteKernel; | |||
| } | |||
| namespace lite { | |||
| kernel::LiteKernel *TSFindKernel(const std::vector<kernel::LiteKernel *> &where, const std::string &searchParameter); | |||
| size_t TSFindTensor(const std::vector<lite::Tensor *> &where, const lite::Tensor *searchParameter); | |||
| float CalculateSparseClassification(tensor::MSTensor *input, tensor::MSTensor *output); | |||
| float CalculateOneHotClassification(tensor::MSTensor *input, tensor::MSTensor *output); | |||
| @@ -292,6 +292,8 @@ if(SUPPORT_TRAIN) | |||
| ${LITE_DIR}/src/train/train_populate_parameter.cc | |||
| ${LITE_DIR}/src/train/train_populate_parameter_v0.cc | |||
| ${LITE_DIR}/src/train/train_session.cc | |||
| ${LITE_DIR}/src/train/train_export.cc | |||
| ${LITE_DIR}/src/train/train_utils.cc | |||
| ${LITE_DIR}/src/train/transfer_session.cc | |||
| ${LITE_DIR}/src/lite_session.cc | |||
| ) | |||
| @@ -89,13 +89,15 @@ function Run_x86() { | |||
| model_name=${line_array[0]}'_train_quant' | |||
| accuracy_limit=${line_array[2]} | |||
| fi | |||
| if [[ "${save_lite}" == "1" ]]; then | |||
| inference_file="${ms_models_path}/${model_name}_infer" | |||
| fi | |||
| echo ${model_name} >> "${run_x86_log_file}" | |||
| ${run_valgrind}./tools/benchmark_train/benchmark_train \ | |||
| --modelFile=${ms_models_path}/${model_name}.ms \ | |||
| --inDataFile=${train_io_path}/${model_prefix}_input1.bin,${train_io_path}/${model_prefix}_input2.bin \ | |||
| --expectedDataFile=${train_io_path}/${model_prefix}_output --epochs=${epoch_num} --numThreads=${threads} \ | |||
| --accuracyThreshold=${accuracy_limit} >> "${run_x86_log_file}" | |||
| --accuracyThreshold=${accuracy_limit} --inferenceFile=${inference_file} >> "${run_x86_log_file}" | |||
| if [ $? = 0 ]; then | |||
| run_result='x86: '${model_name}' pass'; echo ${run_result} >> ${run_benchmark_train_result_file} | |||
| else | |||
| @@ -138,8 +140,8 @@ function Run_arm() { | |||
| # If build with minddata, copy the minddata related libs | |||
| cd ${benchmark_train_test_path} || exit 1 | |||
| if [ -f ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/lib/libminddata-lite.so ]; then | |||
| cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libjpeg.so ${benchmark_train_test_path}/libjpeg.so || exit 1 | |||
| cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libturbojpeg.so ${benchmark_train_test_path}/libturbojpeg.so || exit 1 | |||
| cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libjpeg.so* ${benchmark_train_test_path}/ || exit 1 | |||
| cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/third_party/libjpeg-turbo/lib/libturbojpeg.so* ${benchmark_train_test_path}/ || exit 1 | |||
| cp -a ${arm_path}/mindspore-lite-${version_arm}-train-android-${process_unit}/train/lib/libminddata-lite.so ${benchmark_train_test_path}/libminddata-lite.so || exit 1 | |||
| fi | |||
| if [ "$1" == arm64 ]; then | |||
| @@ -178,8 +180,9 @@ function Run_arm() { | |||
| run_result=$1': '${model_name}' irrelevant'; echo ${run_result} >> ${run_benchmark_train_result_file} | |||
| continue | |||
| fi | |||
| if [[ "${save_lite}" == "1" ]]; then | |||
| inference_file="${ms_models_path}/${model_name}_infer" | |||
| fi | |||
| # run benchmark_train test without clib data | |||
| echo ${model_name} >> "${run_arm_log_file}" | |||
| adb -s ${device_id} push ${train_io_path}/${model_prefix}_input*.bin ${train_io_path}/${model_prefix}_output*.bin /data/local/tmp/benchmark_train_test >> ${adb_push_log_file} | |||
| @@ -198,7 +201,7 @@ function Run_arm() { | |||
| --modelFile=${model_name}.ms \ | |||
| --inDataFile=${tmp_dir}/${model_prefix}_input1.bin,${tmp_dir}/${model_prefix}_input2.bin \ | |||
| --expectedDataFile=${tmp_dir}/${model_prefix}_output \ | |||
| --numThreads=${threads} --accuracyThreshold=${accuracy_limit} | |||
| --numThreads=${threads} --accuracyThreshold=${accuracy_limit} --inferenceFile=${inference_file} | |||
| ENDM | |||
| ) | |||
| echo "${adb_cmd}" >> ${run_arm_log_file} | |||
| @@ -249,7 +252,7 @@ models_mindspore_train_config=${basepath}/models_ms_train.cfg | |||
| epoch_num=1 | |||
| threads=2 | |||
| train_io_path="" | |||
| while getopts "r:M:c:m:d:i:e:vt:q:D" opt; do | |||
| while getopts "r:M:c:m:d:i:e:vt:q:DF" opt; do | |||
| case ${opt} in | |||
| r) | |||
| release_path=${OPTARG} | |||
| @@ -291,6 +294,8 @@ while getopts "r:M:c:m:d:i:e:vt:q:D" opt; do | |||
| t) | |||
| epoch_num=${OPTARG} | |||
| echo "train epoch num is ${epoch_num}" | |||
| ;; | |||
| F) save_lite=1 | |||
| ;; | |||
| ?) | |||
| echo "unknown para" | |||
| @@ -342,7 +347,7 @@ if [[ $enable_export == 1 ]]; then | |||
| Run_Export | |||
| Print_Result ${export_result_file} | |||
| fi | |||
| fi | |||
| # Write converter result to temp file | |||
| run_converter_log_file=${logs_path}/run_converter_log.txt | |||
| @@ -20,6 +20,9 @@ | |||
| #undef __STDC_FORMAT_MACROS | |||
| #include <algorithm> | |||
| #include <utility> | |||
| #ifdef ENABLE_NEON | |||
| #include <arm_neon.h> | |||
| #endif | |||
| #include "src/common/common.h" | |||
| #include "include/ms_tensor.h" | |||
| #include "include/context.h" | |||
| @@ -178,6 +181,88 @@ int NetTrain::CompareOutput() { | |||
| MS_LOG(ERROR) << "ReadFile return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (flags_->enable_fp16_ && tensor->data_type() == kNumberTypeFloat16) { | |||
| if (static_cast<int>(size / sizeof(float)) != tensor->ElementsNum()) { | |||
| MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size() | |||
| << ", read size: " << size / sizeof(float); | |||
| return RET_ERROR; | |||
| } | |||
| } else { | |||
| if (size != tensor->Size()) { | |||
| MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size() | |||
| << ", read size: " << size; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| float bias = 0.f; | |||
| if (flags_->enable_fp16_ && tensor->data_type() == kNumberTypeFloat16) { | |||
| #ifdef ENABLE_FP16 | |||
| bias = CompareData<float16_t>(bin_buf, tensor->ElementsNum(), reinterpret_cast<float16_t *>(outputs)); | |||
| #endif | |||
| } else { | |||
| bias = CompareData<float>(bin_buf, tensor->ElementsNum(), reinterpret_cast<float *>(outputs)); | |||
| } | |||
| if (bias >= 0) { | |||
| total_bias += bias; | |||
| total_size++; | |||
| } else { | |||
| has_error = true; | |||
| break; | |||
| } | |||
| i++; | |||
| delete[] bin_buf; | |||
| } | |||
| if (!has_error) { | |||
| float mean_bias; | |||
| if (total_size != 0) { | |||
| mean_bias = total_bias / total_size * 100; | |||
| } else { | |||
| mean_bias = 0; | |||
| } | |||
| std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" | |||
| << " threshold is:" << this->flags_->accuracy_threshold_ << std::endl; | |||
| std::cout << "=======================================================" << std::endl << std::endl; | |||
| if (mean_bias > this->flags_->accuracy_threshold_) { | |||
| MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%"; | |||
| std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl; | |||
| return RET_ERROR; | |||
| } else { | |||
| return RET_OK; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Error in CompareData"; | |||
| std::cerr << "Error in CompareData" << std::endl; | |||
| std::cout << "=======================================================" << std::endl << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| int NetTrain::CompareOutputLite(const std::unique_ptr<session::LiteSession> &lite_session) { | |||
| std::cout << "================ Comparing Forward Output data ================" << std::endl; | |||
| float total_bias = 0; | |||
| int total_size = 0; | |||
| bool has_error = false; | |||
| auto tensors_list = lite_session->GetOutputs(); | |||
| if (tensors_list.empty()) { | |||
| MS_LOG(ERROR) << "Cannot find output tensors, get model output failed"; | |||
| return RET_ERROR; | |||
| } | |||
| mindspore::tensor::MSTensor *tensor = nullptr; | |||
| int i = 1; | |||
| for (auto it = tensors_list.begin(); it != tensors_list.end(); ++it) { | |||
| tensor = lite_session->GetOutputByTensorName(it->first); | |||
| std::cout << "output is tensor " << it->first << "\n"; | |||
| auto outputs = tensor->MutableData(); | |||
| size_t size; | |||
| std::string output_file = flags_->data_file_ + std::to_string(i) + ".bin"; | |||
| auto *bin_buf = ReadFileBuf(output_file.c_str(), &size); | |||
| if (bin_buf == nullptr) { | |||
| MS_LOG(ERROR) << "ReadFile return nullptr"; | |||
| return RET_ERROR; | |||
| } | |||
| if (size != tensor->Size()) { | |||
| MS_LOG(ERROR) << "Output buffer and output file differ by size. Tensor size: " << tensor->Size() | |||
| << ", read size: " << size; | |||
| @@ -288,7 +373,7 @@ int NetTrain::MarkAccuracy() { | |||
| } | |||
| session_->Eval(); | |||
| auto status = session_->RunGraph(); | |||
| auto status = session_->RunGraph(before_call_back_, after_call_back_); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Inference error " << status; | |||
| std::cerr << "Inference error " << status << std::endl; | |||
| @@ -303,6 +388,40 @@ int NetTrain::MarkAccuracy() { | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int NetTrain::MarkAccuracyLite(const std::unique_ptr<session::LiteSession> &lite_session) { | |||
| MS_LOG(INFO) << "MarkAccuracy"; | |||
| std::cout << "MarkAccuracy" << std::endl; | |||
| for (auto &msInput : ms_inputs_) { | |||
| switch (msInput->data_type()) { | |||
| case TypeId::kNumberTypeFloat: | |||
| PrintInputData<float>(msInput); | |||
| break; | |||
| case TypeId::kNumberTypeFloat32: | |||
| PrintInputData<float>(msInput); | |||
| break; | |||
| case TypeId::kNumberTypeInt32: | |||
| PrintInputData<int>(msInput); | |||
| break; | |||
| default: | |||
| MS_LOG(ERROR) << "Datatype " << msInput->data_type() << " is not supported."; | |||
| return RET_ERROR; | |||
| } | |||
| } | |||
| auto status = lite_session->RunGraph(); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Inference error " << status; | |||
| std::cerr << "Inference error " << status << std::endl; | |||
| return status; | |||
| } | |||
| status = CompareOutputLite(lite_session); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Compare output error " << status; | |||
| std::cerr << "Compare output error " << status << std::endl; | |||
| return status; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int NetTrain::RunExportedNet() { | |||
| auto start_prepare_time = GetTimeUs(); | |||
| @@ -375,6 +494,80 @@ int NetTrain::RunExportedNet() { | |||
| return RET_OK; | |||
| } | |||
| int NetTrain::RunExportedNetLite(std::string file_name) { | |||
| auto start_prepare_time = GetTimeUs(); | |||
| // Load graph | |||
| std::string model_name = file_name.substr(file_name.find_last_of(DELIM_SLASH) + 1); | |||
| MS_LOG(INFO) << "start reading exported model file"; | |||
| std::cout << "reading " << file_name << std::endl; | |||
| auto context = std::make_shared<Context>(); | |||
| if (context == nullptr) { | |||
| MS_LOG(ERROR) << "New context failed while running " << model_name.c_str(); | |||
| std::cerr << "New context failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| if (flags_->cpu_bind_mode_ == 2) { | |||
| context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = MID_CPU; | |||
| } else if (flags_->cpu_bind_mode_ == 1) { | |||
| context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = HIGHER_CPU; | |||
| } else { | |||
| context->device_list_[0].device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND; | |||
| } | |||
| context->thread_num_ = flags_->num_threads_; | |||
| auto *model = mindspore::lite::Model::Import(file_name.c_str()); | |||
| if (model == nullptr) { | |||
| MS_LOG(ERROR) << "create model for lite session failed"; | |||
| return RET_ERROR; | |||
| } | |||
| auto lite_session = std::unique_ptr<session::LiteSession>(session::LiteSession::CreateSession(context.get())); | |||
| if (lite_session == nullptr) { | |||
| MS_LOG(ERROR) << "ExportedFile CreateSession failed while running " << model_name.c_str(); | |||
| std::cout << "CreateSession failed while running " << model_name.c_str() << std::endl; | |||
| return RET_ERROR; | |||
| } | |||
| if (lite_session->CompileGraph(model) != RET_OK) { | |||
| MS_LOG(ERROR) << "Cannot compile model"; | |||
| delete model; | |||
| return RET_ERROR; | |||
| } | |||
| ms_inputs_ = lite_session->GetInputs(); | |||
| auto end_prepare_time = GetTimeUs(); | |||
| MS_LOG(INFO) << "Exported model PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms"; | |||
| std::cout << "Exported model PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms" << std::endl; | |||
| // Load input | |||
| MS_LOG(INFO) << "start generate input data"; | |||
| auto status = LoadInput(); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Generate input data error"; | |||
| delete model; | |||
| return status; | |||
| } | |||
| if (!flags_->data_file_.empty()) { | |||
| MS_LOG(INFO) << "Check accuracy for exported model"; | |||
| std::cout << "Check accuracy for exported model " << std::endl; | |||
| status = MarkAccuracyLite(lite_session); | |||
| for (auto &data : data_) { | |||
| data.second->shape.clear(); | |||
| data.second->data.clear(); | |||
| delete data.second; | |||
| } | |||
| data_.clear(); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run MarkAccuracy on exported model error: " << status; | |||
| std::cout << "Run MarkAccuracy on exported model error: " << status << std::endl; | |||
| delete model; | |||
| return status; | |||
| } | |||
| } | |||
| delete model; | |||
| return RET_OK; | |||
| } | |||
| int NetTrain::RunNetTrain() { | |||
| auto start_prepare_time = GetTimeUs(); | |||
| // Load graph | |||
| @@ -451,6 +644,17 @@ int NetTrain::RunNetTrain() { | |||
| return status; | |||
| } | |||
| } | |||
| status = CheckExecute(model); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run CheckExecute error: " << status; | |||
| std::cout << "Run CheckExecute error: " << status << std::endl; | |||
| return status; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int NetTrain::CheckExecute(mindspore::lite::Model *model) { | |||
| int status; | |||
| if (!flags_->export_file_.empty()) { | |||
| auto ret = Model::Export(model, flags_->export_file_.c_str()); | |||
| if (ret != RET_OK) { | |||
| @@ -459,12 +663,33 @@ int NetTrain::RunNetTrain() { | |||
| return RET_ERROR; | |||
| } | |||
| delete session_; | |||
| session_ = nullptr; | |||
| status = RunExportedNet(); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Run Exported model error: " << status; | |||
| std::cout << "Run Exported model error: " << status << std::endl; | |||
| return status; | |||
| } | |||
| } else { | |||
| if (!flags_->inference_file_.empty()) { | |||
| auto tick = GetTimeUs(); | |||
| status = session_->ExportInference(flags_->inference_file_); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Save model error: " << status; | |||
| std::cout << "Save model error: " << status << std::endl; | |||
| return status; | |||
| } | |||
| std::cout << "ExportInference() execution time is " << GetTimeUs() - tick << "us\n"; | |||
| delete session_; | |||
| session_ = nullptr; | |||
| status = RunExportedNetLite(flags_->inference_file_ + ".ms"); | |||
| if (status != RET_OK) { | |||
| MS_LOG(ERROR) << "Running saved model error: " << status; | |||
| std::cout << "Running saved model error: " << status << std::endl; | |||
| return status; | |||
| } | |||
| } | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -554,6 +779,11 @@ int NetTrain::InitCallbackParameter() { | |||
| case kNumberTypeInt32: | |||
| std::cout << TensorSum<int>(output, tensor_size); | |||
| break; | |||
| #ifdef ENABLE_FP16 | |||
| case kNumberTypeFloat16: | |||
| std::cout << TensorSum<float16_t>(output, tensor_size); | |||
| break; | |||
| #endif | |||
| default: | |||
| std::cout << "unsupported type:" << type; | |||
| break; | |||
| @@ -30,6 +30,7 @@ | |||
| #include <cfloat> | |||
| #include <utility> | |||
| #include <algorithm> | |||
| #include "tools/common/flag_parser.h" | |||
| #include "src/common/file_utils.h" | |||
| #include "src/common/utils.h" | |||
| @@ -51,14 +52,15 @@ struct MS_API CheckTensor { | |||
| }; | |||
| template <typename T> | |||
| T TensorSum(void *data, int size) { | |||
| float TensorSum(void *data, int size) { | |||
| T *typed_data = reinterpret_cast<T *>(data); | |||
| T sum = static_cast<T>(0); | |||
| float sum = 0.f; | |||
| for (int i = 0; i < size; i++) { | |||
| sum += typed_data[i]; | |||
| sum += static_cast<float>(typed_data[i]); | |||
| } | |||
| return sum; | |||
| } | |||
| class MS_API NetTrainFlags : public virtual FlagParser { | |||
| public: | |||
| NetTrainFlags() { | |||
| @@ -77,6 +79,7 @@ class MS_API NetTrainFlags : public virtual FlagParser { | |||
| AddFlag(&NetTrainFlags::layer_checksum_, "layerCheckSum", "layer output checksum print (debug)", false); | |||
| AddFlag(&NetTrainFlags::enable_fp16_, "enableFp16", "Enable float16", false); | |||
| AddFlag(&NetTrainFlags::loss_name_, "lossName", "loss layer name", ""); | |||
| AddFlag(&NetTrainFlags::inference_file_, "inferenceFile", "MS file to export inference model", ""); | |||
| } | |||
| ~NetTrainFlags() override = default; | |||
| @@ -109,6 +112,7 @@ class MS_API NetTrainFlags : public virtual FlagParser { | |||
| bool layer_checksum_ = false; | |||
| std::vector<std::vector<int64_t>> resize_dims_; | |||
| std::string loss_name_ = ""; | |||
| std::string inference_file_ = ""; | |||
| }; | |||
| class MS_API NetTrain { | |||
| @@ -166,6 +170,7 @@ class MS_API NetTrain { | |||
| for (int j = 0; j < std::min(50, size); j++) { | |||
| std::cout << refOutput[j] << " "; | |||
| } | |||
| std::cout << std::endl; | |||
| for (int j = 0; j < size; j++) { | |||
| if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) { | |||
| std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl; | |||
| @@ -174,7 +179,7 @@ class MS_API NetTrain { | |||
| } | |||
| auto tolerance = absoluteTolerance + relativeTolerance * fabs(refOutput[j]); | |||
| auto absoluteError = std::fabs(msTensorData[j] - refOutput[j]); | |||
| auto absoluteError = std::fabs(static_cast<float>(msTensorData[j]) - refOutput[j]); | |||
| if (absoluteError > tolerance) { | |||
| if (fabs(refOutput[j]) == 0) { | |||
| if (absoluteError > 1e-5) { | |||
| @@ -208,6 +213,10 @@ class MS_API NetTrain { | |||
| int MarkAccuracy(); | |||
| private: | |||
| int RunExportedNetLite(std::string file_name); | |||
| int MarkAccuracyLite(const std::unique_ptr<session::LiteSession> &lite_session); | |||
| int CompareOutputLite(const std::unique_ptr<session::LiteSession> &lite_session); | |||
| int CheckExecute(mindspore::lite::Model *model); | |||
| NetTrainFlags *flags_; | |||
| session::TrainSession *session_ = nullptr; | |||
| std::vector<mindspore::tensor::MSTensor *> ms_inputs_; | |||