You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

benchmark_unified_api.cc 37 kB

4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
4 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "tools/benchmark/benchmark_unified_api.h"
  17. #define __STDC_FORMAT_MACROS
  18. #include <cinttypes>
  19. #undef __STDC_FORMAT_MACROS
  20. #include <algorithm>
  21. #include <utility>
  22. #include <functional>
  23. #include "include/context.h"
  24. #include "include/ms_tensor.h"
  25. #include "include/version.h"
  26. #include "schema/model_generated.h"
  27. #include "src/common/common.h"
  28. #include "src/tensor.h"
  29. #include "tools/common/string_util.h"
  30. #ifdef ENABLE_ARM64
  31. #include <linux/perf_event.h>
  32. #include <sys/ioctl.h>
  33. #include <asm/unistd.h>
  34. #include <unistd.h>
  35. #endif
  36. namespace mindspore {
// Upper bound on how many tensor elements DataToString renders.
constexpr size_t kDataToStringMaxNum = 40;
// Upper bound on how many input elements PrintInputData prints per tensor.
constexpr int kPrintDataNum = 20;
// Default frequency level applied to the Kirin NPU device.
constexpr int kFrequencyDefault = 3;
// Multiplier converting a [0,1] bias ratio into a percentage.
constexpr int kPercentageDivisor = 100;
// Dump-mode selector values (names suggest 0 = inputs+outputs, 2 = outputs
// only; usage is outside this chunk — confirm against the dump config code).
constexpr int kDumpInputsAndOutputs = 0;
constexpr int kDumpOutputs = 2;
  43. namespace lite {
  44. int BenchmarkUnifiedApi::GenerateInputData() {
  45. for (auto &tensor : ms_inputs_for_api_) {
  46. if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
  47. MSTensor *input = MSTensor::StringsToTensor(tensor.Name(), {"you're the best."});
  48. if (input == nullptr) {
  49. std::cerr << "StringsToTensor failed" << std::endl;
  50. MS_LOG(ERROR) << "StringsToTensor failed";
  51. return RET_ERROR;
  52. }
  53. tensor = *input;
  54. } else {
  55. auto input_data = tensor.MutableData();
  56. if (input_data == nullptr) {
  57. MS_LOG(ERROR) << "MallocData for inTensor failed";
  58. return RET_ERROR;
  59. }
  60. int status = GenerateRandomData(tensor.DataSize(), input_data, static_cast<int>(tensor.DataType()));
  61. if (status != RET_OK) {
  62. std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
  63. MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
  64. return status;
  65. }
  66. }
  67. }
  68. return RET_OK;
  69. }
  70. int BenchmarkUnifiedApi::ReadInputFile() {
  71. if (ms_inputs_for_api_.empty()) {
  72. return RET_OK;
  73. }
  74. if (this->flags_->in_data_type_ == kImage) {
  75. MS_LOG(ERROR) << "Not supported image input";
  76. return RET_ERROR;
  77. } else {
  78. for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
  79. auto &cur_tensor = ms_inputs_for_api_.at(i);
  80. MS_ASSERT(cur_tensor != nullptr);
  81. size_t size;
  82. char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
  83. if (bin_buf == nullptr) {
  84. MS_LOG(ERROR) << "ReadFile return nullptr";
  85. return RET_ERROR;
  86. }
  87. if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
  88. std::string str(bin_buf, size);
  89. MSTensor *input = MSTensor::StringsToTensor(cur_tensor.Name(), {str});
  90. if (input == nullptr) {
  91. std::cerr << "StringsToTensor failed" << std::endl;
  92. MS_LOG(ERROR) << "StringsToTensor failed";
  93. delete[] bin_buf;
  94. return RET_ERROR;
  95. }
  96. cur_tensor = *input;
  97. } else {
  98. auto tensor_data_size = cur_tensor.DataSize();
  99. if (size != tensor_data_size) {
  100. std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
  101. << std::endl;
  102. MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
  103. delete[] bin_buf;
  104. return RET_ERROR;
  105. }
  106. auto input_data = cur_tensor.MutableData();
  107. if (input_data == nullptr) {
  108. MS_LOG(ERROR) << "input_data is nullptr.";
  109. return RET_ERROR;
  110. }
  111. memcpy(input_data, bin_buf, tensor_data_size);
  112. }
  113. delete[] bin_buf;
  114. }
  115. }
  116. return RET_OK;
  117. }
  118. int BenchmarkUnifiedApi::ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
  119. const std::vector<size_t> &dims) {
  120. std::string line;
  121. getline(in_file_stream, line);
  122. std::stringstream line_stream(line);
  123. if (this->benchmark_data_.find(tensor_name) != this->benchmark_data_.end()) {
  124. return RET_OK;
  125. }
  126. mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
  127. if (tensor == nullptr) {
  128. MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
  129. return RET_ERROR;
  130. }
  131. std::vector<float> data;
  132. std::vector<std::string> strings_data;
  133. size_t shape_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<size_t>());
  134. if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
  135. strings_data.push_back(line);
  136. for (size_t i = 1; i < shape_size; i++) {
  137. getline(in_file_stream, line);
  138. strings_data.push_back(line);
  139. }
  140. } else {
  141. for (size_t i = 0; i < shape_size; i++) {
  142. float tmp_data;
  143. line_stream >> tmp_data;
  144. data.push_back(tmp_data);
  145. }
  146. }
  147. auto *check_tensor = new (std::nothrow) CheckTensor(dims, data, strings_data);
  148. if (check_tensor == nullptr) {
  149. MS_LOG(ERROR) << "New CheckTensor failed, tensor name: " << tensor_name;
  150. return RET_ERROR;
  151. }
  152. this->benchmark_tensor_names_.push_back(tensor_name);
  153. this->benchmark_data_.insert(std::make_pair(tensor_name, check_tensor));
  154. return RET_OK;
  155. }
  156. void BenchmarkUnifiedApi::UpdateDistributionModelName(const std::shared_ptr<mindspore::Context> &context,
  157. std::string *name) {
  158. if (flags_->device_ != "GPU") {
  159. return;
  160. }
  161. auto device_info = context->MutableDeviceInfo().front();
  162. GPUDeviceInfo *gpu_info = reinterpret_cast<GPUDeviceInfo *>(device_info.get());
  163. auto rank_id = gpu_info->GetRankID();
  164. if (rank_id == 0) {
  165. return;
  166. }
  167. *name = name->replace(name->find("."), sizeof('.'), to_string(rank_id) + ".");
  168. return;
  169. }
  170. void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
  171. context->SetThreadNum(flags_->num_threads_);
  172. context->SetEnableParallel(flags_->enable_parallel_);
  173. context->SetThreadAffinity(flags_->cpu_bind_mode_);
  174. auto &device_list = context->MutableDeviceInfo();
  175. if (flags_->device_ == "GPU") {
  176. std::shared_ptr<GPUDeviceInfo> gpu_device_info = std::make_shared<GPUDeviceInfo>();
  177. gpu_device_info->SetEnableFP16(flags_->enable_fp16_);
  178. device_list.push_back(gpu_device_info);
  179. }
  180. if (flags_->device_ == "NPU") {
  181. std::shared_ptr<KirinNPUDeviceInfo> npu_device_info = std::make_shared<KirinNPUDeviceInfo>();
  182. npu_device_info->SetFrequency(kFrequencyDefault);
  183. device_list.push_back(npu_device_info);
  184. }
  185. if (flags_->device_ == "Ascend310") {
  186. std::shared_ptr<Ascend310DeviceInfo> ascend310_device_info = std::make_shared<Ascend310DeviceInfo>();
  187. ascend310_device_info->SetDeviceID(0);
  188. device_list.push_back(ascend310_device_info);
  189. }
  190. // CPU priority is behind GPU and NPU
  191. std::shared_ptr<CPUDeviceInfo> device_info = std::make_shared<CPUDeviceInfo>();
  192. device_info->SetEnableFP16(flags_->enable_fp16_);
  193. device_list.push_back(device_info);
  194. }
  195. int BenchmarkUnifiedApi::CompareOutput() {
  196. std::cout << "================ Comparing Output data ================" << std::endl;
  197. float total_bias = 0;
  198. int total_size = 0;
  199. // check the output tensor name.
  200. if (this->benchmark_tensor_names_ != ms_model_.GetOutputTensorNames()) {
  201. MS_LOG(ERROR) << "The output tensor name is wrong.";
  202. return RET_ERROR;
  203. }
  204. for (const auto &calib_tensor : benchmark_data_) {
  205. std::string tensor_name = calib_tensor.first;
  206. mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
  207. if (tensor == nullptr) {
  208. MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
  209. return RET_ERROR;
  210. }
  211. int ret;
  212. if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
  213. std::vector<std::string> output_strings = MSTensor::TensorToStrings(tensor);
  214. ret = CompareStringData(tensor_name, calib_tensor.second->strings_data, output_strings);
  215. } else {
  216. ret = CompareDataGetTotalBiasAndSize(tensor_name, &tensor, &total_bias, &total_size);
  217. }
  218. if (ret != RET_OK) {
  219. MS_LOG(ERROR) << "Error in CompareData";
  220. std::cerr << "Error in CompareData" << std::endl;
  221. std::cout << "=======================================================" << std::endl << std::endl;
  222. return ret;
  223. }
  224. }
  225. float mean_bias;
  226. if (total_size != 0) {
  227. mean_bias = ((total_bias / float_t(total_size)) * kPercentageDivisor);
  228. } else {
  229. mean_bias = 0;
  230. }
  231. std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
  232. std::cout << "=======================================================" << std::endl << std::endl;
  233. if (mean_bias > this->flags_->accuracy_threshold_) {
  234. MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
  235. std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
  236. return RET_ERROR;
  237. }
  238. return RET_OK;
  239. }
  240. int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
  241. float *total_bias, int *total_size) {
  242. float bias = 0;
  243. auto mutableData = tensor->MutableData();
  244. if (mutableData == nullptr) {
  245. MS_LOG(ERROR) << "mutableData is nullptr.";
  246. return RET_ERROR;
  247. }
  248. switch (static_cast<int>(tensor->DataType())) {
  249. case TypeId::kNumberTypeFloat:
  250. case TypeId::kNumberTypeFloat32: {
  251. bias = CompareData<float, int64_t>(name, tensor->Shape(), mutableData);
  252. break;
  253. }
  254. case TypeId::kNumberTypeInt8: {
  255. bias = CompareData<int8_t, int64_t>(name, tensor->Shape(), mutableData);
  256. break;
  257. }
  258. case TypeId::kNumberTypeUInt8: {
  259. bias = CompareData<uint8_t, int64_t>(name, tensor->Shape(), mutableData);
  260. break;
  261. }
  262. case TypeId::kNumberTypeInt32: {
  263. bias = CompareData<int32_t, int64_t>(name, tensor->Shape(), mutableData);
  264. break;
  265. }
  266. case TypeId::kNumberTypeInt16: {
  267. bias = CompareData<int16_t, int64_t>(name, tensor->Shape(), mutableData);
  268. break;
  269. }
  270. case TypeId::kNumberTypeBool: {
  271. bias = CompareData<bool, int64_t>(name, tensor->Shape(), mutableData);
  272. break;
  273. }
  274. default:
  275. MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
  276. return RET_ERROR;
  277. }
  278. if (bias < 0) {
  279. MS_LOG(ERROR) << "CompareData failed, name: " << name;
  280. return RET_ERROR;
  281. }
  282. *total_bias += bias;
  283. *total_size += 1;
  284. return RET_OK;
  285. }
  286. int BenchmarkUnifiedApi::MarkPerformance() {
  287. MS_LOG(INFO) << "Running warm up loops...";
  288. std::cout << "Running warm up loops..." << std::endl;
  289. std::vector<MSTensor> outputs;
  290. for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
  291. auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
  292. if (status != kSuccess) {
  293. MS_LOG(ERROR) << "Inference error ";
  294. std::cerr << "Inference error " << std::endl;
  295. return RET_ERROR;
  296. }
  297. }
  298. MS_LOG(INFO) << "Running benchmark loops...";
  299. std::cout << "Running benchmark loops..." << std::endl;
  300. uint64_t time_min = 1000000;
  301. uint64_t time_max = 0;
  302. uint64_t time_avg = 0;
  303. for (int i = 0; i < flags_->loop_count_; i++) {
  304. auto inputs = ms_model_.GetInputs();
  305. for (auto tensor : inputs) {
  306. tensor.MutableData(); // prepare data
  307. }
  308. auto start = GetTimeUs();
  309. auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
  310. if (status != kSuccess) {
  311. MS_LOG(ERROR) << "Inference error ";
  312. std::cerr << "Inference error ";
  313. return RET_ERROR;
  314. }
  315. auto end = GetTimeUs();
  316. auto time = end - start;
  317. time_min = std::min(time_min, time);
  318. time_max = std::max(time_max, time);
  319. time_avg += time;
  320. }
  321. if (flags_->time_profiling_) {
  322. const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
  323. const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
  324. PrintResult(per_op_name, op_times_by_name_);
  325. PrintResult(per_op_type, op_times_by_type_);
  326. #ifdef ENABLE_ARM64
  327. } else if (flags_->perf_profiling_) {
  328. if (flags_->perf_event_ == "CACHE") {
  329. const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
  330. const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
  331. PrintPerfResult(per_op_name, op_perf_by_name_);
  332. PrintPerfResult(per_op_type, op_perf_by_type_);
  333. } else if (flags_->perf_event_ == "STALL") {
  334. const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backendend(k)",
  335. "backendend(%)"};
  336. const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backendend(k)",
  337. "backendend(%)"};
  338. PrintPerfResult(per_op_name, op_perf_by_name_);
  339. PrintPerfResult(per_op_type, op_perf_by_type_);
  340. } else {
  341. const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
  342. const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
  343. PrintPerfResult(per_op_name, op_perf_by_name_);
  344. PrintPerfResult(per_op_type, op_perf_by_type_);
  345. }
  346. #endif
  347. }
  348. if (flags_->loop_count_ > 0) {
  349. time_avg /= flags_->loop_count_;
  350. MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
  351. << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / kFloatMSEC
  352. << ", MaxRuntime = " << time_max / kFloatMSEC << ", AvgRunTime = " << time_avg / kFloatMSEC;
  353. printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
  354. flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
  355. time_min / kFloatMSEC, time_max / kFloatMSEC, time_avg / kFloatMSEC);
  356. }
  357. return RET_OK;
  358. }
  359. int BenchmarkUnifiedApi::MarkAccuracy() {
  360. MS_LOG(INFO) << "MarkAccuracy";
  361. std::cout << "MarkAccuracy" << std::endl;
  362. auto status = PrintInputData();
  363. if (status != RET_OK) {
  364. MS_LOG(ERROR) << "PrintInputData error " << status;
  365. std::cerr << "PrintInputData error " << status << std::endl;
  366. return status;
  367. }
  368. std::vector<MSTensor> outputs;
  369. auto ret = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
  370. if (ret != kSuccess) {
  371. MS_LOG(ERROR) << "Inference error ";
  372. std::cerr << "Inference error " << std::endl;
  373. return RET_ERROR;
  374. }
  375. status = ReadCalibData();
  376. if (status != RET_OK) {
  377. MS_LOG(ERROR) << "Read calib data error " << status;
  378. std::cerr << "Read calib data error " << status << std::endl;
  379. return status;
  380. }
  381. auto use_cosine_distance_threshold = getenv("COSINE_DISTANCE_THRESHOLD");
  382. if (use_cosine_distance_threshold != nullptr) {
  383. double cosine_distance_threshold;
  384. auto ret_bool = lite::ConvertDoubleNum(use_cosine_distance_threshold, &cosine_distance_threshold);
  385. if (!ret_bool) {
  386. MS_LOG(ERROR) << "Compare output error " << status;
  387. std::cerr << "Compare output error " << status << std::endl;
  388. return RET_ERROR;
  389. }
  390. status = CompareOutputByCosineDistance(static_cast<float>(cosine_distance_threshold));
  391. if (status != RET_OK) {
  392. MS_LOG(ERROR) << "Compare output error by consine distance" << status;
  393. std::cerr << "Compare output error by consine distance" << status << std::endl;
  394. return status;
  395. }
  396. } else {
  397. status = CompareOutput();
  398. if (status != RET_OK) {
  399. MS_LOG(ERROR) << "Compare output error " << status;
  400. std::cerr << "Compare output error " << status << std::endl;
  401. return status;
  402. }
  403. }
  404. return RET_OK;
  405. }
  406. int BenchmarkUnifiedApi::PrintInputData() {
  407. for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
  408. auto input = ms_inputs_for_api_[i];
  409. MS_ASSERT(input != nullptr);
  410. auto tensor_data_type = static_cast<int>(input.DataType());
  411. std::cout << "InData" << i << ": ";
  412. if (tensor_data_type == TypeId::kObjectTypeString) {
  413. std::vector<std::string> output_strings = MSTensor::TensorToStrings(input);
  414. size_t print_num = std::min(output_strings.size(), static_cast<size_t>(20));
  415. for (size_t j = 0; j < print_num; j++) {
  416. std::cout << output_strings[j] << std::endl;
  417. }
  418. continue;
  419. }
  420. size_t print_num = std::min(static_cast<int>(input.ElementNum()), kPrintDataNum);
  421. const void *in_data = input.MutableData();
  422. if (in_data == nullptr) {
  423. MS_LOG(ERROR) << "in_data is nullptr.";
  424. return RET_ERROR;
  425. }
  426. for (size_t j = 0; j < print_num; j++) {
  427. if (tensor_data_type == TypeId::kNumberTypeFloat32 || tensor_data_type == TypeId::kNumberTypeFloat) {
  428. std::cout << static_cast<const float *>(in_data)[j] << " ";
  429. } else if (tensor_data_type == TypeId::kNumberTypeInt8) {
  430. std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
  431. } else if (tensor_data_type == TypeId::kNumberTypeUInt8) {
  432. std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
  433. } else if (tensor_data_type == TypeId::kNumberTypeInt32) {
  434. std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
  435. } else if (tensor_data_type == TypeId::kNumberTypeInt64) {
  436. std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
  437. } else if (tensor_data_type == TypeId::kNumberTypeBool) {
  438. std::cout << static_cast<const bool *>(in_data)[j] << " ";
  439. } else {
  440. MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
  441. return RET_ERROR;
  442. }
  443. }
  444. std::cout << std::endl;
  445. }
  446. return RET_OK;
  447. }
  448. int BenchmarkUnifiedApi::RunBenchmark() {
  449. auto start_prepare_time = GetTimeUs();
  450. // Load graph
  451. std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
  452. mindspore::ModelType model_type = ModelTypeMap.at(flags_->model_type_);
  453. MS_LOG(INFO) << "start unified benchmark run";
  454. std::cout << "start unified benchmark run" << std::endl;
  455. auto context = std::make_shared<mindspore::Context>();
  456. if (context == nullptr) {
  457. MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
  458. std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
  459. return RET_ERROR;
  460. }
  461. (void)InitMSContext(context);
  462. (void)UpdateDistributionModelName(context, &model_name);
  463. if (!flags_->config_file_.empty()) {
  464. auto config_ret = ms_model_.LoadConfig(flags_->config_file_);
  465. if (config_ret != kSuccess) {
  466. MS_LOG(ERROR) << "ms_model_.LoadConfig failed while running ", model_name.c_str();
  467. std::cout << "ms_model_.LoadConfig failed while running ", model_name.c_str();
  468. }
  469. }
  470. auto ret = ms_model_.Build(model_name, model_type, context);
  471. if (ret != kSuccess) {
  472. MS_LOG(ERROR) << "ms_model_.Build failed while running ", model_name.c_str();
  473. std::cout << "ms_model_.Build failed while running ", model_name.c_str();
  474. return RET_ERROR;
  475. }
  476. if (!flags_->resize_dims_.empty()) {
  477. std::vector<std::vector<int64_t>> resize_dims;
  478. (void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
  479. [&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
  480. ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
  481. if (ret != kSuccess) {
  482. MS_LOG(ERROR) << "Input tensor resize failed.";
  483. std::cout << "Input tensor resize failed.";
  484. return RET_ERROR;
  485. }
  486. }
  487. ms_inputs_for_api_ = ms_model_.GetInputs();
  488. auto end_prepare_time = GetTimeUs();
  489. MS_LOG(INFO) << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms";
  490. std::cout << "PrepareTime = " << ((end_prepare_time - start_prepare_time) / kFloatMSEC) << " ms" << std::endl;
  491. // Load input
  492. MS_LOG(INFO) << "start generate input data";
  493. auto status = LoadInput();
  494. if (status != 0) {
  495. MS_LOG(ERROR) << "Generate input data error";
  496. return status;
  497. }
  498. if (!flags_->benchmark_data_file_.empty()) {
  499. status = MarkAccuracy();
  500. for (auto &data : benchmark_data_) {
  501. data.second->shape.clear();
  502. data.second->data.clear();
  503. delete data.second;
  504. data.second = nullptr;
  505. }
  506. benchmark_data_.clear();
  507. if (status != 0) {
  508. MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
  509. std::cout << "Run MarkAccuracy error: " << status << std::endl;
  510. return status;
  511. }
  512. } else {
  513. status = MarkPerformance();
  514. if (status != 0) {
  515. MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
  516. std::cout << "Run MarkPerformance error: " << status << std::endl;
  517. return status;
  518. }
  519. }
  520. if (flags_->dump_tensor_data_) {
  521. std::cout << "Dumped file is saved to : " + dump_file_output_dir_ << std::endl;
  522. }
  523. return RET_OK;
  524. }
  525. int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
  526. // before callback
  527. ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
  528. const std::vector<mindspore::MSTensor> &before_outputs,
  529. const MSCallBackParam &call_param) {
  530. if (before_inputs.empty()) {
  531. MS_LOG(INFO) << "The num of beforeInputs is empty";
  532. }
  533. if (before_outputs.empty()) {
  534. MS_LOG(INFO) << "The num of beforeOutputs is empty";
  535. }
  536. if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
  537. op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
  538. }
  539. if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
  540. op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
  541. }
  542. op_call_times_total_++;
  543. op_begin_ = GetTimeUs();
  544. return true;
  545. };
  546. // after callback
  547. ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
  548. const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
  549. uint64_t opEnd = GetTimeUs();
  550. if (after_inputs.empty()) {
  551. MS_LOG(INFO) << "The num of after inputs is empty";
  552. }
  553. if (after_outputs.empty()) {
  554. MS_LOG(INFO) << "The num of after outputs is empty";
  555. }
  556. float cost = static_cast<float>(opEnd - op_begin_) / kFloatMSEC;
  557. if (flags_->device_ == "GPU") {
  558. auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
  559. cost = static_cast<float>(gpu_param.execute_time);
  560. }
  561. op_cost_total_ += cost;
  562. op_times_by_type_[call_param.node_type].first++;
  563. op_times_by_type_[call_param.node_type].second += cost;
  564. op_times_by_name_[call_param.node_name].first++;
  565. op_times_by_name_[call_param.node_name].second += cost;
  566. return true;
  567. };
  568. return RET_OK;
  569. }
  570. int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
  571. #ifndef ENABLE_ARM64
  572. MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
  573. return RET_ERROR;
  574. #else
  575. struct perf_event_attr pe, pe2;
  576. memset(&pe, 0, sizeof(struct perf_event_attr));
  577. memset(&pe2, 0, sizeof(struct perf_event_attr));
  578. pe.type = PERF_TYPE_HARDWARE;
  579. pe2.type = PERF_TYPE_HARDWARE;
  580. pe.size = sizeof(struct perf_event_attr);
  581. pe2.size = sizeof(struct perf_event_attr);
  582. pe.disabled = 1;
  583. pe2.disabled = 1;
  584. pe.exclude_kernel = 1; // don't count kernel
  585. pe2.exclude_kernel = 1; // don't count kernel
  586. pe.exclude_hv = 1; // don't count hypervisor
  587. pe2.exclude_hv = 1; // don't count hypervisor
  588. pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  589. pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  590. if (flags_->perf_event_ == "CACHE") {
  591. pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
  592. pe2.config = PERF_COUNT_HW_CACHE_MISSES;
  593. } else if (flags_->perf_event_ == "STALL") {
  594. pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
  595. pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
  596. } else {
  597. pe.config = PERF_COUNT_HW_CPU_CYCLES;
  598. pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
  599. }
  600. perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
  601. if (perf_fd == -1) {
  602. MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
  603. return RET_ERROR;
  604. }
  605. perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
  606. if (perf_fd2 == -1) {
  607. MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
  608. return RET_ERROR;
  609. }
  610. struct PerfCount zero;
  611. zero.value[0] = 0;
  612. zero.value[1] = 0;
  613. // before callback
  614. ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
  615. const std::vector<mindspore::MSTensor> &before_outputs,
  616. const MSCallBackParam &call_param) {
  617. if (before_inputs.empty()) {
  618. MS_LOG(INFO) << "The num of beforeInputs is empty";
  619. }
  620. if (before_outputs.empty()) {
  621. MS_LOG(INFO) << "The num of beforeOutputs is empty";
  622. }
  623. if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
  624. op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
  625. }
  626. if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
  627. op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
  628. }
  629. op_call_times_total_++;
  630. ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
  631. ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
  632. return true;
  633. };
  634. // after callback
  635. ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
  636. const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
  637. struct PerfResult res;
  638. ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
  639. if (read(perf_fd, &res, sizeof(struct PerfResult)) == -1) {
  640. MS_LOG(ERROR) << "Failed to read perf_fd";
  641. return false;
  642. }
  643. if (after_inputs.empty()) {
  644. MS_LOG(INFO) << "The num of after inputs is empty";
  645. }
  646. if (after_outputs.empty()) {
  647. MS_LOG(INFO) << "The num of after outputs is empty";
  648. }
  649. float cost1 = static_cast<float>(res.values[0].value);
  650. float cost2 = static_cast<float>(res.values[1].value);
  651. op_cost_total_ += cost1;
  652. op_cost2_total_ += cost2;
  653. op_perf_by_type_[call_param.node_type].first++;
  654. op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
  655. op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
  656. op_perf_by_name_[call_param.node_name].first++;
  657. op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
  658. op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
  659. return true;
  660. };
  661. #endif
  662. return RET_OK;
  663. }
  664. namespace {
  665. template <typename T>
  666. std::string DataToString(void *data, size_t data_number) {
  667. if (data == nullptr) {
  668. return "Data of tensor is nullptr";
  669. }
  670. std::ostringstream oss;
  671. auto casted_data = static_cast<T *>(data);
  672. for (size_t i = 0; i < kDataToStringMaxNum && i < data_number; i++) {
  673. oss << " " << casted_data[i];
  674. }
  675. return oss.str();
  676. }
  677. std::string DumpMSTensor(mindspore::MSTensor *tensor) {
  678. if (tensor == nullptr) {
  679. return "Tensor is nullptr";
  680. }
  681. std::ostringstream oss;
  682. oss << " DataType: " << static_cast<int>(tensor->DataType());
  683. oss << " Shape:";
  684. for (auto &dim : tensor->Shape()) {
  685. oss << " " << dim;
  686. }
  687. oss << std::endl << " Data:";
  688. switch (static_cast<int>(tensor->DataType())) {
  689. case kNumberTypeFloat32: {
  690. oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
  691. } break;
  692. case kNumberTypeFloat16: {
  693. oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
  694. } break;
  695. case kNumberTypeInt32: {
  696. oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
  697. } break;
  698. case kNumberTypeInt16: {
  699. oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
  700. } break;
  701. case kNumberTypeInt8: {
  702. oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
  703. } break;
  704. default:
  705. oss << "Unsupported data type to print";
  706. break;
  707. }
  708. return oss.str();
  709. }
  710. std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
  711. const std::string &file_type, const size_t &idx) {
  712. std::string file_name = op_name;
  713. auto pos = file_name.find_first_of('/');
  714. while (pos != std::string::npos) {
  715. file_name.replace(pos, 1, ".");
  716. pos = file_name.find_first_of('/');
  717. }
  718. file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
  719. for (const auto &dim : tensor->Shape()) {
  720. file_name += std::to_string(dim) + "_";
  721. }
  722. if (kTypeIdMap.find(static_cast<int>(tensor->DataType())) != kTypeIdMap.end()) {
  723. file_name += kTypeIdMap.at(static_cast<int>(tensor->DataType()));
  724. }
  725. file_name += +".bin";
  726. return file_name;
  727. }
  728. } // namespace
  729. int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
  730. // before callback
  731. ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
  732. const std::vector<mindspore::MSTensor> &before_outputs,
  733. const MSCallBackParam &call_param) { return true; };
  734. // after callback
  735. ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
  736. const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
  737. std::cout << "================================================================" << std::endl;
  738. std::cout << call_param.node_name << " inputs : " << std::endl;
  739. for (auto ms_tensor : after_inputs) {
  740. std::cout << DumpMSTensor(&ms_tensor) << std::endl;
  741. }
  742. std::cout << "----------------------------------------------------------------" << std::endl;
  743. std::cout << call_param.node_name << " outputs : " << std::endl;
  744. for (auto ms_tensor : after_outputs) {
  745. std::cout << DumpMSTensor(&ms_tensor) << std::endl;
  746. }
  747. std::cout << "================================================================" << std::endl;
  748. return true;
  749. };
  750. return RET_OK;
  751. }
// Install session-run callbacks that dump selected kernels' input/output
// tensors as raw binary files under dump_file_output_dir_. Which kernels and
// which tensors are dumped is driven by the parsed dump config json
// (dump_cfg_json_). Returns RET_OK after installing both callbacks.
int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
  // before callback: dumps the kernel's input tensors
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    // Re-read the dump settings from the config json on every invocation.
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    // NOTE(review): the after-callback below spells these same checks with
    // named constants (kDumpInputsAndOutputs / kDumpOutputs); the raw 0/1 here
    // presumably mean "dump all kernels" and "inputs(+outputs)" — confirm the
    // constant values and unify both callbacks on the named constants.
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 1) {
        for (size_t i = 0; i < before_inputs.size(); i++) {
          // Copy is required: the vector is const but GenerateOutputFileName /
          // MutableData need a non-const tensor.
          auto ms_tensor = before_inputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "input", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  // after callback: dumps the kernel's output tensors
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs,
                            const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    // Dump when dumping everything, or when this kernel is explicitly listed.
    if (dump_mode == kDumpInputsAndOutputs ||
        std::find(kernels.begin(), kernels.end(), call_param.node_name) != kernels.end()) {
      if (input_output_mode == kDumpInputsAndOutputs || input_output_mode == kDumpOutputs) {
        for (size_t i = 0; i < after_outputs.size(); i++) {
          auto ms_tensor = after_outputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name, "output", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  return RET_OK;
}
  799. int BenchmarkUnifiedApi::CompareOutputByCosineDistance(float cosine_distance_threshold) {
  800. std::cout << "================ Comparing Output data by Cosine Distance================" << std::endl;
  801. float total_cosine_distance = 0;
  802. int total_size = 0;
  803. for (const auto &calib_tensor : benchmark_data_) {
  804. std::string tensor_name = calib_tensor.first;
  805. mindspore::MSTensor tensor = ms_model_.GetOutputByTensorName(tensor_name);
  806. if (tensor == nullptr) {
  807. MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
  808. return RET_ERROR;
  809. }
  810. int ret;
  811. if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
  812. std::cerr << tensor.Name() << " data type is kObjectTypeString, can not compared data " << std::endl;
  813. return RET_ERROR;
  814. } else {
  815. ret = CompareDataGetTotalCosineDistanceAndSize(tensor_name, &tensor, &total_cosine_distance, &total_size);
  816. }
  817. if (ret != RET_OK) {
  818. MS_LOG(ERROR) << "Error in CompareData";
  819. std::cerr << "Error in CompareData" << std::endl;
  820. std::cout << "=======================================================" << std::endl << std::endl;
  821. return ret;
  822. }
  823. }
  824. float mean_cosine_distance;
  825. if (total_size != 0) {
  826. mean_cosine_distance = total_cosine_distance / float_t(total_size);
  827. } else {
  828. mean_cosine_distance = 0;
  829. }
  830. std::cout << "Cosine distance of all nodes/tensors: " << mean_cosine_distance << std::endl;
  831. std::cout << "=======================================================" << std::endl << std::endl;
  832. if (mean_cosine_distance < cosine_distance_threshold) {
  833. MS_LOG(ERROR) << "cosine distance of all nodes/tensors is too big: " << mean_cosine_distance;
  834. std::cerr << "Mean cosine distance of all nodes/tensors is too big: " << mean_cosine_distance << std::endl;
  835. return RET_ERROR;
  836. }
  837. return RET_OK;
  838. }
  839. int BenchmarkUnifiedApi::CompareDataGetTotalCosineDistanceAndSize(const std::string &name, mindspore::MSTensor *tensor,
  840. float *total_cosine_distance, int *total_size) {
  841. float cos = 0;
  842. auto mutableData = tensor->MutableData();
  843. if (mutableData == nullptr) {
  844. MS_LOG(ERROR) << "mutableData is nullptr.";
  845. return RET_ERROR;
  846. }
  847. int ret = RET_ERROR;
  848. switch (static_cast<int>(tensor->DataType())) {
  849. case TypeId::kNumberTypeFloat:
  850. case TypeId::kNumberTypeFloat32: {
  851. ret = CompareDatabyCosineDistance<float>(name, tensor->Shape(), mutableData, &cos);
  852. break;
  853. }
  854. case TypeId::kNumberTypeInt8: {
  855. ret = CompareDatabyCosineDistance<int8_t>(name, tensor->Shape(), mutableData, &cos);
  856. break;
  857. }
  858. case TypeId::kNumberTypeUInt8: {
  859. ret = CompareDatabyCosineDistance<uint8_t>(name, tensor->Shape(), mutableData, &cos);
  860. break;
  861. }
  862. case TypeId::kNumberTypeInt32: {
  863. ret = CompareDatabyCosineDistance<int32_t>(name, tensor->Shape(), mutableData, &cos);
  864. break;
  865. }
  866. case TypeId::kNumberTypeInt16: {
  867. ret = CompareDatabyCosineDistance<int16_t>(name, tensor->Shape(), mutableData, &cos);
  868. break;
  869. }
  870. case TypeId::kNumberTypeBool: {
  871. ret = CompareDatabyCosineDistance<bool>(name, tensor->Shape(), mutableData, &cos);
  872. break;
  873. }
  874. default:
  875. MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
  876. return RET_ERROR;
  877. }
  878. if (ret != RET_OK) {
  879. MS_LOG(ERROR) << "CompareData failed, name: " << name;
  880. return RET_ERROR;
  881. }
  882. *total_cosine_distance += cos;
  883. *total_size += 1;
  884. return RET_OK;
  885. }
  886. BenchmarkUnifiedApi::~BenchmarkUnifiedApi() {}
  887. } // namespace lite
  888. } // namespace mindspore