Browse Source

!10740 [MS][LITE][Develop]add perf profiling for benchmark on arm64

From: @lx0095
Reviewed-by: @zhang_xue_tong
Signed-off-by:
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 5 years ago
parent
commit
eba1e58140
2 changed files with 265 additions and 43 deletions
  1. +235
    -42
      mindspore/lite/tools/benchmark/benchmark.cc
  2. +30
    -1
      mindspore/lite/tools/benchmark/benchmark.h

+ 235
- 42
mindspore/lite/tools/benchmark/benchmark.cc View File

@@ -26,6 +26,12 @@
#include "include/version.h"
#include "src/common/common.h"
#include "src/runtime/runtime_api.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif

namespace mindspore {
namespace lite {
@@ -372,8 +378,9 @@ int Benchmark::MarkPerformance() {
for (int i = 0; i < flags_->loop_count_; i++) {
session_->BindThread(true);
auto start = GetTimeUs();
auto status =
flags_->time_profiling_ ? session_->RunGraph(before_call_back_, after_call_back_) : session_->RunGraph();
auto status = (flags_->time_profiling_ || flags_->perf_profiling_)
? session_->RunGraph(before_call_back_, after_call_back_)
: session_->RunGraph();
if (status != 0) {
MS_LOG(ERROR) << "Inference error " << status;
std::cerr << "Inference error " << status;
@@ -393,6 +400,27 @@ int Benchmark::MarkPerformance() {
const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
PrintResult(per_op_name, op_times_by_name_);
PrintResult(per_op_type, op_times_by_type_);
#ifdef ENABLE_ARM64
} else if (flags_->perf_profiling_) {
if (flags_->perf_event_ == "CACHE") {
const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
PrintPerfResult(per_op_name, op_perf_by_name_);
PrintPerfResult(per_op_type, op_perf_by_type_);
} else if (flags_->perf_event_ == "STALL") {
const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backendend(k)",
"backendend(%)"};
const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backendend(k)",
"backendend(%)"};
PrintPerfResult(per_op_name, op_perf_by_name_);
PrintPerfResult(per_op_type, op_perf_by_type_);
} else {
const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
PrintPerfResult(per_op_name, op_perf_by_name_);
PrintPerfResult(per_op_type, op_perf_by_type_);
}
#endif
}

if (flags_->loop_count_ > 0) {
@@ -625,50 +653,144 @@ void BenchmarkFlags::InitResizeDimsList() {
}

int Benchmark::InitCallbackParameter() {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
}
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
}
if (flags_->time_profiling_) {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
}
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
}

op_call_times_total_++;
op_begin_ = GetTimeUs();
return true;
};
op_call_times_total_++;
op_begin_ = GetTimeUs();
return true;
};

// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
uint64_t opEnd = GetTimeUs();
// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
uint64_t opEnd = GetTimeUs();

if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}

float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
op_cost_total_ += cost;
op_times_by_type_[call_param.node_type].first++;
op_times_by_type_[call_param.node_type].second += cost;
op_times_by_name_[call_param.node_name].first++;
op_times_by_name_[call_param.node_name].second += cost;
return true;
};
} else if (flags_->perf_profiling_) {
#ifndef ENABLE_ARM64
MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
return RET_ERROR;
#else
struct perf_event_attr pe, pe2;
memset(&pe, 0, sizeof(struct perf_event_attr));
memset(&pe2, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe2.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe2.size = sizeof(struct perf_event_attr);
pe.disabled = 1;
pe2.disabled = 1;
pe.exclude_kernel = 1; // don't count kernel
pe2.exclude_kernel = 1; // don't count kernel
pe.exclude_hv = 1; // don't count hypervisor
pe2.exclude_hv = 1; // don't count hypervisor
pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
if (flags_->perf_event_ == "CACHE") {
pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
pe2.config = PERF_COUNT_HW_CACHE_MISSES;
} else if (flags_->perf_event_ == "STALL") {
pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
} else {
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
if (perf_fd == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
return RET_ERROR;
}
perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
if (perf_fd2 == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
return RET_ERROR;
}
struct PerfCount zero;
zero.value[0] = 0;
zero.value[1] = 0;
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
}
if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
}

float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
op_cost_total_ += cost;
op_times_by_type_[call_param.node_type].first++;
op_times_by_type_[call_param.node_type].second += cost;
op_times_by_name_[call_param.node_name].first++;
op_times_by_name_[call_param.node_name].second += cost;
return true;
};

op_call_times_total_++;
ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
return true;
};

// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
struct PerfResult res;
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
read(perf_fd, &res, sizeof(struct PerfResult));

if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}
float cost1 = static_cast<float>(res.values[0].value);
float cost2 = static_cast<float>(res.values[1].value);
op_cost_total_ += cost1;
op_cost2_total_ += cost2;
op_perf_by_type_[call_param.node_type].first++;
op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
op_perf_by_name_[call_param.node_name].first++;
op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
return true;
};
#endif
}
return RET_OK;
}

@@ -751,7 +873,10 @@ int Benchmark::Init() {
return RET_ERROR;
}

if (flags_->time_profiling_) {
if (flags_->time_profiling_ || flags_->perf_profiling_) {
if (flags_->time_profiling_ && flags_->perf_profiling_) {
MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
}
auto status = InitCallbackParameter();
if (status != RET_OK) {
MS_LOG(ERROR) << "Init callback Parameter failed.";
@@ -771,7 +896,7 @@ int Benchmark::PrintResult(const std::vector<std::string> &title,
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len;
size_t len = 0;

len = iter.first.size();
if (len > columnLenMax.at(0)) {
@@ -827,6 +952,74 @@ int Benchmark::PrintResult(const std::vector<std::string> &title,
return RET_OK;
}

#ifdef ENABLE_ARM64
int Benchmark::PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result) {
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;

for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;

len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);

float tmp = float_t(flags_->num_threads_) * iter.second.second.value[0] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%.2f", tmp);
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);

len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second.value[0] / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);

tmp = float_t(flags_->num_threads_) * iter.second.second.value[1] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%.2f", tmp);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);

len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second.value[1] / op_cost2_total_);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);

rows.push_back(columns);
}

printf("-------------------------------------------------------------------------\n");
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#endif

Benchmark::~Benchmark() {
for (const auto &iter : this->benchmark_data_) {
delete (iter.second);


+ 30
- 1
mindspore/lite/tools/benchmark/benchmark.h View File

@@ -42,6 +42,19 @@ enum MS_API InDataType { kImage = 0, kBinary = 1 };
constexpr float relativeTolerance = 1e-5;
constexpr float absoluteTolerance = 1e-8;

#ifdef ENABLE_ARM64
struct PerfResult {
int64_t nr;
struct {
int64_t value;
int64_t id;
} values[2];
};
struct PerfCount {
int64_t value[2];
};
#endif

struct MS_API CheckTensor {
CheckTensor(const std::vector<size_t> &shape, const std::vector<float> &data,
const std::vector<std::string> &strings_data = {""}) {
@@ -69,6 +82,9 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
AddFlag(&BenchmarkFlags::enable_fp16_, "enableFp16", "Enable float16", false);
AddFlag(&BenchmarkFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
AddFlag(&BenchmarkFlags::perf_profiling_, "perfProfiling",
"Perf event profiling(only instructions statics enabled currently)", false);
AddFlag(&BenchmarkFlags::perf_event_, "perfEvent", "CYCLE|CACHE|STALL", "CYCLE");
// MarkAccuracy
AddFlag(&BenchmarkFlags::benchmark_data_file_, "benchmarkDataFile", "Benchmark data file path", "");
AddFlag(&BenchmarkFlags::benchmark_data_type_, "benchmarkDataType",
@@ -98,6 +114,8 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
bool enable_fp16_ = false;
int warm_up_loop_count_ = 3;
bool time_profiling_ = false;
bool perf_profiling_ = false;
std::string perf_event_ = "CYCLE";
// MarkAccuracy
std::string benchmark_data_file_;
std::string benchmark_data_type_ = "FLOAT";
@@ -146,6 +164,11 @@ class MS_API Benchmark {

int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);

#ifdef ENABLE_ARM64
int PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result);
#endif

int PrintInputData();

// tensorData need to be converter first
@@ -255,7 +278,13 @@ class MS_API Benchmark {
float op_cost_total_ = 0.0f;
std::map<std::string, std::pair<int, float>> op_times_by_type_;
std::map<std::string, std::pair<int, float>> op_times_by_name_;

#ifdef ENABLE_ARM64
int perf_fd = 0;
int perf_fd2 = 0;
float op_cost2_total_ = 0.0f;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
KernelCallBack before_call_back_;
KernelCallBack after_call_back_;
std::mt19937 random_engine_;


Loading…
Cancel
Save