!25092 [MD] API for autotune

Merge pull request !25092 from harshvardhangupta/auto_tune_api
4 years ago · c0f6286fcb
--- a/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
@@ -151,8 +151,8 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
 #ifndef ENABLE_SECURITY
  if (tracing_ != nullptr) {
    cur_batch_num_++;
    RETURN_IF_NOT_OK(tracing_->Record(static_cast<int32_t>(CONNECTOR_DEPTH), cur_connector_capacity_, cur_batch_num_,
                                      cur_connector_size_, ProfilingTime::GetCurMilliSecond()));
    tracing_->Record(static_cast<int32_t>(CONNECTOR_DEPTH), cur_connector_capacity_, cur_batch_num_,
                     cur_connector_size_, ProfilingTime::GetCurMilliSecond());
  }
 #endif
  return Status::OK();
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
@@ -389,7 +389,7 @@ Status DeviceQueueOp::LaunchParallelCopyThread() {
    RETURN_IF_NOT_OK(CircularPool::CreateCircularPool(&pool, -1, kDeviceQueGpuThreadMemory, false, true));
    pool_.push_back(pool);
  }
  gpu_item_connector_ = std::make_unique<GpuItemConnector>(num_workers_, 1, queue_capacity_);
  gpu_connector_ = std::make_unique<GpuConnector>(num_workers_, 1, queue_capacity_);
  receive_queues_.Init(num_workers_, queue_capacity_);
  RETURN_IF_NOT_OK(receive_queues_.Register(tree_->AllTasks()));
  RETURN_IF_NOT_OK(
@@ -417,73 +417,78 @@ Status DeviceQueueOp::PushDataToGPU() {
    RETURN_IF_NOT_OK(tree_->GetProfilingManager()->GetTracingNode(kDeviceQueueTracingName, &node));
    profiling_node = std::dynamic_pointer_cast<DeviceQueueTracing>(node);
    batch_start_time = ProfilingTime::GetCurMilliSecond();
    connector_capacity = gpu_item_connector_->capacity();
    connector_capacity = gpu_connector_->capacity();
  }
 #endif
 #ifdef ENABLE_DUMP_IR
  md_channel_info_->RecordBatchQueue(gpu_item_connector_->size());
  md_channel_info_->RecordBatchQueue(gpu_connector_->size());
  md_channel_info_->RecordPreprocessBatch(0);
 #endif
  std::vector<device::DataItemGpu> items;
  RETURN_IF_NOT_OK(gpu_item_connector_->Pop(0, &items));
  GpuConnectorItem item;
  RETURN_IF_NOT_OK(gpu_connector_->Pop(0, &item));
  auto items = std::move(item.data_item);
  bool eoe_flag = item.eoe_flag;
  int64_t send_batch = 0;
  bool is_open = false;
  uint32_t handle = INVALID_HANDLE;
  auto release_function = std::bind(&DeviceQueueOp::ReleaseData, this, std::placeholders::_1, std::placeholders::_2);
  while (!items.empty() && !GpuBufferMgr::GetInstance().IsClosed()) {
  handle = GpuBufferMgr::GetInstance().Open(0, channel_name_, {}, release_function);
  if (handle == INVALID_HANDLE) {
    return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
                  "[Internal ERROR] Failed to open channel for sending data.");
  }

  while (!(items.empty() && !eoe_flag) && !GpuBufferMgr::GetInstance().IsClosed()) {
    if (!eoe_flag) {
 #ifdef ENABLE_DUMP_IR
    md_channel_info_->RecordBatchQueue(gpu_item_connector_->size());
    md_channel_info_->RecordPreprocessBatch(send_batch);
    md_channel_info_->RecordPushStartTime();
 #endif
    if (!is_open) {
      std::vector<size_t> data_size;
      for (int32_t index = 0; index < items.size(); index++) {
        data_size.push_back(items[index].data_len_);
      }
      handle = GpuBufferMgr::GetInstance().Open(0, channel_name_, data_size, release_function);
      if (handle == INVALID_HANDLE) {
        return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__,
                      "[Internal ERROR] Failed to open channel for sending data.");
      md_channel_info_->RecordBatchQueue(gpu_connector_->size());
      md_channel_info_->RecordPreprocessBatch(send_batch);
      md_channel_info_->RecordPushStartTime();
 #endif
      // Data prefetch only when PS mode enables cache.
      if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, items[0].data_ptr_, items[0].data_len_,
                                                          items[0].data_type_)) {
        return Status(StatusCode::kMDTimeOut, __LINE__, __FILE__,
                      "Failed to prefetch data in current PS mode(cache data when sending).");
      }
      is_open = true;
    }

    // Data prefetch only when PS mode enables cache.
    if (!ps::PsDataPrefetch::GetInstance().PrefetchData(channel_name_, items[0].data_ptr_, items[0].data_len_,
                                                        items[0].data_type_)) {
      return Status(StatusCode::kMDTimeOut, __LINE__, __FILE__,
                    "Failed to prefetch data in current PS mode(cache data when sending).");
    }
    RETURN_IF_NOT_OK(RetryPushData(handle, items));
    send_batch++;
      RETURN_IF_NOT_OK(RetryPushData(handle, items));
      send_batch++;
 #ifndef ENABLE_SECURITY
    if (is_profiling_enable) {
      uint64_t end_time = ProfilingTime::GetCurMilliSecond();
      // record push data time
      profiling_node->Record(TIME, TDT_PUSH_TIME, send_batch, push_cost, end_time);
      int32_t batch_cost = (int32_t)(end_time - batch_start_time);
      // record batch time
      profiling_node->Record(TIME, BATCH_TIME, send_batch, batch_cost, end_time);
      // record pipeline time
      profiling_node->Record(TIME, PIPELINE_TIME, send_batch, batch_cost - push_cost, end_time);
      batch_start_time = end_time;
      // record connector depth
      profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, send_batch, connector_size, end_time);
      connector_size = gpu_item_connector_->size();
      connector_capacity = gpu_item_connector_->capacity();
    }
      if (is_profiling_enable) {
        uint64_t end_time = ProfilingTime::GetCurMilliSecond();
        // record push data time
        profiling_node->Record(TIME, TDT_PUSH_TIME, send_batch, push_cost, end_time);
        int32_t batch_cost = (int32_t)(end_time - batch_start_time);
        // record batch time
        profiling_node->Record(TIME, BATCH_TIME, send_batch, batch_cost, end_time);
        // record pipeline time
        profiling_node->Record(TIME, PIPELINE_TIME, send_batch, batch_cost - push_cost, end_time);
        batch_start_time = end_time;
        // record connector depth
        profiling_node->Record(CONNECTOR_DEPTH, connector_capacity, send_batch, connector_size, end_time);
        connector_size = gpu_connector_->size();
        connector_capacity = gpu_connector_->capacity();
      }
 #endif
 #ifdef ENABLE_DUMP_IR
    md_channel_info_->RecordBatchQueue(gpu_item_connector_->size());
    md_channel_info_->RecordPreprocessBatch(send_batch);
    md_channel_info_->RecordPushEndTime();
      md_channel_info_->RecordBatchQueue(gpu_connector_->size());
      md_channel_info_->RecordPreprocessBatch(send_batch);
      md_channel_info_->RecordPushEndTime();
 #endif
      if (total_batch_ > 0 && send_batch >= total_batch_) {
        break;
      }
    } else {
 #ifndef ENABLE_SECURITY
      if (is_profiling_enable) {
        tree_->SetEpochEnd();
        tree_->GetProfilingManager()->RecordEndOfEpoch(send_batch);
      }
 #endif
    if (total_batch_ > 0 && send_batch >= total_batch_) {
      break;
    }
    if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
      auto rc = gpu_item_connector_->Pop(0, &items);
      auto rc = gpu_connector_->Pop(0, &item);
      items = std::move(item.data_item);
      eoe_flag = item.eoe_flag;
      // If the batches send by dataset are more than gpu calculate, gpu will core for no signal notify.
      if (rc.IsError()) {
        GpuBufferMgr::GetInstance().Close(handle);
@@ -543,25 +548,30 @@ Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
  uint32_t batch_num = 0;
  RETURN_IF_NOT_OK(receive_queues_[worker_id]->PopFront(&current_row));
  while (!current_row.quit() && !GpuBufferMgr::GetInstance().IsClosed()) {
    std::vector<device::DataItemGpu> items;
    for (int i = 0; i < current_row.size(); i++) {
      device::DataItemGpu data_item;
      data_item.data_len_ = static_cast<size_t>(current_row[i]->SizeInBytes());
      data_item.data_ptr_ = nullptr;
      data_item.worker_id_ = worker_id;
      items.push_back(data_item);
    GpuConnectorItem connector_item = {{}, current_row.eoe()};
    if (!connector_item.eoe_flag) {
      std::vector<device::DataItemGpu> items;
      for (auto &i : current_row) {
        device::DataItemGpu data_item;
        data_item.data_len_ = static_cast<size_t>(i->SizeInBytes());
        data_item.data_ptr_ = nullptr;
        data_item.worker_id_ = worker_id;
        items.push_back(data_item);
      }
      RETURN_IF_NOT_OK(MallocForGPUData(&items, current_row, worker_id));
      connector_item.data_item = std::move(items);
      batch_num++;
    } else {
      MS_LOG(INFO) << "EOE Detected";
    }
    RETURN_IF_NOT_OK(MallocForGPUData(&items, current_row, worker_id));
    RETURN_IF_NOT_OK(gpu_item_connector_->Add(worker_id, std::move(items)));
    batch_num++;

    RETURN_IF_NOT_OK(gpu_connector_->Add(worker_id, std::move(connector_item)));
    RETURN_IF_NOT_OK(receive_queues_[worker_id]->PopFront(&current_row));
  }

  MS_LOG(INFO) << "Device queue worker id " << worker_id << "proc " << batch_num << "batch.";
  // Add empty vector as quit flag.
  std::vector<device::DataItemGpu> items;
  RETURN_IF_NOT_OK(gpu_item_connector_->Add(worker_id, std::move(items)));
  // Add empty data_item vector with eoe_flag=false as quit flag.
  GpuConnectorItem connector_item = {{}, false};
  RETURN_IF_NOT_OK(gpu_connector_->Add(worker_id, std::move(connector_item)));
  return Status::OK();
 }

@@ -599,12 +609,12 @@ Status DeviceQueueOp::SendDataToGPU() {
      }
    }

 #ifndef ENABLE_SECURITY
    if (current_row.eoe() && tree_->GetProfilingManager()->IsProfilingEnable()) {
      tree_->SetEpochEnd();
      tree_->GetProfilingManager()->RecordEndOfEpoch(batch_num);
    if (current_row.eoe()) {
      MS_LOG(INFO) << "EOE Detected";
      TensorRow eoe_flag(TensorRow::kFlagEOE);
      RETURN_IF_NOT_OK(receive_queues_[num_buf % num_workers_]->Add(std::move(eoe_flag)));
    }
 #endif

    if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
      RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
    } else {
@@ -613,6 +623,7 @@ Status DeviceQueueOp::SendDataToGPU() {
  }

  for (uint32_t index = 0; index < num_workers_; index++) {
    MS_LOG(INFO) << "Adding quit flag to Workers";
    TensorRow quit_flag(TensorRow::kFlagQuit);
    RETURN_IF_NOT_OK(receive_queues_[num_buf++ % num_workers_]->Add(std::move(quit_flag)));
  }
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
@@ -152,7 +152,7 @@ class DeviceQueueOp : public PipelineOp {

  QueueList<TensorRow> receive_queues_;
  std::vector<std::shared_ptr<MemoryPool>> pool_;
  std::unique_ptr<GpuItemConnector> gpu_item_connector_;
  std::unique_ptr<GpuConnector> gpu_connector_;
  const uint32_t kDeviceQueGpuNumThreads = 2;
  const uint32_t kDeviceQueGpuQueueCapacity = 8;
  const uint32_t kDeviceQueGpuThreadMemory = 1024;
--- a/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/execution_tree.cc
@@ -235,7 +235,7 @@ void ExecutionTree::Iterator::PostOrderTraverse(const std::shared_ptr<DatasetOp>
 ExecutionTree::Iterator::Iterator(const std::shared_ptr<DatasetOp> &root) : ind_(0) {
  // post-order traverse the tree, if root is null, it return
  PostOrderTraverse(root);
  nodes_.emplace_back(nullptr);
  (void)nodes_.emplace_back(nullptr);
 }

 // Given the number of workers, launches the worker entry function for each. Essentially a
--- a/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/gpu_item_connector.h
@@ -30,34 +30,41 @@ using mindspore::device::DataItemGpu;

 namespace mindspore {
 namespace dataset {
 class GpuItemConnector : public Connector<std::vector<device::DataItemGpu>> {

 struct GpuConnectorItem {
  std::vector<device::DataItemGpu> data_item;
  bool eoe_flag;  // flag to indicate an EOE item in the connector
 };

 class GpuConnector : public Connector<GpuConnectorItem> {
 public:
  GpuItemConnector(int32_t num_producers, int32_t num_consumers, int32_t queue_capacity)
      : Connector<std::vector<device::DataItemGpu>>(num_producers, num_consumers, queue_capacity) {
  GpuConnector(int32_t num_producers, int32_t num_consumers, int32_t queue_capacity)
      : Connector<GpuConnectorItem>(num_producers, num_consumers, queue_capacity) {
    for (int i = 0; i < num_producers; i++) {
      is_queue_finished_.push_back(false);
    }
  }

  ~GpuItemConnector() = default;
  ~GpuConnector() = default;

  Status Add(int32_t worker_d, std::vector<device::DataItemGpu> &&element) noexcept {
    return Connector<std::vector<device::DataItemGpu>>::Push(worker_d, std::move(element));
  Status Add(int32_t worker_d, GpuConnectorItem &&element) noexcept {
    return Connector<GpuConnectorItem>::Push(worker_d, std::move(element));
  }

  Status Pop(int32_t worker_id, std::vector<device::DataItemGpu> *result) noexcept override {
  Status Pop(int32_t worker_id, GpuConnectorItem *result) noexcept override {
    RETURN_UNEXPECTED_IF_NULL(result);
    {
      MS_ASSERT(worker_id < num_consumers_);
      std::unique_lock<std::mutex> lock(m_);
      RETURN_IF_NOT_OK(cv_.Wait(&lock, [this, worker_id]() { return expect_consumer_ == worker_id; }));
      if (is_queue_finished_[pop_from_]) {
        std::string errMsg = "ERROR: popping from a finished queue in GpuItemConnector";
        std::string errMsg = "ERROR: popping from a finished queue in GpuConnector";
        RETURN_STATUS_UNEXPECTED(errMsg);
      }

      RETURN_IF_NOT_OK(queues_[pop_from_]->PopFront(result));
      if ((*result).empty()) {
      // empty data_item and eoe_flag=false is EOF
      if ((*result).data_item.empty() && !(*result).eoe_flag) {
        is_queue_finished_[pop_from_] = true;
      }

@@ -81,5 +88,5 @@ class GpuItemConnector : public Connector<std::vector<device::DataItemGpu>> {
 };
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
 #endif  // ENABLE_GPUQUE
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_GPU_ITEM_CONNECTOR_H_
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/CMakeLists.txt
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/CMakeLists.txt
@@ -1,9 +1,9 @@
 add_library(engine-perf OBJECT
    profiling.cc
    monitor.cc
    device_queue_tracing.cc
    connector_size.cc
    dataset_iterator_tracing.cc
    connector_throughput.cc
    cpu_sampling.cc
        )
 add_library(
        engine-perf OBJECT
        profiling.cc
        monitor.cc
        device_queue_tracing.cc
        connector_size.cc
        dataset_iterator_tracing.cc
        cpu_sampler.cc
 )
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.cc
@@ -15,6 +15,8 @@
 */
 #include "minddata/dataset/engine/perf/connector_size.h"
 #include <fstream>
 #include <algorithm>
 #include <memory>
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/engine/execution_tree.h"
 #include "minddata/dataset/util/path.h"
@@ -27,10 +29,12 @@ using Qrow = std::vector<int>;
 // Sample action
 Status ConnectorSize::Sample() {
  Qrow cur_row;
  std::transform(tree_->begin(), tree_->end(), std::back_inserter(cur_row),
                 [](DatasetOp &op) { return op.ConnectorSize(); });
  (void)std::transform(tree_->begin(), tree_->end(), std::back_inserter(cur_row),
                       [](DatasetOp &op) { return op.ConnectorSize(); });
  std::lock_guard<std::mutex> guard(lock_);
  // Push new row of sample
  sample_table_.push_back(cur_row);
  (void)ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
  return Status::OK();
 }

@@ -70,8 +74,8 @@ Status ConnectorSize::SaveToFile() {
  // Traverse the ExecutionTree for JSON node generation
  for (auto &node : *tree_) {
    std::vector<int32_t> cur_queue_size;
    std::transform(sample_table_.begin(), sample_table_.end(), std::back_inserter(cur_queue_size),
                   [&](const ConnectorSizeSample &sample) { return sample[idx]; });
    (void)std::transform(sample_table_.begin(), sample_table_.end(), std::back_inserter(cur_queue_size),
                         [&](const ConnectorSizeSample &sample) { return sample[idx]; });
    if (!path.Exists()) {
      json json_node = ParseOpInfo(node, cur_queue_size);
      output["op_info"].push_back(json_node);
@@ -102,5 +106,37 @@ Status ConnectorSize::Init(const std::string &dir_path, const std::string &devic
 }

 Status ConnectorSize::Analyze() { return Status::OK(); }

 Status ConnectorSize::GetOpConnectorSize(int32_t op_id, uint64_t start_time, uint64_t end_time,
                                         std::vector<int32_t> *result) {
  MS_LOG(DEBUG) << "Op_id: " << op_id << " start_ts: " << start_time << " end_ts: " << end_time;
  CHECK_FAIL_RETURN_UNEXPECTED(start_time < end_time,
                               "Expected start_time < end_time. Got start_ts: " + std::to_string(start_time) +
                                 " end_ts: " + std::to_string(end_time));
  std::lock_guard<std::mutex> guard(lock_);
  CHECK_FAIL_RETURN_UNEXPECTED(
    ts_.size() == sample_table_.size(),
    "Expected ts_.size() == sample_table_.size(). Got ts_.size: " + std::to_string(ts_.size()) +
      " sample_table_.size: " + std::to_string(sample_table_.size()));
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_time);
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_time);
  // get ts_ indices
  auto start_index = std::distance(ts_.begin(), lower);
  auto end_index = std::distance(ts_.begin(), upper);
  MS_LOG(INFO) << "start_index: " << start_index << " end_index: " << end_index;
  CHECK_FAIL_RETURN_UNEXPECTED(start_index < end_index,
                               "Expected start_index < end_index. Got start_index: " + std::to_string(start_index) +
                                 " end_index: " + std::to_string(end_index));
  // convert indices to sample_table_ iterator
  auto first_iter = sample_table_.begin() + start_index;
  auto last_iter = sample_table_.begin() + end_index;
  // op_id corresponds to the index in sample vector
  (void)std::transform(first_iter, last_iter, std::back_inserter(*result),
                       [&](const ConnectorSizeSample &sample) { return sample[op_id]; });

  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_size.h
@@ -40,6 +40,7 @@ class ConnectorSize : public Sampling {
  // A circular buffer will be implemented in the future to make this table more flexible.
  using ConnectorSizeSample = std::vector<int>;
  using ConnectorSizeSampleTable = std::vector<ConnectorSizeSample>;
  using Timestamps = std::vector<uint64_t>;

 public:
  explicit ConnectorSize(ExecutionTree *tree) : tree_(tree) {}
@@ -62,13 +63,17 @@ class ConnectorSize : public Sampling {
  json ParseOpInfo(const DatasetOp &node, const std::vector<int32_t> &size);

  // Change file mode after save throughput data
  Status ChangeFileMode() { return Status::OK(); }
  Status ChangeFileMode() override { return Status::OK(); }

  Status Analyze() override;

  // Get the vector of connector sizes of given op for samples taken between start and end time
  Status GetOpConnectorSize(int32_t op_id, uint64_t start_time, uint64_t end_time, std::vector<int32_t> *result);

 private:
  ExecutionTree *tree_ = nullptr;          // ExecutionTree pointer
  ConnectorSizeSampleTable sample_table_;  // Dataset structure to store all samples of connector size sampling
  Timestamps ts_;                          // time of sample
 };

 }  // namespace dataset
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.cc
@@ -1,154 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include <sys/stat.h>
 #include <iterator>
 #include <algorithm>
 #include <memory>
 #include <string>
 #include <nlohmann/json.hpp>
 #include "minddata/dataset/engine/perf/connector_throughput.h"
 #include "minddata/dataset/engine/execution_tree.h"
 #include "minddata/dataset/util/path.h"

 namespace mindspore {
 namespace dataset {

 // temporary helper
 int ConnectorThroughput::InitNodes() {
  if (tree_ == nullptr) {
    return 0;
  }
  auto it = (*tree_).begin();
  return it.NumNodes();
 }
 // Sample action
 Status ConnectorThroughput::Sample() {
  std::vector<int64_t> out_row_count_row(n_nodes_);
  std::vector<double> throughput_row(n_nodes_);
  TimePoint cur_time;  // initialised inside the loop, used outside the loop to update prev sample time.
  auto col = 0;
  for (const auto &node : *tree_) {
    auto cur_out_rows_count = node.ConnectorOutRowsCount();
    out_row_count_row[col] = cur_out_rows_count;
    auto sz = timestamps_.size();
    cur_time = std::chrono::steady_clock::now();
    double data_time = 0;
    if (sz > 1) {
      auto full_time =
        std::chrono::duration_cast<std::chrono::microseconds>(timestamps_[0][sz - 1] - timestamps_[0][sz - 2]);
      data_time = std::chrono::duration<double>(full_time).count();
    }
    auto prev_out_rows_count = out_row_count_table_[col][out_row_count_table_.size() - 1];
    if (data_time != 0) {
      const int32_t multiplier = 1000;
      auto thr = (cur_out_rows_count - prev_out_rows_count) / (multiplier * data_time);
      throughput_row[col] = thr;
    } else {
      throughput_row[col] = 0;
    }
    col++;
  }
  std::vector<TimePoint> v = {cur_time};  // temporary fix
  timestamps_.AddSample(v);
  // Push new row of sample
  out_row_count_table_.AddSample(out_row_count_row);
  throughput_.AddSample(throughput_row);
  return Status::OK();
 }

 json ConnectorThroughput::ParseOpInfo(const DatasetOp &node, const std::vector<double> &thr) {
  auto children = node.Children();
  std::vector<int32_t> children_id;
  std::transform(children.begin(), children.end(), std::back_inserter(children_id),
                 [](const std::shared_ptr<DatasetOp> &op) -> int32_t { return op ? op->id() : 0; });
  json json_node;
  json_node["op_id"] = node.id();
  json_node["op_type"] = node.Name();
  json_node["num_workers"] = node.NumWorkers();
  json metrics;
  // DeviceQueueOp is a special op,it is not inlined but its output queue is invalid.
  // So we should not output its connector throughput.
  if (!node.inlined() && node.Name() != "DeviceQueueOp") {
    metrics["output_queue"] = {{"throughput", thr}};
  }
  json_node["metrics"] = metrics;
  if (!children_id.empty()) {
    json_node["children"] = children_id;
  }

  return json_node;
 }

 // Save profiling data to file
 // If the file is already exist (created by other sampling node), simply add the data to metrics field.
 Status ConnectorThroughput::SaveToFile() {
  json output;
  RETURN_IF_NOT_OK(ReadJson(&output));

  Path path = Path(file_path_);
  // Traverse the ExecutionTree for JSON node generation
  int col = 0;
  for (auto &node : *tree_) {
    std::vector<double> throughput;
    if (throughput_.size() > col) {
      for (auto i = 0; i < throughput_[col].size(); i++) {
        throughput.push_back(throughput_[col][i]);
      }
    }

    if (!path.Exists()) {
      json json_node = ParseOpInfo(node, throughput);
      output["op_info"].push_back(json_node);
    } else {
      if (!node.inlined() && node.Name() != "DeviceQueueOp") {
        auto &ops_data = output["op_info"];
        ops_data[col]["metrics"]["output_queue"]["throughput"] = throughput;
      }
    }
    col++;
  }

  // Discard the content of the file when opening.
  std::ofstream os(file_path_, std::ios::trunc);
  os << output;
  os.close();
  return Status::OK();
 }

 Status ConnectorThroughput::Init(const std::string &dir_path, const std::string &device_id) {
  file_path_ = (Path(dir_path) / Path("pipeline_profiling_" + device_id + ".json")).ToString();
  Path path = Path(file_path_);
  // Remove the file if it exists (from prior profiling usage)
  RETURN_IF_NOT_OK(path.Remove());
  return Status::OK();
 }

 Status ConnectorThroughput::ChangeFileMode() {
  if (file_path_.empty()) {
    return Status::OK();
  }

  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
  }
  return Status::OK();
 }

 Status ConnectorThroughput::Analyze() { return Status::OK(); }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/connector_throughput.h
@@ -1,92 +0,0 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CONNECTOR_THROUGHPUT_H
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_CONNECTOR_THROUGHPUT_H

 #include <vector>
 #include <chrono>
 #include <fstream>
 #include <string>
 #include <nlohmann/json.hpp>
 #include "minddata/dataset/engine/perf/profiling.h"
 #include "minddata/dataset/engine/perf/perf_data.h"
 #include "minddata/dataset/engine/perf/cyclic_array.h"
 #include "minddata/dataset/engine/datasetops/dataset_op.h"
 #include "minddata/dataset/engine/execution_tree.h"

 using json = nlohmann::json;
 namespace mindspore {
 namespace dataset {
 // Connector throughput samples the output connector size of each op in the pipeline.
 // For the description of the data structure see perf_data.h
 // It support JSON serialization for external usage.
 class ConnectorThroughput : public Sampling {
  using OutRowCount = PerfData<CyclicArray<int64_t>>;
  using Throughput = PerfData<CyclicArray<double>>;
  using TimePoint = std::chrono::time_point<std::chrono::steady_clock>;
  using TimeStamps = PerfData<CyclicArray<TimePoint>>;

 public:
  explicit ConnectorThroughput(ExecutionTree *tree, int64_t max_rows = 1000000)
      : tree_(tree),
        max_rows_(max_rows),
        n_nodes_(InitNodes()),
        out_row_count_table_(OutRowCount(max_rows_, n_nodes_)),
        throughput_(Throughput(max_rows_, n_nodes_)),
        timestamps_(TimeStamps(max_rows_, 1)) {
    timestamps_.AddSample(std::vector<TimePoint>(1));
    out_row_count_table_.AddSample(std::vector<int64_t>(n_nodes_));
  }

  /// \brief Destructor
  ~ConnectorThroughput() = default;

  // Driver function for connector size sampling.
  // This function samples the connector size of every nodes within the ExecutionTree
  Status Sample() override;

  // Traverse the tree nodes and count them
  int InitNodes();

  std::string Name() const override { return name_; };

  // Save sampling data to file
  // @return Status The status code returned
  Status SaveToFile() override;

  Status Init(const std::string &dir_path, const std::string &device_id) override;

  json ParseOpInfo(const DatasetOp &node, const std::vector<double> &thr);

  Status ChangeFileMode() override;

  Status Analyze() override;

 private:
  ExecutionTree *tree_ = nullptr;  // ExecutionTree pointer
  int64_t max_rows_;
  int32_t n_nodes_;
  OutRowCount out_row_count_table_;
  Throughput throughput_;
  TimeStamps timestamps_;
  std::string name_ = kConnectorThroughputSamplingName;
 };

 }  // namespace dataset
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_CONNECTOR_THROUGHPUT_H
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampler.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampler.cc
@@ -0,0 +1,511 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "minddata/dataset/engine/perf/cpu_sampler.h"

 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #include <sys/syscall.h>
 #endif
 #include <cmath>
 #include <cstdio>
 #include <algorithm>
 #include <utility>
 #include <fstream>
 #include <memory>
 #include <string>

 #include "minddata/dataset/api/python/pybind_conversion.h"
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/engine/execution_tree.h"
 #include "minddata/dataset/util/path.h"

 namespace mindspore {
 namespace dataset {

 using json = nlohmann::json;
 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #define USING_LINUX
 #endif

 #if defined(USING_LINUX)
 int32_t SystemCpuInfo::num_cpu_ = get_nprocs_conf();
 #else
 int32_t SystemCpuInfo::num_cpu_ = 0;
 #endif

 Status SystemCpuInfo::ParseCpuInfo(const std::string &str) {
  SystemStat system_cpu_stat;
  uint64_t nice = 0;
  uint64_t irq = 0;
  uint64_t softirq = 0;
  if (sscanf_s(str.c_str(), "%*s %lu %lu %lu %lu %lu %lu %lu", &system_cpu_stat.user_stat, &nice,
               &system_cpu_stat.sys_stat, &system_cpu_stat.idle_stat, &system_cpu_stat.io_stat, &irq,
               &softirq) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get System CPU failed.");
  }

  system_cpu_stat.total_stat = system_cpu_stat.user_stat + nice + system_cpu_stat.sys_stat + system_cpu_stat.idle_stat +
                               system_cpu_stat.io_stat + irq + softirq;
  SystemUtil system_cpu_util = {0, 0, 0, 0};
  // Calculate the utilization from the second sampling
  if (!first_sample_) {
    system_cpu_util.user_utilization = round((system_cpu_stat.user_stat - prev_sys_stat_.user_stat) * 1.0 /
                                             (system_cpu_stat.total_stat - prev_sys_stat_.total_stat) * 100);
    system_cpu_util.sys_utilization = round((system_cpu_stat.sys_stat - prev_sys_stat_.sys_stat) * 1.0 /
                                            (system_cpu_stat.total_stat - prev_sys_stat_.total_stat) * 100);
    system_cpu_util.io_utilization = round((system_cpu_stat.io_stat - prev_sys_stat_.io_stat) * 1.0 /
                                           (system_cpu_stat.total_stat - prev_sys_stat_.total_stat) * 100);
    system_cpu_util.idle_utilization = round((system_cpu_stat.idle_stat - prev_sys_stat_.idle_stat) * 1.0 /
                                             (system_cpu_stat.total_stat - prev_sys_stat_.total_stat) * 100);
  }
  // append the 0 util as well to maintain sys_cpu_util_.size == ts_.size
  (void)sys_cpu_util_.emplace_back(system_cpu_util);
  prev_sys_stat_ = system_cpu_stat;
  return Status::OK();
 }

 Status SystemCpuInfo::ParseCtxt(const std::string &str) {
  uint64_t ctxt;
  if (sscanf_s(str.c_str(), "%*s %lu", &ctxt) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get context switch count failed.");
  }
  // first context switch count will be 0
  auto val = first_sample_ ? 0 : ctxt - prev_context_switch_count_;
  context_switch_count_.push_back(val);
  prev_context_switch_count_ = ctxt;
  return Status::OK();
 }

 Status SystemCpuInfo::ParseRunningProcess(const std::string &str) {
  uint32_t running_process;
  if (sscanf_s(str.c_str(), "%*s %ud", &running_process) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get context switch count failed.");
  }
  running_process_.push_back(running_process);
  return Status::OK();
 }

 Status SystemCpuInfo::SampleAndGetCurrPrevStat(SystemStat *current_stat, SystemStat *previous_stat) {
  std::ifstream file("/proc/stat");
  if (!file.is_open()) {
    MS_LOG(INFO) << "Failed to open /proc/stat file";
    return {StatusCode::kMDUnexpectedError, "Failed to open /proc/stat file"};
  }
  *previous_stat = prev_sys_stat_;
  bool first_line = true;
  std::string line;
  while (getline(file, line)) {
    if (first_line) {
      first_line = false;
      RETURN_IF_NOT_OK(ParseCpuInfo(line));
    }
    if (line.find("ctxt") != std::string::npos) {
      RETURN_IF_NOT_OK(ParseCtxt(line));
    }
    if (line.find("procs_running") != std::string::npos) {
      RETURN_IF_NOT_OK(ParseRunningProcess(line));
    }
  }
  // after the loop above, prev_sys_stat_ has the current value
  *current_stat = prev_sys_stat_;
  file.close();

  first_sample_ = false;
  return Status::OK();
 }

 Status SystemCpuInfo::GetUserCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const {
  MS_LOG(DEBUG) << "start_index: " << start_index << " end_index: " << end_index
                << " sys_cpu_util.size: " << sys_cpu_util_.size();
  CHECK_FAIL_RETURN_UNEXPECTED(start_index < end_index,
                               "Expected start_index < end_index. Got start_index: " + std::to_string(start_index) +
                                 " end_index: " + std::to_string(end_index));
  CHECK_FAIL_RETURN_UNEXPECTED(
    end_index <= sys_cpu_util_.size(),
    "Expected end_index <= sys_cpu_util_.size(). Got end_index: " + std::to_string(end_index) +
      " sys_cpu_util_.size: " + std::to_string(sys_cpu_util_.size()));
  (void)std::transform(sys_cpu_util_.begin() + start_index, sys_cpu_util_.begin() + end_index,
                       std::back_inserter(*result), [&](const SystemUtil &info) { return info.user_utilization; });
  return Status::OK();
 }

 Status SystemCpuInfo::GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const {
  MS_LOG(DEBUG) << "start_index: " << start_index << " end_index: " << end_index
                << "sys_cpu_util.size: " << sys_cpu_util_.size();
  CHECK_FAIL_RETURN_UNEXPECTED(start_index < end_index,
                               "Expected start_index < end_index. Got start_index: " + std::to_string(start_index) +
                                 " end_index: " + std::to_string(end_index));
  CHECK_FAIL_RETURN_UNEXPECTED(
    end_index <= sys_cpu_util_.size(),
    "Expected end_index <= sys_cpu_util_.size(). Got end_index: " + std::to_string(end_index) +
      " sys_cpu_util_.size: " + std::to_string(sys_cpu_util_.size()));
  (void)std::transform(sys_cpu_util_.begin() + start_index, sys_cpu_util_.begin() + end_index,
                       std::back_inserter(*result), [&](const SystemUtil &info) { return info.sys_utilization; });
  return Status::OK();
 }

 std::vector<uint8_t> SystemCpuInfo::GetIOCpuUtil() const {
  std::vector<uint8_t> io_util;
  (void)std::transform(sys_cpu_util_.begin(), sys_cpu_util_.end(), std::back_inserter(io_util),
                       [&](const SystemUtil &info) { return info.io_utilization; });
  return io_util;
 }

 std::vector<uint8_t> SystemCpuInfo::GetIdleCpuUtil() const {
  std::vector<uint8_t> idle_util;
  (void)std::transform(sys_cpu_util_.begin(), sys_cpu_util_.end(), std::back_inserter(idle_util),
                       [&](const SystemUtil &info) { return info.idle_utilization; });
  return idle_util;
 }

 std::vector<uint16_t> TaskCpuInfo::GetSysCpuUtil() const {
  std::vector<uint16_t> sys_util;
  (void)std::transform(
    task_cpu_util_.begin(), task_cpu_util_.end(), std::back_inserter(sys_util), [&](const TaskUtil &info) {
      return static_cast<uint16_t>(info.sys_utilization * static_cast<float>(SystemCpuInfo::num_cpu_));
    });
  return sys_util;
 }

 std::vector<uint16_t> TaskCpuInfo::GetUserCpuUtil() const {
  std::vector<uint16_t> user_util;
  (void)std::transform(
    task_cpu_util_.begin(), task_cpu_util_.end(), std::back_inserter(user_util), [&](const TaskUtil &info) {
      return static_cast<uint16_t>(info.user_utilization * static_cast<float>(SystemCpuInfo::num_cpu_));
    });
  return user_util;
 }

 TaskUtil TaskCpuInfo::GetLatestCpuUtil() const {
  TaskUtil ret = {0, 0};
  if (!task_cpu_util_.empty() && !last_sampling_failed_) {
    ret = task_cpu_util_.back();
  }
  return ret;
 }

 Status ProcessCpuInfo::Sample(uint64_t total_time_elapsed) {
  std::ifstream file("/proc/" + std::to_string(pid_) + "/stat");
  if (!file.is_open()) {
    MS_LOG(INFO) << "Failed to open /proc/" << pid_ << "/stat/ file";
    last_sampling_failed_ = true;
    return Status::OK();
  }
  std::string str;
  (void)getline(file, str);
  uint64_t utime = 0, stime = 0;
  if (sscanf_s(str.c_str(), "%*d %*s %*s %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %lu %lu", &utime, &stime) ==
      EOF) {
    file.close();
    last_sampling_failed_ = true;
    return Status(StatusCode::kMDUnexpectedError, "Get device CPU failed.");
  }
  file.close();
  last_sampling_failed_ = false;
  if (!first_sample_) {
    float user_util = (utime - prev_task_stat_.user_stat) * 1.0 / (total_time_elapsed)*100.0;
    float sys_util = (stime - prev_task_stat_.sys_stat) * 1.0 / (total_time_elapsed)*100.0;
    (void)task_cpu_util_.emplace_back(TaskUtil{user_util, sys_util});
  }
  prev_task_stat_.user_stat = utime;
  prev_task_stat_.sys_stat = stime;
  first_sample_ = false;
  return Status::OK();
 }

 Status ThreadCpuInfo::Sample(uint64_t total_time_elapsed) {
  std::ifstream file("/proc/" + std::to_string(pid_) + "/task/" + std::to_string(tid_) + "/stat");
  if (!file.is_open()) {
    MS_LOG(INFO) << "Failed to open /proc/" << pid_ << "/task/" << tid_ << "/stat file";
    last_sampling_failed_ = true;
    return Status::OK();
  }
  std::string str;
  (void)getline(file, str);
  uint64_t utime;
  uint64_t stime;
  if (sscanf_s(str.c_str(), "%*d %*s %*s %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %lu %lu", &utime, &stime) ==
      EOF) {
    file.close();
    last_sampling_failed_ = true;
    return Status(StatusCode::kMDUnexpectedError, "Get thread CPU failed.");
  }
  file.close();
  last_sampling_failed_ = false;
  if (!first_sample_) {
    float user_util = ((utime - prev_task_stat_.user_stat) * 1.0 / total_time_elapsed) * 100.0;
    float sys_util = ((stime - prev_task_stat_.sys_stat) * 1.0 / total_time_elapsed) * 100.0;
    (void)task_cpu_util_.emplace_back(TaskUtil{user_util, sys_util});
  }
  prev_task_stat_.user_stat = utime;
  prev_task_stat_.sys_stat = stime;
  first_sample_ = false;
  return Status::OK();
 }

 bool MDOperatorCpuInfo::TaskExists(pid_t id) const { return task_by_id_.find(id) != task_by_id_.end(); }

 void MDOperatorCpuInfo::AddTask(const std::shared_ptr<TaskCpuInfo> &task_ptr) {
  auto id = task_ptr->GetId();
  if (!TaskExists(id)) {
    (void)task_by_id_.emplace(id, task_ptr);
  }
 }

 void MDOperatorCpuInfo::CalculateOperatorUtilization() {
  OpUtil op_util{0, 0};
  for (auto const &[task_id, task_ptr] : task_by_id_) {
    MS_LOG(DEBUG) << "Processing task_id: " << task_id;
    auto task_util = task_ptr->GetLatestCpuUtil();
    op_util.user_utilization += task_util.user_utilization;
    op_util.sys_utilization += task_util.sys_utilization;
  }
  (void)op_cpu_util_.emplace_back(op_util);
 }

 Status MDOperatorCpuInfo::GetUserCpuUtil(uint64_t start_index, uint64_t end_index,
                                         std::vector<uint16_t> *result) const {
  MS_LOG(DEBUG) << "start_index: " << start_index << " end_index: " << end_index
                << " op_cpu_util_.size: " << op_cpu_util_.size();
  CHECK_FAIL_RETURN_UNEXPECTED(start_index < end_index,
                               "Expected start_index < end_index. Got start_index: " + std::to_string(start_index) +
                                 " end_index: " + std::to_string(end_index));
  CHECK_FAIL_RETURN_UNEXPECTED(
    end_index <= op_cpu_util_.size(),
    "Expected end_index <= op_cpu_util_.size(). Got end_index: " + std::to_string(end_index) +
      " op_cpu_util_.size: " + std::to_string(op_cpu_util_.size()));
  auto first_iter = op_cpu_util_.begin() + start_index;
  auto last_iter = op_cpu_util_.begin() + end_index;
  (void)std::transform(first_iter, last_iter, std::back_inserter(*result), [&](const OpUtil &info) {
    return static_cast<uint16_t>(info.user_utilization * static_cast<float>(SystemCpuInfo::num_cpu_));
  });
  return Status::OK();
 }

 Status MDOperatorCpuInfo::GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint16_t> *result) const {
  MS_LOG(DEBUG) << "start_index: " << start_index << " end_index: " << end_index
                << " op_cpu_util_.size: " << op_cpu_util_.size();
  CHECK_FAIL_RETURN_UNEXPECTED(start_index < end_index,
                               "Expected start_index < end_index. Got start_index: " + std::to_string(start_index) +
                                 " end_index: " + std::to_string(end_index));
  CHECK_FAIL_RETURN_UNEXPECTED(
    end_index <= op_cpu_util_.size(),
    "Expected end_index <= op_cpu_util_.size(). Got end_index: " + std::to_string(end_index) +
      " op_cpu_util_.size: " + std::to_string(op_cpu_util_.size()));
  auto first_iter = op_cpu_util_.begin() + start_index;
  auto last_iter = op_cpu_util_.begin() + end_index;
  (void)std::transform(first_iter, last_iter, std::back_inserter(*result), [&](const OpUtil &info) {
    return static_cast<uint16_t>(info.sys_utilization * static_cast<float>(SystemCpuInfo::num_cpu_));
  });
  return Status::OK();
 }

 Status CpuSampler::Sample() {
  std::lock_guard<std::mutex> guard(lock_);
  // Function to Update TaskList
  // Loop through all tasks to find any new threads
  // Get all multi-processing Ops from Python only if fetched_all_process = False
  // Create new TaskCpuInfo as required and update OpInfo
  RETURN_IF_NOT_OK(UpdateTaskList());

  // Sample SystemInfo - Update current and move current to previous stat and calc Util
  SystemStat current_sys_stat;
  SystemStat previous_sys_stat;
  RETURN_IF_NOT_OK(sys_cpu_info_.SampleAndGetCurrPrevStat(&current_sys_stat, &previous_sys_stat));
  auto total_time_elapsed = current_sys_stat.total_stat - previous_sys_stat.total_stat;

  // Call Sample on all
  // Read /proc/ files and get stat, calculate util
  for (auto &task_ptr : tasks_) {
    (void)task_ptr->Sample(total_time_elapsed);
  }

  // Calculate OperatorCpuInfo
  for (auto &[op_id, op_info] : op_info_by_id_) {
    MS_LOG(DEBUG) << "Calculate operator cpu utilization for OpId: " << op_id;
    op_info.CalculateOperatorUtilization();
  }

  // Get sampling time.
  (void)ts_.emplace_back(ProfilingTime::GetCurMilliSecond());

  return Status::OK();
 }

 Status CpuSampler::UpdateTaskList() {
  List<Task> allTasks = tree->AllTasks()->GetTask();
  for (auto &task : allTasks) {
    int32_t op_id = task.get_operator_id();
    // check if the op_info was initialized in Init
    auto iter = op_info_by_id_.find(op_id);
    if (iter != op_info_by_id_.end()) {
      int32_t tid = task.get_linux_id();
      if (!iter->second.TaskExists(tid)) {
        auto task_cpu_info_ptr = std::make_shared<ThreadCpuInfo>(main_pid_, tid);
        (void)tasks_.emplace_back(task_cpu_info_ptr);
        iter->second.AddTask(task_cpu_info_ptr);
      }
    }
  }

  if (!fetched_all_python_multiprocesses_) {
    py::gil_scoped_acquire gil_acquire;
    py::module ds = py::module::import("mindspore.dataset.engine.datasets");
    py::tuple process_info = ds.attr("_get_operator_process")();
    auto sub_process = py::reinterpret_borrow<py::dict>(process_info[0]);
    fetched_all_python_multiprocesses_ = py::reinterpret_borrow<py::bool_>(process_info[1]);
    // parse dict value
    auto op_to_process = toIntMap(sub_process);
    for (auto const &[op_id, process_list] : op_to_process) {
      for (auto pid : process_list) {
        auto iter = op_info_by_id_.find(op_id);
        if (iter != op_info_by_id_.end()) {
          if (!iter->second.TaskExists(pid)) {
            auto task_cpu_info_ptr = std::make_shared<ProcessCpuInfo>(pid);
            (void)tasks_.emplace_back(task_cpu_info_ptr);
            iter->second.AddTask(task_cpu_info_ptr);
          }
        }
      }
    }
  }

  return Status::OK();
 }

 Status CpuSampler::Init(const std::string &dir_path, const std::string &device_id) {
 #if defined(USING_LINUX)
  main_pid_ = syscall(SYS_getpid);
 #endif
  auto path = Path(dir_path) / Path("minddata_cpu_utilization_" + device_id + ".json");
  // remove file if it already exists
  RETURN_IF_NOT_OK(path.Remove());
  file_path_ = path.ToString();
  for (auto iter = tree->begin(); iter != tree->end(); iter++) {
    auto op_id = iter->id();
    (void)op_info_by_id_.emplace(std::make_pair(op_id, MDOperatorCpuInfo(op_id)));
  }
  // thread id of main thread is same as the process ID
  main_thread_cpu_info_ = std::make_shared<ThreadCpuInfo>(main_pid_, main_pid_);
  (void)tasks_.emplace_back(main_thread_cpu_info_);
  main_process_cpu_info_ = std::make_shared<ProcessCpuInfo>(main_pid_);
  (void)tasks_.emplace_back(main_process_cpu_info_);
  return Status::OK();
 }

 Status CpuSampler::ChangeFileMode() {
  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
  }
  return Status::OK();
 }

 Status CpuSampler::SaveToFile() {
  // construct json obj to write to file
  json output;
  output["cpu_processor_num"] = SystemCpuInfo::num_cpu_;
  std::vector<uint8_t> system_user_util, system_sys_util;
  // end_index = ts_.size() essentially means to get all sampled points
  (void)sys_cpu_info_.GetUserCpuUtil(0, ts_.size(), &system_user_util);
  (void)sys_cpu_info_.GetSysCpuUtil(0, ts_.size(), &system_sys_util);
  output["device_info"] = {{"context_switch_count", sys_cpu_info_.GetContextSwitchCount()},
                           {"idle_utilization", sys_cpu_info_.GetIdleCpuUtil()},
                           {"io_utilization", sys_cpu_info_.GetIOCpuUtil()},
                           {"sys_utilization", system_sys_util},
                           {"user_utilization", system_user_util},
                           {"runnable_process", sys_cpu_info_.GetRunningProcess()}};
  // array of op_info json objects
  json op_infos;
  for (auto &[op_id, op_info] : op_info_by_id_) {
    MS_LOG(INFO) << "Processing op_id: " << op_id;
    std::vector<uint16_t> user_util, sys_util;
    (void)op_info.GetSysCpuUtil(0, ts_.size(), &sys_util);
    (void)op_info.GetUserCpuUtil(0, ts_.size(), &user_util);
    json op_info_json = {{"metrics", {{"user_utilization", user_util}, {"sys_utilization", sys_util}}},
                         {"op_id", op_id}};
    op_infos.emplace_back(op_info_json);
  }
  output["op_info"] = op_infos;

  output["process_info"] = {{"user_utilization", main_process_cpu_info_->GetUserCpuUtil()},
                            {"sys_utilization", main_process_cpu_info_->GetSysCpuUtil()}};

  output["sampling_interval"] = GlobalContext::config_manager()->monitor_sampling_interval();
  output["time_stamp"] = ts_;

  // Discard the content of the file when opening.
  std::ofstream os(file_path_, std::ios::trunc);
  os << output;
  os.close();

  return Status::OK();
 }

 Status CpuSampler::Analyze() { return Status::OK(); }

 Status CpuSampler::GetOpUserCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result) {
  std::lock_guard<std::mutex> guard(lock_);
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  // std::distance is O(1) since vector allows random access
  auto start_index = std::distance(ts_.begin(), lower);
  auto end_index = std::distance(ts_.begin(), upper);
  auto op_info = op_info_by_id_.find(op_id);
  CHECK_FAIL_RETURN_UNEXPECTED(op_info != op_info_by_id_.end(), "Op Id: " + std::to_string(op_id) + " not found.");
  return op_info->second.GetUserCpuUtil(start_index, end_index, result);
 }

 Status CpuSampler::GetOpSysCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result) {
  std::lock_guard<std::mutex> guard(lock_);
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  // std::distance is O(1) since vector allows random access
  auto start_index = std::distance(ts_.begin(), lower);
  auto end_index = std::distance(ts_.begin(), upper);
  auto op_info = op_info_by_id_.find(op_id);
  CHECK_FAIL_RETURN_UNEXPECTED(op_info != op_info_by_id_.end(), "Op Id: " + std::to_string(op_id) + " not found.");
  return op_info->second.GetSysCpuUtil(start_index, end_index, result);
 }

 Status CpuSampler::GetSystemUserCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  std::lock_guard<std::mutex> guard(lock_);
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  // std::distance is O(1) since vector allows random access
  auto start_index = std::distance(ts_.begin(), lower);
  auto end_index = std::distance(ts_.begin(), upper);
  return sys_cpu_info_.GetUserCpuUtil(start_index, end_index, result);
 }

 Status CpuSampler::GetSystemSysCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result) {
  std::lock_guard<std::mutex> guard(lock_);
  // find first ts that is not less than start_ts
  auto lower = std::lower_bound(ts_.begin(), ts_.end(), start_ts);
  // find first ts that is greater than end_ts
  auto upper = std::upper_bound(ts_.begin(), ts_.end(), end_ts);
  // std::distance is O(1) since vector allows random access
  auto start_index = std::distance(ts_.begin(), lower);
  auto end_index = std::distance(ts_.begin(), upper);
  return sys_cpu_info_.GetSysCpuUtil(start_index, end_index, result);
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampler.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampler.h
@@ -0,0 +1,169 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_

 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <nlohmann/json.hpp>
 #include "minddata/dataset/engine/perf/profiling.h"
 #include "minddata/dataset/engine/datasetops/dataset_op.h"

 namespace mindspore {
 namespace dataset {

 class ExecutionTree;

 typedef struct SystemStat_s {
  uint64_t user_stat;
  uint64_t sys_stat;
  uint64_t io_stat;
  uint64_t idle_stat;
  uint64_t total_stat;
 } SystemStat;

 typedef struct SystemUtil_s {
  uint8_t user_utilization;
  uint8_t sys_utilization;
  uint8_t io_utilization;
  uint8_t idle_utilization;
 } SystemUtil;

 typedef struct TaskStat_s {
  uint64_t user_stat;
  uint64_t sys_stat;
 } TaskStat;

 struct TaskUtil_s {
  float user_utilization;
  float sys_utilization;
 };

 typedef struct TaskUtil_s TaskUtil;
 typedef struct TaskUtil_s OpUtil;

 class SystemCpuInfo {
 public:
  SystemCpuInfo() : first_sample_(true), prev_context_switch_count_(0) {}
  // Read in current stats and return previous and currently read stats
  Status SampleAndGetCurrPrevStat(SystemStat *current_stat, SystemStat *previous_stat);
  static int32_t num_cpu_;
  const std::vector<uint32_t> &GetRunningProcess() const { return running_process_; }
  const std::vector<uint64_t> &GetContextSwitchCount() const { return context_switch_count_; }
  Status GetUserCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const;
  Status GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint8_t> *result) const;
  std::vector<uint8_t> GetIOCpuUtil() const;
  std::vector<uint8_t> GetIdleCpuUtil() const;

 private:
  Status ParseCpuInfo(const std::string &str);
  Status ParseCtxt(const std::string &str);
  Status ParseRunningProcess(const std::string &str);
  SystemStat prev_sys_stat_{};                  // last read data /proc/stat file
  std::vector<SystemUtil> sys_cpu_util_;        // vector of system cpu utilization
  std::vector<uint32_t> running_process_;       // vector of running processes in system
  std::vector<uint64_t> context_switch_count_;  // vector of number of context switches between two sampling points
  bool first_sample_;                           // flag to indicate first time sampling
  uint64_t prev_context_switch_count_;          // last read context switch count from /proc/stat file
 };

 class TaskCpuInfo {
 public:
  explicit TaskCpuInfo(pid_t pid) : pid_(pid), first_sample_(true), last_sampling_failed_(false) {}
  virtual ~TaskCpuInfo() = default;
  virtual Status Sample(uint64_t total_time_elapsed) = 0;
  virtual pid_t GetId() = 0;
  TaskUtil GetLatestCpuUtil() const;
  std::vector<uint16_t> GetSysCpuUtil() const;
  std::vector<uint16_t> GetUserCpuUtil() const;

 protected:
  pid_t pid_;
  TaskStat prev_task_stat_;
  std::vector<TaskUtil> task_cpu_util_;
  bool first_sample_;
  bool last_sampling_failed_;
 };

 class ProcessCpuInfo : public TaskCpuInfo {
 public:
  explicit ProcessCpuInfo(pid_t pid) : TaskCpuInfo(pid) {}
  ~ProcessCpuInfo() override = default;
  Status Sample(uint64_t total_time_elapsed) override;
  pid_t GetId() override { return pid_; }
 };

 class ThreadCpuInfo : public TaskCpuInfo {
 public:
  explicit ThreadCpuInfo(pid_t pid, pid_t tid) : TaskCpuInfo(pid), tid_(tid) {}
  ~ThreadCpuInfo() override = default;
  Status Sample(uint64_t total_time_elapsed) override;
  pid_t GetId() override { return tid_; }

 private:
  pid_t tid_;
 };

 class MDOperatorCpuInfo {
 public:
  void AddTask(const std::shared_ptr<TaskCpuInfo> &task_ptr);
  bool TaskExists(pid_t id) const;
  explicit MDOperatorCpuInfo(const int32_t op_id) : id_(op_id) {}
  void CalculateOperatorUtilization();
  Status GetUserCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint16_t> *result) const;
  Status GetSysCpuUtil(uint64_t start_index, uint64_t end_index, std::vector<uint16_t> *result) const;

 private:
  int32_t id_;
  // tid is key for threadinfo, pid is key for processinfo
  std::unordered_map<pid_t, std::shared_ptr<TaskCpuInfo>> task_by_id_;
  std::vector<OpUtil> op_cpu_util_;
 };

 class CpuSampler : public Sampling {
  using Timestamps = std::vector<uint64_t>;

 public:
  explicit CpuSampler(ExecutionTree *tree) : fetched_all_python_multiprocesses_(false), tree(tree) {}
  ~CpuSampler() = default;
  Status Sample() override;
  Status Init(const std::string &dir_path, const std::string &device_id) override;
  Status ChangeFileMode() override;
  Status SaveToFile() override;
  std::string Name() const override { return kCpuSamplerName; }
  Status Analyze() override;
  Status GetSystemUserCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
  Status GetSystemSysCpuUtil(uint64_t start_ts, uint64_t end_ts, std::vector<uint8_t> *result);
  Status GetOpUserCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);
  Status GetOpSysCpuUtil(int32_t op_id, uint64_t start_ts, uint64_t end_ts, std::vector<uint16_t> *result);

 private:
  Status UpdateTaskList();
  bool fetched_all_python_multiprocesses_{};
  ExecutionTree *tree = nullptr;
  pid_t main_pid_{};
  Timestamps ts_;
  SystemCpuInfo sys_cpu_info_;                       // store the system cpu utilization
  std::vector<std::shared_ptr<TaskCpuInfo>> tasks_;  // vector of all process and thread tasks
  std::shared_ptr<ThreadCpuInfo> main_thread_cpu_info_;
  std::shared_ptr<ProcessCpuInfo> main_process_cpu_info_;
  std::unordered_map<int32_t, MDOperatorCpuInfo> op_info_by_id_;
 };
 }  // namespace dataset
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_ENGINE_PERF_CPU_SAMPLER_H_
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.cc
@@ -1,699 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "minddata/dataset/engine/perf/cpu_sampling.h"

 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #include <sys/syscall.h>
 #endif
 #include <cmath>
 #include <cstdio>
 #include <algorithm>
 #include <fstream>
 #include <memory>
 #include <string>

 #include "minddata/dataset/api/python/pybind_conversion.h"
 #include "minddata/dataset/core/config_manager.h"
 #include "minddata/dataset/engine/execution_tree.h"
 #include "minddata/dataset/util/path.h"

 using json = nlohmann::json;
 namespace mindspore {
 namespace dataset {
 bool BaseCpu::fetched_all_process_shared_ = false;
 std::unordered_map<int32_t, std::vector<pid_t>> BaseCpu::op_process_shared_ = {};

 #if !defined(_WIN32) && !defined(_WIN64) && !defined(__ANDROID__) && !defined(ANDROID) && !defined(__APPLE__)
 #define USING_LINUX
 #endif

 BaseCpu::BaseCpu() {
  pre_cpu_stat_.user_stat_ = 0;
  pre_cpu_stat_.sys_stat_ = 0;
  pre_cpu_stat_.io_stat_ = 0;
  pre_cpu_stat_.idle_stat_ = 0;
  pre_cpu_stat_.total_stat_ = 0;
  fetched_all_process_ = false;
  pre_fetched_state_ = false;
  cpu_processor_num_ = 0;
 }

 Status BaseCpu::GetTotalCpuTime(uint64_t *total_stat) {
  std::ifstream file("/proc/stat");
  if (!file.is_open()) {
    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
    return Status::OK();
  }
  std::string str;
  getline(file, str);
  uint64_t user = 0, sys = 0, idle = 0, iowait = 0, nice = 0, irq = 0, softirq = 0;
  if (sscanf_s(str.c_str(), "%*s %lu %lu %lu %lu %lu %lu %lu", &user, &nice, &sys, &idle, &iowait, &irq, &softirq) ==
      EOF) {
    file.close();
    return Status(StatusCode::kMDUnexpectedError, "Get device CPU failed.");
  }
  file.close();
  *total_stat = user + nice + sys + idle + iowait + irq + softirq;

  return Status::OK();
 }

 Status DeviceCpu::ParseCpuInfo(const std::string &str) {
  CpuStat cpu_stat;
  uint64_t nice = 0;
  uint64_t irq = 0;
  uint64_t softirq = 0;
  if (sscanf_s(str.c_str(), "%*s %lu %lu %lu %lu %lu %lu %lu", &cpu_stat.user_stat_, &nice, &cpu_stat.sys_stat_,
               &cpu_stat.idle_stat_, &cpu_stat.io_stat_, &irq, &softirq) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get device CPU failed.");
  }

  cpu_stat.total_stat_ =
    cpu_stat.user_stat_ + nice + cpu_stat.sys_stat_ + cpu_stat.idle_stat_ + cpu_stat.io_stat_ + irq + softirq;
  // Calculate the utilization from the second sampling
  if (!first_collect_) {
    CpuUtil info;
    info.user_utilization_ = round((cpu_stat.user_stat_ - pre_cpu_stat_.user_stat_) * 1.0 /
                                   (cpu_stat.total_stat_ - pre_cpu_stat_.total_stat_) * 100);
    info.sys_utilization_ = round((cpu_stat.sys_stat_ - pre_cpu_stat_.sys_stat_) * 1.0 /
                                  (cpu_stat.total_stat_ - pre_cpu_stat_.total_stat_) * 100);
    info.io_utilization_ = round((cpu_stat.io_stat_ - pre_cpu_stat_.io_stat_) * 1.0 /
                                 (cpu_stat.total_stat_ - pre_cpu_stat_.total_stat_) * 100);
    info.idle_utilization_ = round((cpu_stat.idle_stat_ - pre_cpu_stat_.idle_stat_) * 1.0 /
                                   (cpu_stat.total_stat_ - pre_cpu_stat_.total_stat_) * 100);
    cpu_util_.emplace_back(info);
  }
  pre_cpu_stat_.user_stat_ = cpu_stat.user_stat_;
  pre_cpu_stat_.sys_stat_ = cpu_stat.sys_stat_;
  pre_cpu_stat_.io_stat_ = cpu_stat.io_stat_;
  pre_cpu_stat_.idle_stat_ = cpu_stat.idle_stat_;
  pre_cpu_stat_.total_stat_ = cpu_stat.total_stat_;

  return Status::OK();
 }

 Status DeviceCpu::ParseCtxt(const std::string &str) {
  uint64_t ctxt;
  if (sscanf_s(str.c_str(), "%*s %lu", &ctxt) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get context switch count failed.");
  }
  // Calculate the utilization from the second sampling
  if (!first_collect_) {
    context_switch_count_.push_back(ctxt - pre_context_switch_count_);
  }
  pre_context_switch_count_ = ctxt;
  return Status::OK();
 }

 Status DeviceCpu::ParseRunningProcess(const std::string &str) {
  uint32_t running_process;
  if (sscanf_s(str.c_str(), "%*s %ud", &running_process) == EOF) {
    return Status(StatusCode::kMDUnexpectedError, "Get context switch count failed.");
  }
  // Drop the first value in order to collect same amount of CPU utilization
  if (!first_collect_) {
    running_process_.push_back(running_process);
  }

  return Status::OK();
 }

 Status DeviceCpu::Collect(const ExecutionTree *tree) {
  std::ifstream file("/proc/stat");
  if (!file.is_open()) {
    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
    return Status::OK();
  }
  bool first_line = true;
  std::string line;
  while (getline(file, line)) {
    if (first_line) {
      first_line = false;
      RETURN_IF_NOT_OK(ParseCpuInfo(line));
    }
    if (line.find("ctxt") != std::string::npos) {
      RETURN_IF_NOT_OK(ParseCtxt(line));
    }
    if (line.find("procs_running") != std::string::npos) {
      RETURN_IF_NOT_OK(ParseRunningProcess(line));
    }
  }
  file.close();

  first_collect_ = false;
  return Status::OK();
 }
 Status DeviceCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
  RETURN_UNEXPECTED_IF_NULL(name);
  name->clear();
  name->append("device_info");
  int total_samples = cpu_util_.size();
  int sum = 0;
  // Only analyze the middle half of the samples
  // Starting and ending may be impacted by startup or ending pipeline activities
  int start_analyze = total_samples / 4;
  int end_analyze = total_samples - start_analyze;

  for (int i = start_analyze; i < end_analyze; i++) {
    sum += cpu_util_[i].user_utilization_;
    sum += cpu_util_[i].sys_utilization_;
  }

  // Note device utilization is already in range of 0-1, so don't
  // need to divide by number of CPUS
  if ((end_analyze - start_analyze) > 0) {
    *utilization = sum / (end_analyze - start_analyze);
  }
  return Status::OK();
 }

 Status DeviceCpu::SaveToFile(const std::string &file_path) {
  Path path = Path(file_path);
  json output;
  if (path.Exists()) {
    MS_LOG(DEBUG) << file_path << " exists already";
    try {
      std::ifstream file(file_path);
      file >> output;
    } catch (const std::exception &err) {
      RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path +
                               ", please delete it and try again!");
    }
  } else {
    output["sampling_interval"] = GlobalContext::config_manager()->monitor_sampling_interval();
  }

  std::vector<int8_t> user_util;
  std::transform(cpu_util_.begin(), cpu_util_.end(), std::back_inserter(user_util),
                 [&](const CpuUtil &info) { return info.user_utilization_; });
  std::vector<int8_t> sys_util;
  std::transform(cpu_util_.begin(), cpu_util_.end(), std::back_inserter(sys_util),
                 [&](const CpuUtil &info) { return info.sys_utilization_; });
  std::vector<int8_t> io_util;
  std::transform(cpu_util_.begin(), cpu_util_.end(), std::back_inserter(io_util),
                 [&](const CpuUtil &info) { return info.io_utilization_; });
  std::vector<int8_t> idle_util;
  std::transform(cpu_util_.begin(), cpu_util_.end(), std::back_inserter(idle_util),
                 [&](const CpuUtil &info) { return info.idle_utilization_; });

  output["device_info"] = {{"user_utilization", user_util},
                           {"sys_utilization", sys_util},
                           {"io_utilization", io_util},
                           {"idle_utilization", idle_util},
                           {"runable_processes", running_process_},
                           {"context_switch_count", context_switch_count_}};

  // Discard the content of the file when opening.
  std::ofstream os(file_path, std::ios::trunc);
  os << output;
  os.close();

  MS_LOG(INFO) << "Save device CPU success.";
  return Status::OK();
 }

 Status OperatorCpu::ParseCpuInfo(int32_t op_id, int64_t thread_id,
                                 std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> *op_stat) {
  RETURN_UNEXPECTED_IF_NULL(op_stat);
  pid_t pid = 0;
 #if defined(USING_LINUX)
  pid = syscall(SYS_getpid);
 #endif
  std::string stat_path = "/proc/" + std::to_string(pid) + "/task/" + std::to_string(thread_id) + "/stat";

  // Judge whether file exist first
  Path temp_path(stat_path);
  if (!temp_path.Exists()) {
    (*op_stat)[op_id][thread_id].user_stat_ = 0;
    (*op_stat)[op_id][thread_id].sys_stat_ = 0;
    return Status(StatusCode::kMDFileNotExist);
  }

  std::ifstream file(stat_path);
  if (!file.is_open()) {
    MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
    return Status::OK();
  }
  std::string str;
  getline(file, str);
  uint64_t utime;
  uint64_t stime;
  if (sscanf_s(str.c_str(), "%*d %*s %*s %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %lu %lu", &utime, &stime) ==
      EOF) {
    file.close();
    return Status(StatusCode::kMDUnexpectedError, "Get device CPU failed.");
  }
  file.close();
  (*op_stat)[op_id][thread_id].user_stat_ = utime;
  (*op_stat)[op_id][thread_id].sys_stat_ = stime;

  return Status::OK();
 }

 Status OperatorCpu::Collect(const ExecutionTree *tree) {
  RETURN_UNEXPECTED_IF_NULL(tree);
  if (first_collect_) {
    for (auto iter = tree->begin(); iter != tree->end(); ++iter) {
      id_count_++;
      op_name_[iter->id()] = iter->NameWithID();
      op_parallel_workers_[iter->id()] = iter->NumWorkers();
    }
 #if defined(USING_LINUX)
    cpu_processor_num_ = get_nprocs_conf();
 #endif
  }

  // Obtain the op and thread mapping
  op_thread_.clear();
  List<Task> allTasks = tree->AllTasks()->GetTask();
  for (auto &task1 : allTasks) {
    int32_t op_id = task1.get_operator_id();
    op_thread_[op_id].emplace_back(task1.get_linux_id());
  }

  // add process id into op_thread
  if (!fetched_all_process_) {
    {
      py::gil_scoped_acquire gil_acquire;
      py::module ds = py::module::import("mindspore.dataset.engine.datasets");
      py::tuple process_info = ds.attr("_get_operator_process")();
      py::dict sub_process = py::reinterpret_borrow<py::dict>(process_info[0]);
      fetched_all_process_ = py::reinterpret_borrow<py::bool_>(process_info[1]);
      // parse dict value
      op_process_ = toIntMap(sub_process);
      BaseCpu::op_process_shared_ = op_process_;
      BaseCpu::fetched_all_process_shared_ = fetched_all_process_;
    }

    // judge whether there is device_que operator, if so operator id may need increase by one, temp use directly
    for (auto item : op_process_) {
      if (!item.second.empty()) {
        if (op_thread_.find(item.first) != op_thread_.end()) {
          op_thread_[item.first].insert(op_thread_[item.first].end(), item.second.begin(), item.second.end());
        } else {
          op_thread_[item.first] = item.second;
        }
      }
    }
  }

  uint64_t total_stat_;
  RETURN_IF_NOT_OK(GetTotalCpuTime(&total_stat_));
  std::vector<CpuOpUtil> cpu_step_util_;
  std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> op_stat_;

  if (!first_collect_) {
    // obtain all the op id in current tasks
    std::vector<int32_t> total_op_id;
    (void)std::transform(op_thread_.begin(), op_thread_.end(), std::back_inserter(total_op_id),
                         [](const auto &iter) { return iter.first; });

    // iter all the op, and obtain the CPU utilization of each operator
    for (auto op_id = -1; op_id < id_count_; op_id++) {
      float user_util = 0, sys_util = 0;
      auto iter = std::find(total_op_id.begin(), total_op_id.end(), op_id);
      if (iter != total_op_id.end()) {
        for (auto thread_id : op_thread_[op_id]) {
          if (ParseCpuInfo(op_id, thread_id, &op_stat_) == Status::OK()) {
            user_util += (op_stat_[op_id][thread_id].user_stat_ - pre_op_stat_[op_id][thread_id].user_stat_) * 1.0 /
                         (total_stat_ - pre_total_stat_) * 100;
            sys_util += (op_stat_[op_id][thread_id].sys_stat_ - pre_op_stat_[op_id][thread_id].sys_stat_) * 1.0 /
                        (total_stat_ - pre_total_stat_) * 100;
          }
        }
      }
      CpuOpUtil info;
      info.op_id_ = op_id;
      info.sys_utilization_ = sys_util;
      info.user_utilization_ = user_util;
      cpu_step_util_.emplace_back(info);
    }
    cpu_op_util_.emplace_back(cpu_step_util_);
  } else {
    // mainly obtain the init CPU execute time in first collect
    for (const auto &iter : op_thread_) {
      int32_t op_id = iter.first;
      for (auto thread_id_ : iter.second) {
        // ParseCpuInfo may execute failed for cpu data not ready, but we still get next thread cpu info
        (void)ParseCpuInfo(op_id, thread_id_, &op_stat_);
      }
    }
  }

  // copy current op_stat into pre_op_stat
  pre_op_stat_ = op_stat_;
  pre_total_stat_ = total_stat_;

  first_collect_ = false;
  return Status::OK();
 }

 Status OperatorCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
  RETURN_UNEXPECTED_IF_NULL(name);
  RETURN_UNEXPECTED_IF_NULL(extra_message);
  int total_samples = cpu_op_util_.size();

  // Only analyze the middle half of the samples
  // Starting and ending may be impacted by startup or ending pipeline activities
  constexpr int64_t sample_sections = 4;
  int64 start_analyze = total_samples / sample_sections;
  int64 end_analyze = total_samples - start_analyze;
  double op_util = 0;
  *utilization = 0;

  // start loop from 0 was as don't want to analyze op -1
  for (auto op_id = 0; op_id < id_count_; op_id++) {
    int64 sum = 0;
    int64 index = op_id + 1;
    for (int i = start_analyze; i < end_analyze; i++) {
      sum += cpu_op_util_[i][index].user_utilization_;
      sum += cpu_op_util_[i][index].sys_utilization_;
    }
    if ((end_analyze - start_analyze) > 0) {
      op_util = 1.0 * sum * cpu_processor_num_ / (op_parallel_workers_[op_id] * (end_analyze - start_analyze));
    }
    if (op_util > *utilization) {
      *utilization = op_util;
      name->clear();
      (void)name->append(op_name_[op_id]);
    }
    (void)extra_message->append(op_name_[op_id] + " utilization per thread: " + std::to_string(op_util) + "% (" +
                                std::to_string(op_parallel_workers_[op_id]) + " parallel_workers); ");
  }
  return Status::OK();
 }

 Status OperatorCpu::SaveToFile(const std::string &file_path) {
  Path path = Path(file_path);
  json output;
  if (path.Exists()) {
    MS_LOG(DEBUG) << file_path << "already exist.";
    try {
      std::ifstream file(file_path);
      file >> output;
    } catch (const std::exception &err) {
      RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path +
                               ", please delete it and try again!");
    }
  }

  uint8_t index = 0;
  json OpWriter;
  for (auto op_id = -1; op_id < id_count_; op_id++) {
    std::vector<uint16_t> user_util;
    std::vector<uint16_t> sys_util;
    std::transform(
      cpu_op_util_.begin(), cpu_op_util_.end(), std::back_inserter(user_util),
      [&](const std::vector<CpuOpUtil> &info) { return int16_t(info[index].user_utilization_ * cpu_processor_num_); });
    std::transform(
      cpu_op_util_.begin(), cpu_op_util_.end(), std::back_inserter(sys_util),
      [&](const std::vector<CpuOpUtil> &info) { return int16_t(info[index].sys_utilization_ * cpu_processor_num_); });

    json per_op_info = {{"metrics", {{"user_utilization", user_util}, {"sys_utilization", sys_util}}},
                        {"op_id", op_id}};
    OpWriter.emplace_back(per_op_info);
    index++;
  }
  output["op_info"] = OpWriter;

  // Discard the content of the file when opening.
  std::ofstream os(file_path, std::ios::trunc);
  os << output;
  os.close();

  MS_LOG(INFO) << "Save device CPU success.";
  return Status::OK();
 }

 Status ProcessCpu::ParseCpuInfo() {
  uint64_t total_stat_;
  RETURN_IF_NOT_OK(GetTotalCpuTime(&total_stat_));

  if (!pre_fetched_state_) {
    process_id_.clear();
    pid_t main_pid = 0;
 #if defined(USING_LINUX)
    main_pid = syscall(SYS_getpid);
 #endif
    process_id_.emplace_back(main_pid);
    op_process_ = BaseCpu::op_process_shared_;
    fetched_all_process_ = BaseCpu::fetched_all_process_shared_;
    for (const auto &item : op_process_) {
      for (const auto &id : item.second) {
        process_id_.emplace_back(id);
      }
    }
  }

  float user_util = 0, sys_util = 0;
  for (const auto &pid : process_id_) {
    std::string stat_path = "/proc/" + std::to_string(pid) + "/stat";

    std::ifstream file(stat_path);
    if (!file.is_open()) {
      MS_LOG(INFO) << "Open CPU file failed when collect CPU information";
      continue;
    }
    std::string str;
    getline(file, str);
    uint64_t user = 0, sys = 0;
    if (sscanf_s(str.c_str(), "%*d %*s %*s %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %lu %lu", &user, &sys) ==
        EOF) {
      file.close();
      return Status(StatusCode::kMDUnexpectedError, "Get device CPU failed.");
    }
    file.close();

    // Calculate the utilization from the second sampling
    if (!first_collect_ && (pre_process_stat_.find(pid) != pre_process_stat_.end())) {
      user_util += (user - pre_process_stat_[pid].user_stat_) * 1.0 / (total_stat_ - pre_total_stat_) * 100;
      sys_util += (sys - pre_process_stat_[pid].sys_stat_) * 1.0 / (total_stat_ - pre_total_stat_) * 100;
    }
    pre_process_stat_[pid].user_stat_ = user;
    pre_process_stat_[pid].sys_stat_ = sys;
  }
  if (!first_collect_) {
    CpuProcessUtil info;
    info.user_utilization_ = user_util;
    info.sys_utilization_ = sys_util;
    process_util_.emplace_back(info);
  }
  pre_total_stat_ = total_stat_;
  first_collect_ = false;
  pre_fetched_state_ = fetched_all_process_;
  return Status::OK();
 }

 Status ProcessCpu::Collect(const ExecutionTree *tree) {
  RETURN_UNEXPECTED_IF_NULL(tree);
  if (first_collect_) {
 #if defined(USING_LINUX)
    cpu_processor_num_ = get_nprocs_conf();
 #endif
  }
  RETURN_IF_NOT_OK(ParseCpuInfo());

  return Status::OK();
 }

 Status ProcessCpu::Analyze(std::string *name, double *utilization, std::string *extra_message) {
  RETURN_UNEXPECTED_IF_NULL(name);
  RETURN_UNEXPECTED_IF_NULL(utilization);
  RETURN_UNEXPECTED_IF_NULL(extra_message);
  name->clear();
  name->append("process_info");
  int total_samples = process_util_.size();
  int64 sum = 0;
  // Only analyze the middle half of the samples
  // Starting and ending may be impacted by startup or ending pipeline activities
  constexpr int64_t sample_sections = 4;
  int64 start_analyze = total_samples / sample_sections;
  int64 end_analyze = total_samples - start_analyze;

  for (int i = start_analyze; i < end_analyze; i++) {
    sum += process_util_[i].user_utilization_;
    sum += process_util_[i].sys_utilization_;
  }

  if ((end_analyze - start_analyze) > 0) {
    *utilization = sum / (end_analyze - start_analyze);
  }
  return Status::OK();
 }

 Status ProcessCpu::SaveToFile(const std::string &file_path) {
  Path path = Path(file_path);
  json output;
  if (path.Exists()) {
    MS_LOG(DEBUG) << file_path << "already exist.";
    try {
      std::ifstream file(file_path);
      file >> output;
    } catch (const std::exception &err) {
      RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path +
                               ", please delete it and try again!");
    }
  } else {
    output["sampling_interval"] = GlobalContext::config_manager()->monitor_sampling_interval();
  }

  std::vector<int16_t> user_util;
  std::transform(process_util_.begin(), process_util_.end(), std::back_inserter(user_util),
                 [&](const CpuProcessUtil &info) { return uint16_t(info.user_utilization_ * cpu_processor_num_); });
  std::vector<int16_t> sys_util;
  std::transform(process_util_.begin(), process_util_.end(), std::back_inserter(sys_util),
                 [&](const CpuProcessUtil &info) { return uint16_t(info.sys_utilization_ * cpu_processor_num_); });

  output["process_info"] = {{"user_utilization", user_util}, {"sys_utilization", sys_util}};
  output["cpu_processor_num"] = cpu_processor_num_;

  // Discard the content of the file when opening.
  std::ofstream os(file_path, std::ios::trunc);
  os << output;
  os.close();

  MS_LOG(INFO) << "Save process CPU success.";
  return Status::OK();
 }

 Status CpuSampling::CollectTimeStamp() {
  time_stamp_.emplace_back(ProfilingTime::GetCurMilliSecond());
  return Status::OK();
 }

 // Sample action
 Status CpuSampling::Sample() {
  // Collect cpu information
  for (auto cpu : cpu_) {
    RETURN_IF_NOT_OK(cpu->Collect(this->tree_));
  }

  // Collect time stamp
  RETURN_IF_NOT_OK(CollectTimeStamp());
  return Status::OK();
 }

 Status CpuSampling::SaveTimeStampToFile() {
  // Save time stamp to json file
  // If the file is already exist, simply add the data to corresponding field.
  Path path = Path(file_path_);
  json output;
  if (path.Exists()) {
    try {
      std::ifstream file(file_path_);
      file >> output;
    } catch (const std::exception &err) {
      RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path_ +
                               ", please delete it and try again!");
    }
  }
  output["time_stamp"] = time_stamp_;
  std::ofstream os(file_path_, std::ios::trunc);
  os << output;
  os.close();

  return Status::OK();
 }

 Status CpuSampling::SaveSamplingItervalToFile() {
  // If the file is already exist, simply add the data to corresponding field.
  Path path = Path(file_path_);
  json output;
  if (path.Exists()) {
    try {
      std::ifstream file(file_path_);
      file >> output;
    } catch (const std::exception &err) {
      RETURN_STATUS_UNEXPECTED("Invalid file, failed to open json file: " + file_path_ +
                               ", please delete it and try again!");
    }
  }
  output["sampling_interval"] = GlobalContext::config_manager()->monitor_sampling_interval();
  std::ofstream os(file_path_, std::ios::trunc);
  os << output;
  os.close();

  return Status::OK();
 }

 // Analyze profiling data and output warning messages
 Status CpuSampling::Analyze() {
  std::string name;
  double utilization = 0;
  constexpr double total_cpu_thold = 90;
  constexpr double op_cpu_thold = 80;
  // Keep track of specific information returned by differentn CPU sampling types
  double total_utilization = 0;
  double max_op_utilization = 0;
  std::string max_op_name;
  std::string detailed_op_cpu_message;

  // Save cpu information to json file
  for (auto cpu : cpu_) {
    std::string extra_message;
    RETURN_IF_NOT_OK(cpu->Analyze(&name, &utilization, &extra_message));
    if (name == "device_info") {
      total_utilization = utilization;
    } else if (name != "process_info") {
      max_op_utilization = utilization;
      max_op_name = name;
      detailed_op_cpu_message = extra_message;
    }
  }
  if ((total_utilization < total_cpu_thold) && (max_op_utilization > op_cpu_thold)) {
    MS_LOG(WARNING) << "Operator " << max_op_name << " is using " << max_op_utilization << "% CPU per thread.  "
                    << "This operator may benefit from increasing num_parallel_workers."
                    << "Full Operator CPU utiliization for all operators: " << detailed_op_cpu_message << std::endl;
  }
  return Status::OK();
 }

 // Save profiling data to file
 Status CpuSampling::SaveToFile() {
  // Save time stamp to json file
  RETURN_IF_NOT_OK(SaveTimeStampToFile());

  // Save time stamp to json file
  RETURN_IF_NOT_OK(SaveSamplingItervalToFile());

  // Save cpu information to json file
  for (auto cpu : cpu_) {
    RETURN_IF_NOT_OK(cpu->SaveToFile(file_path_));
  }

  return Status::OK();
 }

 Status CpuSampling::Init(const std::string &dir_path, const std::string &device_id) {
  file_path_ = (Path(dir_path) / Path("minddata_cpu_utilization_" + device_id + ".json")).ToString();
  std::shared_ptr<DeviceCpu> device_cpu = std::make_shared<DeviceCpu>();
  std::shared_ptr<OperatorCpu> operator_cpu = std::make_shared<OperatorCpu>();
  std::shared_ptr<ProcessCpu> process_cpu = std::make_shared<ProcessCpu>();
  cpu_.push_back(device_cpu);
  cpu_.push_back(operator_cpu);
  cpu_.push_back(process_cpu);
  return Status::OK();
 }

 Status CpuSampling::ChangeFileMode() {
  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
  }
  return Status::OK();
 }
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/cpu_sampling.h
@@ -1,210 +0,0 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H
 #define MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H

 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include <nlohmann/json.hpp>
 #include "minddata/dataset/engine/perf/profiling.h"
 #include "minddata/dataset/engine/datasetops/dataset_op.h"

 namespace mindspore {
 namespace dataset {
 class ExecutionTree;

 // CPU information from /proc/stat or /proc/pid/stat file
 typedef struct CpuStat_s {
  uint64_t user_stat_;
  uint64_t sys_stat_;
  uint64_t io_stat_;
  uint64_t idle_stat_;
  uint64_t total_stat_;
 } CpuStat;

 // Cpu utilization
 typedef struct CpuInfo_s {
  uint8_t user_utilization_;
  uint8_t sys_utilization_;
  uint8_t io_utilization_;
  uint8_t idle_utilization_;
 } CpuUtil;

 // CPU utilization of operator
 typedef struct CpuOpInfo_s {
  float user_utilization_;
  float sys_utilization_;
  int32_t op_id_;
 } CpuOpUtil;

 // CPU utilization of process
 typedef struct CpuProcessInfo_s {
  float user_utilization_;
  float sys_utilization_;
 } CpuProcessUtil;

 // CPU stat of operator
 typedef struct CpuOpStat_s {
  uint64_t user_stat_;
  uint64_t sys_stat_;
 } CpuOpStat;

 class BaseCpu {
 public:
  BaseCpu();
  ~BaseCpu() = default;
  // Collect CPU information
  virtual Status Collect(const ExecutionTree *tree) = 0;
  virtual Status SaveToFile(const std::string &file_path) = 0;
  virtual Status Analyze(std::string *name, double *utilization, std::string *extra_message) = 0;
  // Get the total CPU time of device
  Status GetTotalCpuTime(uint64_t *total_stat);

 protected:
  std::vector<CpuUtil> cpu_util_;
  CpuStat pre_cpu_stat_;
  static bool fetched_all_process_shared_;
  static std::unordered_map<int32_t, std::vector<pid_t>> op_process_shared_;
  bool fetched_all_process_;
  bool pre_fetched_state_;
  std::unordered_map<int32_t, std::vector<pid_t>> op_process_;
  int32_t cpu_processor_num_;
 };

 // Collect device CPU information
 class DeviceCpu : public BaseCpu {
 public:
  DeviceCpu() : pre_running_process_(0), pre_context_switch_count_(0), first_collect_(true) {}
  ~DeviceCpu() = default;
  Status Collect(const ExecutionTree *tree) override;
  Status SaveToFile(const std::string &file_path) override;
  Status Analyze(std::string *name, double *utilization, std::string *extra_message) override;

 private:
  // Get CPU information, include use/sys/idle/io utilization
  Status ParseCpuInfo(const std::string &str);

  // Get context switch count
  Status ParseCtxt(const std::string &str);

  // Get running process count
  Status ParseRunningProcess(const std::string &str);

  std::vector<uint32_t> running_process_;
  std::vector<uint64_t> context_switch_count_;
  uint32_t pre_running_process_;
  uint64_t pre_context_switch_count_;
  bool first_collect_;
 };

 // Collect operator CPU information
 class OperatorCpu : public BaseCpu {
 public:
  OperatorCpu() : first_collect_(true), pre_total_stat_(0), id_count_(0) {}
  ~OperatorCpu() = default;
  Status Collect(const ExecutionTree *tree) override;
  Status SaveToFile(const std::string &file_path) override;
  // Analyze will output the name of the metric, the avg utiliization of highest
  // object within the class and any extra message that would be useful for the user.
  // The Higher level CPUSampling class will combine information from different classes
  // to decide if warning should be output.
  Status Analyze(std::string *name, double *utilization, std::string *extra_message) override;

 private:
  // Get cpu information, include use/sys/idle/io utilization
  Status ParseCpuInfo(int32_t op_id, int64_t thread_id,
                      std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> *op_stat);

  // Store the CPU utilization of each operator
  std::vector<std::vector<CpuOpUtil>> cpu_op_util_;

  bool first_collect_;

  // Store the id and its corresponding threads.
  std::unordered_map<int32_t, std::vector<pid_t>> op_thread_;
  std::unordered_map<int32_t, std::string> op_name_;
  std::unordered_map<int32_t, int32_t> op_parallel_workers_;
  std::unordered_map<int32_t, std::unordered_map<int64_t, CpuOpStat>> pre_op_stat_;
  uint64_t pre_total_stat_;
  int32_t id_count_;
 };

 // Collect operator CPU information
 class ProcessCpu : public BaseCpu {
 public:
  ProcessCpu() : first_collect_(true), pre_total_stat_(0) {}
  ~ProcessCpu() = default;
  Status Collect(const ExecutionTree *tree) override;
  Status SaveToFile(const std::string &file_path) override;
  Status Analyze(std::string *name, double *utilization, std::string *extra_message) override;

 private:
  // Get CPU information, include use/sys/idle/io utilization
  Status ParseCpuInfo();

  bool first_collect_;
  std::vector<CpuProcessUtil> process_util_;
  uint64_t pre_total_stat_;
  std::unordered_map<int64_t, CpuOpStat> pre_process_stat_;
  std::vector<pid_t> process_id_;
 };

 // Sampling CPU information
 // It support JSON serialization for external usage.
 class CpuSampling : public Sampling {
  using TimeStamp = std::vector<uint32_t>;

 public:
  explicit CpuSampling(ExecutionTree *tree) : tree_(tree) {}

  ~CpuSampling() = default;

  // Driver function for CPU sampling.
  // This function samples the CPU information of device/process/op
  Status Sample() override;

  std::string Name() const override { return kCpuSamplingName; }

  // Save sampling data to file
  // @return Status - The error code return
  Status SaveToFile() override;

  Status Init(const std::string &dir_path, const std::string &device_id) override;

  // Change file mode after save CPU data
  Status ChangeFileMode() override;

  // Analyze sampling data and print message to log
  Status Analyze() override;

 private:
  Status CollectTimeStamp();

  Status SaveTimeStampToFile();

  Status SaveSamplingItervalToFile();

  ExecutionTree *tree_ = nullptr;              // ExecutionTree pointer
  std::vector<std::shared_ptr<BaseCpu>> cpu_;  // CPU information of device/process/op
  TimeStamp time_stamp_;                       // Time stamp
 };

 }  // namespace dataset
 }  // namespace mindspore

 #endif  // MINDSPORE_CCSRC_MINDDATA_DATASET_CPU_SAMPLING_H
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/dataset_iterator_tracing.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/dataset_iterator_tracing.cc
@@ -13,49 +13,66 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include <sys/stat.h>
 #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h"
 #include <fstream>
 #include <string>
 #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h"
 #ifndef ENABLE_ANDROID
 #include "utils/log_adapter.h"
 #else
 #include "mindspore/lite/src/common/log_adapter.h"
 #endif
 #include "minddata/dataset/util/path.h"
 #include "mindspore/core/utils/ms_utils.h"

 namespace mindspore {
 namespace dataset {

 Status DatasetIteratorTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num,
                                      const int32_t value, const uint64_t time_stamp) {
  // Format: "type extra-info batch-num value"
  // type: 0: time,  1: connector size
  // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
  //             if type is 1 - connector capacity
  // batch-num: batch number
  // value: if type is 0 - value is time(ms)
  //        if type is 1 - value is connector size
  // Examples:
  // 0 0 20 10 - The 20th batch took 10ms to get data from pipeline.
  // 1 64 20 5 - Connector size is 5 when get the 20th batch.Connector capacity is 64.
  std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
                     std::to_string(value) + " " + std::to_string(time_stamp);
  value_.emplace_back(data);
  return Status::OK();
 }
 constexpr int32_t CONNECTOR_CAPACITY_OFFSET = 0;

 Status DatasetIteratorTracing::Init(const std::string &dir_path, const std::string &device_id) {
  file_path_ = (Path(dir_path) / Path("dataset_iterator_profiling_" + device_id + ".txt")).ToString();
  return Status::OK();
 }

 Status DatasetIteratorTracing::ChangeFileMode() {
  if (value_.empty()) {
    return Status::OK();
  }
 Status DatasetIteratorTracing::GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return {StatusCode::kMDUnexpectedError, "Dataset Iterator Tracing does not record pipeline time."};
 }

 Status DatasetIteratorTracing::GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return {StatusCode::kMDUnexpectedError, "Dataset Iterator Tracing does not record push time."};
 }

 Status DatasetIteratorTracing::GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return {StatusCode::kMDUnexpectedError, "Dataset Iterator Tracing does not record batch time."};
 }

  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
 Status DatasetIteratorTracing::GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntry(start_step, end_step, CONNECTOR_CAPACITY_OFFSET, result);
 }

 Status DatasetIteratorTracing::GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) {
  std::lock_guard<std::mutex> guard(lock_);
  auto total_steps = records_.size() / records_per_step_;
  MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= total_steps,
                               "Expected start_step <= total_steps. Got start_step: " + std::to_string(start_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(end_step <= total_steps,
                               "Expected end_step <= total_steps. Got end_step: " + std::to_string(end_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= end_step,
                               "Expected start_step <= end_step. Got start_step: " + std::to_string(start_step) +
                                 " end_step: " + std::to_string(end_step));

  uint32_t total = end_step - start_step + 1;
  uint32_t count = 0U;
  for (auto step_num = start_step; step_num <= end_step; step_num++) {
    auto idx = (step_num - 1) * records_per_step_ + CONNECTOR_CAPACITY_OFFSET;
    count += static_cast<uint32_t>(records_[idx].value == 0);
  }
  *empty_queue_freq = static_cast<float_t>(count) / static_cast<float_t>(total);
  return Status::OK();
 }

 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/dataset_iterator_tracing.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/dataset_iterator_tracing.h
@@ -23,24 +23,24 @@

 namespace mindspore {
 namespace dataset {
 constexpr int32_t RECORDS_PER_STEP_DATASET_ITERATOR = 1;
 class DatasetIteratorTracing : public Tracing {
 public:
  // Constructor
  DatasetIteratorTracing() = default;
  DatasetIteratorTracing() : Tracing(RECORDS_PER_STEP_DATASET_ITERATOR) {}

  // Destructor
  ~DatasetIteratorTracing() override = default;

  // Record tracing data
  // @return Status The status code returned
  Status Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
                const uint64_t time_stamp);

  std::string Name() const override { return kDatasetIteratorTracingName; };

  Status Init(const std::string &dir_path, const std::string &device_id) override;

  Status ChangeFileMode() override;
  Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) override;
 };
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/device_queue_tracing.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/device_queue_tracing.cc
@@ -14,47 +14,67 @@
 * limitations under the License.
 */

 #include <sys/stat.h>
 #include "minddata/dataset/engine/perf/device_queue_tracing.h"
 #include <fstream>
 #include <string>
 #include "minddata/dataset/engine/perf/device_queue_tracing.h"
 #ifndef ENABLE_ANDROID
 #include "utils/log_adapter.h"
 #else
 #include "mindspore/lite/src/common/log_adapter.h"
 #endif
 #include "minddata/dataset/util/path.h"
 #include "mindspore/core/utils/ms_utils.h"

 namespace mindspore {
 namespace dataset {

 void DeviceQueueTracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num,
                                const int32_t value, const uint64_t time_stamp) {
  // Format: "type extra-info batch-num value"
  // type: 0: time,  1: connector size
  // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
  //             if type is 1 - connector capacity
  // batch-num: batch number
  // value: if type is 0 - value is time(ms)
  //        if type is 1 - value is connector size
  // time-stamp: time stamp
  // Examples:
  // 0 0 20 10 xxx- The 20th batch took 10ms to get data from pipeline.
  // 1 64 20 5 xxx- Connector size is 5 when get the 20th batch.Connector capacity is 64.
  std::string data = std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
                     std::to_string(value) + " " + std::to_string(time_stamp);
  value_.emplace_back(data);
 }
 constexpr int32_t PUSH_TIME_OFFSET = 0;
 constexpr int32_t BATCH_TIME_OFFSET = 1;
 constexpr int32_t PIPELINE_TIME_OFFSET = 2;
 constexpr int32_t CONNECTOR_CAPACITY_OFFSET = 3;

 Status DeviceQueueTracing::Init(const std::string &dir_path, const std::string &device_id) {
  file_path_ = (Path(dir_path) / Path("device_queue_profiling_" + device_id + ".txt")).ToString();
  return Status::OK();
 }

 Status DeviceQueueTracing::ChangeFileMode() {
  if (value_.empty()) {
    return Status::OK();
  }
 Status DeviceQueueTracing::GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntry(start_step, end_step, PIPELINE_TIME_OFFSET, result);
 }

 Status DeviceQueueTracing::GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntry(start_step, end_step, PUSH_TIME_OFFSET, result);
 }

 Status DeviceQueueTracing::GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntry(start_step, end_step, BATCH_TIME_OFFSET, result);
 }

 Status DeviceQueueTracing::GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) {
  return GetRecordEntry(start_step, end_step, CONNECTOR_CAPACITY_OFFSET, result);
 }

 Status DeviceQueueTracing::GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) {
  std::lock_guard<std::mutex> guard(lock_);
  auto total_steps = records_.size() / records_per_step_;
  MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= total_steps,
                               "Expected start_step <= total_steps. Got start_step: " + std::to_string(start_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(end_step <= total_steps,
                               "Expected end_step <= total_steps. Got end_step: " + std::to_string(end_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= end_step,
                               "Expected start_step <= end_step. Got start_step: " + std::to_string(start_step) +
                                 " end_step: " + std::to_string(end_step));

  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
  uint32_t total = end_step - start_step + 1;
  uint32_t count = 0U;
  for (auto step_num = start_step; step_num <= end_step; step_num++) {
    auto idx = (step_num - 1) * records_per_step_ + CONNECTOR_CAPACITY_OFFSET;
    count += static_cast<uint32_t>(records_[idx].value == 0);
  }
  *empty_queue_freq = static_cast<float_t>(count) / static_cast<float_t>(total);
  return Status::OK();
 }
 }  // namespace dataset
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/device_queue_tracing.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/device_queue_tracing.h
@@ -23,24 +23,24 @@

 namespace mindspore {
 namespace dataset {
 constexpr int32_t RECORDS_PER_STEP_DEVICE_QUEUE = 4;
 class DeviceQueueTracing : public Tracing {
 public:
  // Constructor
  DeviceQueueTracing() = default;
  DeviceQueueTracing() : Tracing(RECORDS_PER_STEP_DEVICE_QUEUE) {}

  // Destructor
  ~DeviceQueueTracing() override = default;

  // Record tracing data
  // @return Status The status code returned
  void Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
              const uint64_t time_stamp);

  std::string Name() const override { return kDeviceQueueTracingName; };

  Status Init(const std::string &dir_path, const std::string &device_id) override;

  Status ChangeFileMode() override;
  Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) override;
  Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) override;
 };
 }  // namespace dataset
 }  // namespace mindspore
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/monitor.cc
@@ -60,7 +60,6 @@ Status Monitor::operator()() {
  RETURN_IF_NOT_OK(tree_consumer_->GetProfilingManager()->Analyze());
  RETURN_IF_NOT_OK(tree_consumer_->GetProfilingManager()->SaveProfilingData());
  RETURN_IF_NOT_OK(tree_consumer_->GetProfilingManager()->ChangeFileMode());

  cfg->set_profiler_file_status(true);
  return Status::OK();
 }
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
 #include "minddata/dataset/engine/perf/profiling.h"
 #include <sys/stat.h>
 #include <cstdlib>
 #include <fstream>
 #include "utils/ms_utils.h"
@@ -25,9 +26,9 @@
 #include "minddata/dataset/engine/perf/monitor.h"
 #include "minddata/dataset/engine/perf/device_queue_tracing.h"
 #include "minddata/dataset/engine/perf/connector_size.h"
 #include "minddata/dataset/engine/perf/connector_throughput.h"
 #include "minddata/dataset/engine/perf/cpu_sampling.h"
 #include "minddata/dataset/engine/perf/cpu_sampler.h"
 #include "minddata/dataset/engine/perf/dataset_iterator_tracing.h"
 #include "minddata/dataset/engine/execution_tree.h"
 #include "minddata/dataset/util/log_adapter.h"

 namespace mindspore {
@@ -50,6 +51,63 @@ Status Tracing::SaveToFile() {
  return Status::OK();
 }

 Status Tracing::ChangeFileMode() {
  if (value_.empty()) {
    return Status::OK();
  }

  if (chmod(common::SafeCStr(file_path_), S_IRUSR | S_IWUSR) == -1) {
    std::string err_str = "Change file mode failed," + file_path_;
    return Status(StatusCode::kMDUnexpectedError, err_str);
  }
  return Status::OK();
 }

 void Tracing::Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
                     const uint64_t time_stamp) {
  // Format: "type extra-info batch-num value"
  // type: 0: time,  1: connector size
  // extra-info: if type is 0 - 0: pipeline time, 1: push tdt time, 2: batch time
  //             if type is 1 - connector capacity
  // batch-num: batch number
  // value: if type is 0 - value is time(ms)
  //        if type is 1 - value is connector size
  // time-stamp: time stamp
  // Examples:
  // 0 0 20 10 xxx- The 20th batch took 10ms to get data from pipeline.
  // 1 64 20 5 xxx- Connector size is 5 when get the 20th batch.Connector capacity is 64.
  TracingRecord record = {type, extra_info, batch_num, value, time_stamp};
  std::lock_guard<std::mutex> guard(lock_);
  (void)records_.emplace_back(record);
  (void)value_.emplace_back(record.ToString());
 }

 Status Tracing::GetRecordEntry(int32_t start_step, int32_t end_step, int32_t record_offset,
                               std::vector<int32_t> *result) {
  std::lock_guard<std::mutex> guard(lock_);
  auto total_steps = records_.size() / records_per_step_;
  MS_LOG(DEBUG) << "start_step: " << start_step << " end_step: " << end_step;
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= total_steps,
                               "Expected start_step <= total_steps. Got start_step: " + std::to_string(start_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(end_step <= total_steps,
                               "Expected end_step <= total_steps. Got end_step: " + std::to_string(end_step) +
                                 " total_steps: " + std::to_string(total_steps));
  CHECK_FAIL_RETURN_UNEXPECTED(start_step <= end_step,
                               "Expected start_step <= end_step. Got start_step: " + std::to_string(start_step) +
                                 " end_step: " + std::to_string(end_step));

  for (auto step_num = start_step; step_num <= end_step; step_num++) {
    // each step has 4 entries in device queue tracing
    auto idx = (step_num - 1) * records_per_step_ + record_offset;
    assert(idx < records_.size());
    (void)result->emplace_back(records_[idx].value);
  }
  return Status::OK();
 }

 Tracing::Tracing(int32_t records_per_step) : records_per_step_(records_per_step) {}

 Status Sampling::ReadJson(nlohmann::json *output) {
  RETURN_UNEXPECTED_IF_NULL(output);
  Path path = Path(file_path_);
@@ -134,13 +192,14 @@ Status ProfilingManager::Initialize(ExecutionTree *tree) {
  std::shared_ptr<Sampling> connector_size_sampling = std::make_shared<ConnectorSize>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(connector_size_sampling));

  std::shared_ptr<Sampling> connector_thr_sampling = std::make_shared<ConnectorThroughput>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(connector_thr_sampling));

 #ifndef ENABLE_ANDROID
  std::shared_ptr<Sampling> cpu_sampling = std::make_shared<CpuSampling>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(cpu_sampling));
  std::shared_ptr<Sampling> cpu_sampler = std::make_shared<CpuSampler>(tree_);
  RETURN_IF_NOT_OK(RegisterSamplingNode(cpu_sampler));
 #endif
  // can insert a correct timestamp so that we can ignore the samples that were taken
  // during start up of the pipeline.
  (void)epoch_end_ts_.emplace_back(0);
  (void)epoch_end_step_.emplace_back(0);
  return Status::OK();
 }

@@ -214,6 +273,7 @@ Status ProfilingManager::SaveProfilingData() {
  MS_LOG(INFO) << "Save profiling data end.";
  return Status::OK();
 }

 Status ProfilingManager::Analyze() {
  if (!IsProfilingEnable()) {
    return Status::OK();
@@ -240,8 +300,138 @@ Status ProfilingManager::ChangeFileMode() {
  return Status::OK();
 }

 #ifndef ENABLE_ANDROID
 Status ProfilingManager::GetUserCpuUtil(int32_t epoch_num, std::vector<uint8_t> *result) {
  std::shared_ptr<CpuSampler> cpu_node;
  uint64_t start_ts, end_ts;
  RETURN_IF_NOT_OK(PopulateCpuSamplerAPIInputs(epoch_num, &start_ts, &end_ts, &cpu_node));
  return cpu_node->GetSystemUserCpuUtil(start_ts, end_ts, result);
 }

 Status ProfilingManager::GetSysCpuUtil(int32_t epoch_num, std::vector<uint8_t> *result) {
  std::shared_ptr<CpuSampler> cpu_node;
  uint64_t start_ts, end_ts;
  RETURN_IF_NOT_OK(PopulateCpuSamplerAPIInputs(epoch_num, &start_ts, &end_ts, &cpu_node));
  return cpu_node->GetSystemSysCpuUtil(start_ts, end_ts, result);
 }

 Status ProfilingManager::GetUserCpuUtil(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  std::shared_ptr<CpuSampler> cpu_node;
  uint64_t start_ts, end_ts;
  RETURN_IF_NOT_OK(PopulateCpuSamplerAPIInputs(epoch_num, &start_ts, &end_ts, &cpu_node));
  return cpu_node->GetOpUserCpuUtil(op_id, start_ts, end_ts, result);
 }

 Status ProfilingManager::GetSysCpuUtil(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result) {
  std::shared_ptr<CpuSampler> cpu_node;
  uint64_t start_ts, end_ts;
  RETURN_IF_NOT_OK(PopulateCpuSamplerAPIInputs(epoch_num, &start_ts, &end_ts, &cpu_node));
  return cpu_node->GetOpSysCpuUtil(op_id, start_ts, end_ts, result);
 }

 Status ProfilingManager::PopulateCpuSamplerAPIInputs(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts,
                                                     std::shared_ptr<CpuSampler> *node) {
  RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, start_ts, end_ts));
  std::shared_ptr<Sampling> sampling_node;
  RETURN_IF_NOT_OK(GetSamplingNode(kCpuSamplerName, &sampling_node));
  *node = std::dynamic_pointer_cast<CpuSampler>(sampling_node);
  return Status::OK();
 }
 #endif

 Status ProfilingManager::EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts) {
  if (epoch_num <= 0 || epoch_num >= epoch_end_ts_.size()) {
    std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
    MS_LOG(INFO) << err;
    return {StatusCode::kMDUnexpectedError, err};
  }
  *start_ts = epoch_end_ts_[epoch_num - 1];
  *end_ts = epoch_end_ts_[epoch_num];
  return Status::OK();
 }

 Status ProfilingManager::EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step) {
  if (epoch_num <= 0 || epoch_num >= epoch_end_step_.size()) {
    std::string err = "Epoch: " + std::to_string(epoch_num) + " is invalid.";
    MS_LOG(INFO) << err;
    return {StatusCode::kMDUnexpectedError, err};
  }
  *start_step = epoch_end_step_[epoch_num - 1] + 1;
  *end_step = epoch_end_step_[epoch_num];
  return Status::OK();
 }

 Status ProfilingManager::GetConnectorSize(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result) {
  uint64_t start_ts, end_ts;
  RETURN_IF_NOT_OK(EpochToTimeInterval(epoch_num, &start_ts, &end_ts));
  std::shared_ptr<Sampling> node;
  RETURN_IF_NOT_OK(GetSamplingNode(kConnectorSizeSamplingName, &node));
  auto connector_node = std::dynamic_pointer_cast<ConnectorSize>(node);
  return connector_node->GetOpConnectorSize(op_id, start_ts, end_ts, result);
 }

 Status ProfilingManager::GetPipelineTime(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step, end_step;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetPipelineTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
 }

 Status ProfilingManager::GetPushTime(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step, end_step;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetPushTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
 }

 Status ProfilingManager::GetBatchTime(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step, end_step;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetBatchTime(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
 }

 Status ProfilingManager::GetConnectorSize(int32_t epoch_num, std::vector<int32_t> *result) {
  uint32_t start_step, end_step;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetConnectorSize(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
 }

 Status ProfilingManager::GetEmptyQueueFrequency(int32_t epoch_num, float_t *result) {
  uint32_t start_step, end_step;
  RETURN_IF_NOT_OK(EpochToStepInterval(epoch_num, &start_step, &end_step));
  std::shared_ptr<Tracing> node;
  if (GetTracingNode(kDeviceQueueTracingName, &node).IsOk() ||
      GetTracingNode(kDatasetIteratorTracingName, &node).IsOk()) {
    return node->GetEmptyQueueFrequency(start_step, end_step, result);
  } else {
    return {StatusCode::kMDUnexpectedError, "Cannot find appropriate tracing node"};
  }
 }

 void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
  MS_LOG(INFO) << "Record end of epoch. step_num: " << step_num;
  MS_LOG(INFO) << "Recording end of epoch. step_num: " << step_num;
  (void)epoch_end_ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
  (void)epoch_end_step_.emplace_back(step_num);
 }
--- a/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/perf/profiling.h
@@ -21,6 +21,7 @@
 #include <unordered_map>
 #include <memory>
 #include <chrono>
 #include <mutex>
 #include <nlohmann/json.hpp>
 #include "minddata/dataset/util/status.h"

@@ -30,12 +31,12 @@ namespace dataset {
 class Monitor;
 class ExecutionTree;
 class TreeConsumer;
 class CpuSampler;

 const char kDeviceQueueTracingName[] = "Device_Queue_Tracing";
 const char kDatasetIteratorTracingName[] = "Dataset_Iterator_Tracing";
 const char kConnectorSizeSamplingName[] = "Connector_Size_Sampling";
 const char kConnectorThroughputSamplingName[] = "Connector_Throughput_Sampling";
 const char kCpuSamplingName[] = "Cpu_Sampling";
 const char kCpuSamplerName[] = "Cpu_Sampler";

 // Profiling is a class of basic unit of profiling action
 // This base class encapsulate the serialization output logic
@@ -59,6 +60,7 @@ class Profiling : std::enable_shared_from_this<Profiling> {

 protected:
  std::string file_path_;
  std::mutex lock_;
 };

 // Sampling is a class of profiling which generate samples periodically.
@@ -72,15 +74,40 @@ class Sampling : public Profiling {
  Status ReadJson(nlohmann::json *output);
 };

 typedef struct TracingRecord_s {
  int32_t type;
  int32_t extra_info;
  int32_t batch_num;
  int32_t value;
  uint64_t ts;

  std::string ToString() {
    return std::to_string(type) + " " + std::to_string(extra_info) + " " + std::to_string(batch_num) + " " +
           std::to_string(value) + " " + std::to_string(ts);
  }
 } TracingRecord;

 // Tracing is class of profiling which record samples upon request.
 class Tracing : public Profiling {
 public:
  // Tracing has minimal interface to provide flexible on data recording.
  // It only includes some common routines.
  Status SaveToFile();
  Status SaveToFile() override;
  Status ChangeFileMode() override;
  virtual Status GetPipelineTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) = 0;
  virtual Status GetPushTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) = 0;
  virtual Status GetBatchTime(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) = 0;
  virtual Status GetConnectorSize(int32_t start_step, int32_t end_step, std::vector<int32_t> *result) = 0;
  virtual Status GetEmptyQueueFrequency(int32_t start_step, int32_t end_step, float_t *empty_queue_freq) = 0;
  void Record(const int32_t type, const int32_t extra_info, const int32_t batch_num, const int32_t value,
              const uint64_t time_stamp);

 protected:
  explicit Tracing(int32_t records_per_step);
  const int32_t records_per_step_;
  std::vector<std::string> value_;
  std::vector<TracingRecord> records_;
  Status GetRecordEntry(int32_t start_step, int32_t end_step, int32_t record_offset, std::vector<int32_t> *result);
 };

 // ProfilingManager is a class manages all profiling infrastructure
@@ -135,13 +162,80 @@ class ProfilingManager {
  // Analyze profile data and print warning messages
  Status Analyze();

 #ifndef ENABLE_ANDROID
  /// \brief API to get User CPU utilization for the system
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the sampled User CPU Utilization for the entire system
  /// \return Status object with the error code
  Status GetUserCpuUtil(int32_t epoch_num, std::vector<uint8_t> *result);

  /// \brief API to get System CPU utilization for the system
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the sampled System CPU Utilization for the entire system
  /// \return Status object with the error code
  Status GetSysCpuUtil(int32_t epoch_num, std::vector<uint8_t> *result);

  /// \brief API to get User CPU Utilization of an MD operator
  /// \param [in] op_id The id of the operator
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the sampled User CPU Utilization of the operator.
  /// \return Status object with the error code
  Status GetUserCpuUtil(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);

  /// \brief API to get System CPU Utilization of an MD operator
  /// \param [in] op_id The id of the operator
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the sampled System CPU Utilization of the operator.
  /// \return Status object with the error code
  Status GetSysCpuUtil(int32_t op_id, int32_t epoch_num, std::vector<uint16_t> *result);
 #endif

  /// \brief API to get the connector size of an MD operator
  /// \param [in] op_id The id of the operator
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the sampled connector sizes of the operator
  /// \return Status object with the error code
  Status GetConnectorSize(int32_t op_id, int32_t epoch_num, std::vector<int32_t> *result);

  /// \brief API to get the connector size of DatasetIterator or DeviceQueueOp
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with connector size at each step
  /// \return Status object with the error code
  Status GetConnectorSize(int32_t epoch_num, std::vector<int32_t> *result);

  /// \brief API to get the pipeline time of batches
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the pipeline time for each step
  /// \return Status object with the error code
  Status GetPipelineTime(int32_t epoch_num, std::vector<int32_t> *result);

  /// \brief API to get the push time of batches
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the push time for each each step
  /// \return Status object with the error code
  Status GetPushTime(int32_t epoch_num, std::vector<int32_t> *result);

  /// \brief API to get the batch time of batches
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result A vector with the batch time for each step
  /// \return Status object with the error code
  Status GetBatchTime(int32_t epoch_num, std::vector<int32_t> *result);

  /// \brief API to get fraction of steps that DatasetIterator or DeviceQueueOp connector was empty
  /// \param [in] epoch_num The epoch number for which results are requested
  /// \param [out] result The empty queue frequency
  /// \return Status object with the error code
  Status GetEmptyQueueFrequency(int32_t epoch_num, float_t *result);

 private:
  std::unique_ptr<Monitor> perf_monitor_;
  bool enabled_;
  std::unordered_map<std::string, std::shared_ptr<Tracing>> tracing_nodes_;

  std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;

  ExecutionTree *tree_;                   // ExecutionTree pointer
  TreeConsumer *tree_consumer_;           // TreeConsumer pointer
  std::string dir_path_;                  // where to create profiling file
  std::string device_id_;                 // used when create profiling file,filename_device_id.suffix
  std::vector<uint64_t> epoch_end_ts_;    // End of epoch timestamp
  std::vector<uint32_t> epoch_end_step_;  // End of epoch step number

@@ -155,10 +249,13 @@ class ProfilingManager {
  // @return Status The status code returned
  Status RegisterSamplingNode(std::shared_ptr<Sampling> node);

  ExecutionTree *tree_;          // ExecutionTree pointer
  TreeConsumer *tree_consumer_;  // TreeConsumer pointer
  std::string dir_path_;         // where to create profiling file
  std::string device_id_;        // used when create profiling file,filename_device_id.suffix
  Status EpochToStepInterval(int32_t epoch_num, uint32_t *start_step, uint32_t *end_step);
  // get start and ending timestamp of an epoch
  Status EpochToTimeInterval(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts);
 #ifndef ENABLE_ANDROID
  Status PopulateCpuSamplerAPIInputs(int32_t epoch_num, uint64_t *start_ts, uint64_t *end_ts,
                                     std::shared_ptr<CpuSampler> *node);
 #endif
 };

 enum ProfilingType { TIME, CONNECTOR_DEPTH };
--- a/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc
@@ -257,8 +257,7 @@ Status TreeAdapter::GetNext(TensorRow *row) {
    cur_batch_num_++;
    cur_connector_size_ = tree_->root()->ConnectorSize();
    cur_connector_capacity_ = tree_->root()->ConnectorCapacity();
    RETURN_IF_NOT_OK(
      tracing_->Record(CONNECTOR_DEPTH, cur_connector_capacity_, cur_batch_num_, cur_connector_size_, end_time));
    tracing_->Record(CONNECTOR_DEPTH, cur_connector_capacity_, cur_batch_num_, cur_connector_size_, end_time);
  }
 #endif
  return Status::OK();
--- a/mindspore/lite/minddata/CMakeLists.txt
+++ b/mindspore/lite/minddata/CMakeLists.txt
@@ -180,7 +180,6 @@ if(BUILD_MINDDATA STREQUAL "full")
        ${MINDDATA_DIR}/engine/perf/monitor.cc
        ${MINDDATA_DIR}/engine/perf/device_queue_tracing.cc
        ${MINDDATA_DIR}/engine/perf/connector_size.cc
        ${MINDDATA_DIR}/engine/perf/connector_throughput.cc
        ${MINDDATA_DIR}/engine/perf/dataset_iterator_tracing.cc
        ${MINDDATA_DIR}/engine/datasetops/source/sampler/sampler.cc
        ${MINDDATA_DIR}/engine/datasetops/source/sampler/subset_sampler.cc
--- a/tests/ut/python/dataset/test_profiling.py
+++ b/tests/ut/python/dataset/test_profiling.py
@@ -82,8 +82,7 @@ def confirm_cpuutil(num_pipeline_ops, cpu_uti_file):
    with open(cpu_uti_file) as file1:
        data = json.load(file1)
        op_info = data["op_info"]
        # Confirm <num_pipeline_ops>+1 ops in CPU util file (including op_id=-1 for monitor thread)
        assert len(op_info) == num_pipeline_ops + 1
        assert len(op_info) == num_pipeline_ops


 def confirm_ops_in_pipeline(num_ops, op_list, pipeline_file):
@@ -176,7 +175,6 @@ def test_profiling_complex_pipeline():
                if op_info[i]["op_type"] != "ZipOp":
                    assert "size" in op_info[i]["metrics"]["output_queue"]
                    assert "length" in op_info[i]["metrics"]["output_queue"]
                    assert "throughput" in op_info[i]["metrics"]["output_queue"]
                else:
                    # Note: Zip is an inline op and hence does not have metrics information
                    assert op_info[i]["metrics"] is None
@@ -243,7 +241,6 @@ def test_profiling_inline_ops_pipeline1():
                else:
                    assert "size" in op_info[i]["metrics"]["output_queue"]
                    assert "length" in op_info[i]["metrics"]["output_queue"]
                    assert "throughput" in op_info[i]["metrics"]["output_queue"]

        # Confirm CPU util JSON file content, when 4 ops are in the pipeline JSON file
        confirm_cpuutil(4, cpu_util_file)
@@ -294,7 +291,6 @@ def test_profiling_inline_ops_pipeline2():
                else:
                    assert "size" in op_info[i]["metrics"]["output_queue"]
                    assert "length" in op_info[i]["metrics"]["output_queue"]
                    assert "throughput" in op_info[i]["metrics"]["output_queue"]

        # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
        confirm_cpuutil(5, cpu_util_file)
@@ -384,7 +380,6 @@ def test_profiling_basic_pipeline():
                else:
                    assert "size" in op_info[i]["metrics"]["output_queue"]
                    assert "length" in op_info[i]["metrics"]["output_queue"]
                    assert "throughput" in op_info[i]["metrics"]["output_queue"]

        # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
        confirm_cpuutil(5, cpu_util_file)
@@ -441,7 +436,6 @@ def test_profiling_cifar10_pipeline():
                else:
                    assert "size" in op_info[i]["metrics"]["output_queue"]
                    assert "length" in op_info[i]["metrics"]["output_queue"]
                    assert "throughput" in op_info[i]["metrics"]["output_queue"]

        # Confirm CPU util JSON file content, when 5 ops are in the pipeline JSON file
        confirm_cpuutil(5, cpu_util_file)