Browse Source

MD Profiling: RecordEndOfEpoch in ProfilingManager

tags/v1.6.0
Cathy Wong 4 years ago
parent
commit
3ce0ab3e56
5 changed files with 26 additions and 2 deletions
  1. +2
    -1
      mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc
  2. +9
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
  3. +6
    -0
      mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc
  4. +7
    -0
      mindspore/ccsrc/minddata/dataset/engine/perf/profiling.h
  5. +2
    -1
      mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc

+ 2
- 1
mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc View File

@@ -129,10 +129,11 @@ Status DatasetIterator::FetchNextTensorRow(TensorRow *out_row) {
// An eoe row means we have iterated an epoch.
// The next row in the pipeline might be an EOF or a TensorRow for next epoch
if (out_row->eoe()) {
MS_LOG(INFO) << "End of data iteration.";
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
root_->Tree()->SetEpochEnd();
root_->Tree()->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
}
#endif
return Status::OK();


+ 9
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc View File

@@ -277,6 +277,7 @@ Status DeviceQueueOp::SendDataToAscend() {
connector_size = ChildOpConnectorSize();
connector_capacity = ChildOpConnectorCapacity();
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(send_batch);
}
#endif
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&curr_row));
@@ -571,8 +572,10 @@ Status DeviceQueueOp::SendDataToGPU() {
first_fetch_flag_ = true;
int64_t num_buf = 0;
bool is_break_loop = false;
uint32_t batch_num = 0;
while (!current_row.eof() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
while (!current_row.eoe() && !is_break_loop && !GpuBufferMgr::GetInstance().IsClosed()) {
batch_num++;
RETURN_IF_NOT_OK(FilterMetadata(&current_row));
RETURN_IF_NOT_OK(CheckExceptions(current_row));
RETURN_IF_NOT_OK(receive_queues_[num_buf++ % num_workers_]->Add(std::move(current_row)));
@@ -590,6 +593,12 @@ Status DeviceQueueOp::SendDataToGPU() {
}
}

#ifndef ENABLE_SECURITY
if (current_row.eoe() && tree_->GetProfilingManager()->IsProfilingEnable()) {
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(batch_num);
}
#endif
if (!TaskManager::FindMe()->Interrupted() && !GpuBufferMgr::GetInstance().IsClosed()) {
RETURN_IF_NOT_OK(child_iterator_->FetchNextTensorRow(&current_row));
} else {


+ 6
- 0
mindspore/ccsrc/minddata/dataset/engine/perf/profiling.cc View File

@@ -240,6 +240,12 @@ Status ProfilingManager::ChangeFileMode() {
return Status::OK();
}

void ProfilingManager::RecordEndOfEpoch(uint32_t step_num) {
MS_LOG(INFO) << "Record end of epoch. step_num: " << step_num;
(void)epoch_end_ts_.emplace_back(ProfilingTime::GetCurMilliSecond());
(void)epoch_end_step_.emplace_back(step_num);
}

uint64_t ProfilingTime::GetCurMilliSecond() {
// because cpplint does not allow using namespace
using std::chrono::duration_cast;


+ 7
- 0
mindspore/ccsrc/minddata/dataset/engine/perf/profiling.h View File

@@ -121,6 +121,10 @@ class ProfilingManager {
// launched. This is the master off switch: once called, the profiler will not start even if the env variable says so.
void DisableProfiling() { enabled_ = false; }

// Record end of epoch information.
// Appends the current timestamp (from ProfilingTime::GetCurMilliSecond) and the given step number
// to the manager's end-of-epoch records, and logs the step number at INFO level.
// @param step_num - The number of steps at the point the epoch ended, as counted by the caller
void RecordEndOfEpoch(uint32_t step_num);

// Get read-only access to the registered sampling nodes.
// @return Const reference to the map from string key to Sampling node (presumably keyed by node name - confirm)
const std::unordered_map<std::string, std::shared_ptr<Sampling>> &GetSamplingNodes() { return sampling_nodes_; }

// Launch monitoring thread.
@@ -138,6 +142,9 @@ class ProfilingManager {

std::unordered_map<std::string, std::shared_ptr<Sampling>> sampling_nodes_;

std::vector<uint64_t> epoch_end_ts_; // End of epoch timestamp
std::vector<uint32_t> epoch_end_step_; // End of epoch step number

// Register profile node to tree
// @param node - Profiling node
// @return Status The status code returned


+ 2
- 1
mindspore/ccsrc/minddata/dataset/engine/tree_adapter.cc View File

@@ -235,10 +235,11 @@ Status TreeAdapter::GetNext(TensorRow *row) {

RETURN_IF_NOT_OK(tree_->root()->GetNextRow(row)); // first buf can't be eof or empty buf with none flag
if (row->eoe()) { // return empty tensor if 1st buf is a ctrl buf (no rows)
MS_LOG(INFO) << "End of data iteration.";
MS_LOG(INFO) << "End of data iteration. cur_batch_num_: " << cur_batch_num_;
#ifndef ENABLE_SECURITY
if (is_profiling_enable) {
tree_->SetEpochEnd();
tree_->GetProfilingManager()->RecordEndOfEpoch(cur_batch_num_);
}
#endif
return Status::OK();


Loading…
Cancel
Save