From c8a4a2e9a52a0bfc0043ad585c8f73da29799e6c Mon Sep 17 00:00:00 2001
From: yanghaitao1
Date: Tue, 9 Mar 2021 10:24:39 +0800
Subject: [PATCH] print error msg if profiling enabled after hccl init

---
 mindspore/ccsrc/pipeline/jit/pipeline.cc         | 10 ++++++++++
 .../device/ascend/profiling/profiling_manager.cc | 10 +++++++++-
 .../device/ascend/profiling/profiling_manager.h  |  2 ++
 mindspore/profiler/profiling.py                  |  5 +++--
 4 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/mindspore/ccsrc/pipeline/jit/pipeline.cc b/mindspore/ccsrc/pipeline/jit/pipeline.cc
index 3866530b16..4fd57f02ab 100644
--- a/mindspore/ccsrc/pipeline/jit/pipeline.cc
+++ b/mindspore/ccsrc/pipeline/jit/pipeline.cc
@@ -62,6 +62,7 @@
 #include "transform/graph_ir/convert.h"
 #include "transform/graph_ir/df_graph_manager.h"
 #include "transform/graph_ir/op_adapter_map.h"
+#include "runtime/device/ascend/profiling/profiling_manager.h"
 #endif
 #ifdef ENABLE_DUMP_IR
 #include "debug/rdr/running_data_recorder.h"
@@ -79,6 +80,10 @@ using mindspore::abstract::AbstractTensorPtr;
 using mindspore::abstract::AbstractTuple;
 using mindspore::abstract::AbstractTuplePtr;

+#if (ENABLE_GE || ENABLE_D)
+using mindspore::device::ascend::ProfilingManager;
+#endif
+
 const char IR_TYPE_ANF[] = "anf_ir";
 const char IR_TYPE_ONNX[] = "onnx_ir";
 const char IR_TYPE_MINDIR[] = "mind_ir";
@@ -1078,6 +1083,11 @@ void InitHccl() {
     (void)context::OpenTsd(ms_context);
   }
 #endif
+#if (ENABLE_GE || ENABLE_D)
+  if (!ProfilingManager::GetInstance().IsProfiling()) {
+    ProfilingManager::GetInstance().SetHcclEnabledBefProfilingEnabled();
+  }
+#endif
 }

 void FinalizeHccl() {
diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
index 50232db910..3f756f6699 100644
--- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.cc
@@ -42,7 +42,7 @@ ProfilingManager &ProfilingManager::GetInstance() {
   return inst;
 }

-ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}) {}
+ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false) {}

 uint64_t ProfilingManager::GetJobId() const {
   const char *job_id = std::getenv("JOB_ID");
@@ -139,6 +139,14 @@ bool ProfilingManager::StartupProfiling(uint32_t device_id) {
     MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode.";
     return true;
   }
+
+  if (hccl_enabled_bef_profiling_enabled_) {
+    MS_LOG(ERROR)
+      << "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() "
+         "and mindspore.communication.management.init(). Profiler should be initialized before these code.";
+    return false;
+  }
+
   device_id_ = device_id;

   struct MsprofGeOptions prof_conf = {0};
diff --git a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
index bbec7ba1c4..4ba26d8da9 100644
--- a/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
+++ b/mindspore/ccsrc/runtime/device/ascend/profiling/profiling_manager.h
@@ -61,6 +61,7 @@ class ProfilingManager {
   void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; }
   void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; }
   Status GetProfConf(NotNull prof);
+  void SetHcclEnabledBefProfilingEnabled() { hccl_enabled_bef_profiling_enabled_ = true; }

  protected:
   ProfilingManager();
@@ -70,6 +71,7 @@ class ProfilingManager {
   bool ProfStartUp(NotNull prof_conf);
   uint32_t device_id_;
   MsprofCallback prof_cb_;
+  bool hccl_enabled_bef_profiling_enabled_;
 };

 Status RegProfCtrlCallback(MsprofCtrlCallback func);
diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py
index 98b180eaf0..ce14c31ac7 100644
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@@ -256,8 +256,9 @@ class Profiler:
         """Collect and analyse gpu performance data"""
         if context.get_auto_parallel_context('device_num') > 1 and self._dev_id != str(get_rank()):
             self._dev_id = str(get_rank())
-            logger.error('Please check the Profiler object initialized after set_auto_parallel_context() '
-                         'and init(). Profiler should be initialized after these code. ')
+            logger.error('Please check the Profiler object initialized after mindspore.context.set_auto_parallel_'
+                         'context() and mindspore.communication.management.init(). Profiler should be initialized'
+                         ' after these code.')
         self._gpu_profiler.stop()
         timeline_generator = self._generate_timeline()
