Merge pull request !25367 from yanghaitao/yht_condation_start_profilertags/v1.6.0
| @@ -1373,13 +1373,6 @@ void InitHccl() { | |||
| (void)context::OpenTsd(ms_context); | |||
| } | |||
| #endif | |||
| #if (defined ENABLE_D) | |||
| #ifndef ENABLE_SECURITY | |||
| if (!ProfilingManager::GetInstance().IsProfiling()) { | |||
| ProfilingManager::GetInstance().SetHcclEnabledBefProfilingEnabled(); | |||
| } | |||
| #endif | |||
| #endif | |||
| } | |||
| void FinalizeHccl() { | |||
| @@ -1440,38 +1433,10 @@ void ReleaseGeTsd() { | |||
| } | |||
| } | |||
| #ifndef ENABLE_SECURITY | |||
// Pre-initializes the kernel runtime when profiling is enabled.
// The body is compiled only under ENABLE_D; otherwise the function is a no-op.
// Returns early when ProfilingManager reports that profiling is off.
void StartUpProfiling() {
#ifdef ENABLE_D
  if (!ProfilingManager::GetInstance().IsProfiling()) {
    return;
  }

  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  MS_LOG(INFO) << "Startup profiling";

  // Start up profiling before OpenTsd
  const uint32_t device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
  const std::string device_name = ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET);

  // PreInit is invoked only for the "ms" backend policy on the Ascend device target.
  // device_name already holds the device target, so it is not fetched a second time.
  if (ms_context->backend_policy() == "ms" && device_name == kAscendDevice) {
    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(device_name, device_id);
    MS_EXCEPTION_IF_NULL(runtime_instance);
    runtime_instance->PreInit();
  }
#endif
}
| #endif | |||
| void InitPipeline() { | |||
| // set python env flag | |||
| RecordInitStatus(); | |||
| mindspore::parse::python_adapter::set_python_env_flag(true); | |||
| #ifndef ENABLE_SECURITY | |||
| // Startup profiling before open tsd | |||
| StartUpProfiling(); | |||
| #endif | |||
| // open tsd before ge initialize | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| @@ -14,14 +14,27 @@ | |||
| */ | |||
| #include "profiler/device/ascend/ascend_profiling.h" | |||
| #include <map> | |||
| #include <string> | |||
| #include "pybind_api/api_register.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/utils.h" | |||
| #include "runtime/device/ascend/profiling/profiling_manager.h" | |||
| #include <nlohmann/json.hpp> | |||
| using mindspore::device::ascend::ProfilingManager; | |||
| namespace mindspore { | |||
| namespace profiler { | |||
| namespace ascend { | |||
| std::map<std::string, aclprofAicoreMetrics> kAicMetrics{ | |||
| {"ArithmeticUtilization", ACL_AICORE_ARITHMETIC_UTILIZATION}, | |||
| {"PipeUtilization", ACL_AICORE_PIPE_UTILIZATION}, | |||
| {"Memory", ACL_AICORE_MEMORY_BANDWIDTH}, | |||
| {"MemoryLO", ACL_AICORE_L0B_AND_WIDTH}, | |||
| {"ResourceConflictRatio", ACL_AICORE_RESOURCE_CONFLICT_RATIO}, | |||
| }; | |||
| std::shared_ptr<AscendProfiler> AscendProfiler::ascend_profiler_ = std::make_shared<AscendProfiler>(); | |||
| std::shared_ptr<AscendProfiler> &AscendProfiler::GetInstance() { return ascend_profiler_; } | |||
| @@ -31,21 +44,113 @@ void AscendProfiler::StepProfilingEnable(const bool enable_flag) { | |||
| enable_flag_ = enable_flag; | |||
| } | |||
| void AscendProfiler::Start(const std::string &profiling_options) { | |||
| void AscendProfiler::InitProfiling(const std::string &profiling_path, uint32_t device_id, | |||
| const std::string &profiling_options) { | |||
| MS_LOG(INFO) << "Begin to init profiling and call aclprofInit function."; | |||
| profiling_options_ = profiling_options; | |||
| profile_data_path_ = profiling_path; | |||
| device_id_ = device_id; | |||
| (void)ProfilingManager::GetInstance().InitProfiling(profiling_path, device_id); | |||
| aclError aclRet = aclprofInit(profile_data_path_.c_str(), profile_data_path_.length()); | |||
| if (aclRet != ACL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofInit function."; | |||
| } | |||
| } | |||
| uint64_t AscendProfiler::GetOptionsMask() const { | |||
| uint64_t mask = ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS; | |||
| nlohmann::json options_json; | |||
| try { | |||
| options_json = nlohmann::json::parse(profiling_options_); | |||
| } catch (const std::exception &err) { | |||
| MS_LOG(ERROR) << "Failed to parse profiling options."; | |||
| return ACL_AICORE_NONE; | |||
| } | |||
| if (options_json["task_trace"] == "on") { | |||
| mask |= ACL_PROF_TASK_TIME; | |||
| } | |||
| if (options_json["aicpu"] == "on") { | |||
| mask |= ACL_PROF_AICPU; | |||
| } | |||
| return mask; | |||
| } | |||
| aclprofAicoreMetrics AscendProfiler::GetAicMetrics() const { | |||
| nlohmann::json options_json; | |||
| try { | |||
| options_json = nlohmann::json::parse(profiling_options_); | |||
| } catch (const std::exception &err) { | |||
| MS_LOG(ERROR) << "Failed to parse profiling options."; | |||
| return ACL_AICORE_NONE; | |||
| } | |||
| auto result = std::find_if(kAicMetrics.begin(), kAicMetrics.end(), [&options_json](const auto &metric) { | |||
| return metric.first == options_json["aic_metrics"]; | |||
| }); | |||
| if (result == kAicMetrics.end()) { | |||
| return ACL_AICORE_NONE; | |||
| } | |||
| return result->second; | |||
| } | |||
| void AscendProfiler::Start() { | |||
| uint32_t device_list[1] = {device_id_}; | |||
| uint32_t device_num = 1; | |||
| uint64_t mask = GetOptionsMask(); | |||
| aclprofAicoreMetrics aic_metrics = GetAicMetrics(); | |||
| acl_config_ = aclprofCreateConfig(device_list, device_num, aic_metrics, nullptr, GetOptionsMask()); | |||
| if (acl_config_ == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofCreateConfig function."; | |||
| } | |||
| aclError aclRet = aclprofStart(acl_config_); | |||
| if (aclRet != ACL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofStart function."; | |||
| } | |||
| MS_LOG(INFO) << "Start profiling, options mask is " << mask << " aic_metrics is " << aic_metrics; | |||
| StepProfilingEnable(true); | |||
| } | |||
| void AscendProfiler::Stop() { | |||
| MS_LOG(INFO) << "Stop profiling"; | |||
| MS_LOG(INFO) << "Begin to stop profiling."; | |||
| if (acl_config_ == nullptr) { | |||
| MS_LOG(EXCEPTION) | |||
| << "Failed to stop profiling because of null acl config.Please make sure call Profiler.Start function " | |||
| "before call Profiler.Stop function."; | |||
| } | |||
| aclError aclRet = aclprofStop(acl_config_); | |||
| if (aclRet != ACL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofStop function."; | |||
| } | |||
| aclRet = aclprofDestroyConfig(acl_config_); | |||
| if (aclRet != ACL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofDestroyConfig function."; | |||
| } | |||
| StepProfilingEnable(false); | |||
| } | |||
| void AscendProfiler::Finalize() const { | |||
| MS_LOG(INFO) << "Begin to finalize profiling"; | |||
| aclError aclRet = aclprofFinalize(); | |||
| if (aclRet != ACL_SUCCESS) { | |||
| MS_LOG(EXCEPTION) << "Failed to call aclprofDestroyConfig function."; | |||
| } | |||
| } | |||
| REGISTER_PYBIND_DEFINE(AscendProfiler_, ([](const py::module *m) { | |||
| (void)py::class_<AscendProfiler, std::shared_ptr<AscendProfiler>>(*m, "AscendProfiler") | |||
| .def_static("get_instance", &AscendProfiler::GetInstance, "AscendProfiler get_instance.") | |||
| .def("start", &AscendProfiler::Start, py::arg("profiling_options"), "start") | |||
| .def("stop", &AscendProfiler::Stop, "stop"); | |||
| .def("init", &AscendProfiler::InitProfiling, py::arg("profiling_path"), py::arg("device_id"), | |||
| py::arg("profiling_options"), "init") | |||
| .def("start", &AscendProfiler::Start, "start") | |||
| .def("stop", &AscendProfiler::Stop, "stop") | |||
| .def("finalize", &AscendProfiler::Finalize, "finalize"); | |||
| })); | |||
| } // namespace ascend | |||
| } // namespace profiler | |||
| @@ -18,6 +18,7 @@ | |||
| #include <string> | |||
| #include <memory> | |||
| #include "profiler/device/profiling.h" | |||
| #include "acl/acl_prof.h" | |||
| namespace mindspore { | |||
| namespace profiler { | |||
| @@ -30,18 +31,24 @@ class AscendProfiler : public Profiler { | |||
| AscendProfiler(const AscendProfiler &) = delete; | |||
| AscendProfiler &operator=(const AscendProfiler &) = delete; | |||
| void Init(const std::string &profileDataPath) { return; } | |||
| void InitProfiling(const std::string &profiling_path, uint32_t device_id, const std::string &profiling_options); | |||
| void Stop(); | |||
| void StepProfilingEnable(const bool enable_flag) override; | |||
| void OpDataProducerEnd() { return; } | |||
| void Start(const std::string &profiling_options); | |||
| void Start(); | |||
| bool GetProfilingEnableFlag() const { return enable_flag_; } | |||
| std::string GetProfilingOptions() const { return profiling_options_; } | |||
| void SaveProfileData() { return; } | |||
| void ClearInst() { return; } | |||
| uint64_t GetOptionsMask() const; | |||
| aclprofAicoreMetrics GetAicMetrics() const; | |||
| void Finalize() const; | |||
| private: | |||
| static std::shared_ptr<AscendProfiler> ascend_profiler_; | |||
| std::string profiling_options_; | |||
| uint32_t device_id_; | |||
| aclprofConfig *acl_config_; | |||
| }; | |||
| } // namespace ascend | |||
| } // namespace profiler | |||
| @@ -244,7 +244,7 @@ void AsyncDataDumpUninit() { | |||
| void AscendKernelRuntime::ReportProfilingData() { | |||
| auto context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(context); | |||
| if (ProfilingManager::GetInstance().IsProfiling() && | |||
| if (ProfilingManager::GetInstance().IsProfilingStart() && | |||
| context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) { | |||
| // Save Profiling Framework data | |||
| OpNameTaskStreamReporter reporter(device_id_, "nonsink", stream_id_task_id_op_name_map_); | |||
| @@ -295,9 +295,6 @@ void AscendKernelRuntime::ReleaseDeviceRes() { | |||
| } | |||
| (void)ResetDevice(device_id); | |||
| #ifndef ENABLE_SECURITY | |||
| (void)ProfilingManager::GetInstance().StopProfiling(); | |||
| #endif | |||
| current_graph_ = nullptr; | |||
| if (context_ptr->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode && | |||
| !context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK)) { | |||
| @@ -313,14 +310,6 @@ void AscendKernelRuntime::PreInit() { | |||
| if (error_manager_ret != 0) { | |||
| MS_LOG(WARNING) << "Init ErrorManager failed."; | |||
| } | |||
| auto ret = ProfilingManager::GetInstance().StartupProfiling(device_id_); | |||
| if (!ret) { | |||
| const string &error_message = ErrorManager::GetInstance().GetErrorMessage(); | |||
| if (!error_message.empty() && error_message.find(kUnknowErrorString) == string::npos) { | |||
| MS_LOG(ERROR) << "Ascend error occurred, error message:\n" << error_message; | |||
| } | |||
| MS_EXCEPTION(DeviceProcessError) << "StartupProfiling failed."; | |||
| } | |||
| } | |||
| #endif | |||
| @@ -567,10 +556,18 @@ bool AscendKernelRuntime::LoadTask(const session::KernelGraph &graph) { | |||
| } | |||
| #ifndef ENABLE_SECURITY | |||
| if (ProfilingManager::GetInstance().IsProfiling()) { | |||
| if (ProfilingManager::GetInstance().IsProfilingInitialized()) { | |||
| auto task_ids = ModelRunner::Instance().GetTaskIdList(model_iter->first); | |||
| auto stream_ids = ModelRunner::Instance().GetStreamIdList(model_iter->first); | |||
| ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph); | |||
| // Report data directly if profiling is start | |||
| if (ProfilingUtils::ValidComputeGraph(graph)) { | |||
| if (ProfilingManager::GetInstance().IsProfilingStart()) { | |||
| ProfilingUtils::ReportProfilingData(task_ids, stream_ids, graph.graph_id()); | |||
| } else { | |||
| // Cache data and save when profiling is start | |||
| ProfilingUtils::SetReportProfilingData(task_ids, stream_ids, graph.graph_id()); | |||
| } | |||
| } | |||
| } | |||
| LaunchDataDump(graph.graph_id()); | |||
| #endif | |||
| @@ -25,6 +25,9 @@ | |||
| #include "utils/convert_utils.h" | |||
| #include "runtime/base.h" | |||
| #include <nlohmann/json.hpp> | |||
| #include "runtime/device/ascend/profiling/profiling_utils.h" | |||
| using mindspore::device::ascend::ProfilingUtils; | |||
| namespace { | |||
| constexpr Status PROF_SUCCESS = 0; | |||
| @@ -39,7 +42,8 @@ ProfilingManager &ProfilingManager::GetInstance() { | |||
| return inst; | |||
| } | |||
| ProfilingManager::ProfilingManager() : device_id_(0), prof_cb_({0}), hccl_enabled_bef_profiling_enabled_(false) {} | |||
| ProfilingManager::ProfilingManager() | |||
| : device_id_(0), prof_cb_({0}), cur_state_(kProfilingInvalid), profiling_path_("") {} | |||
| uint64_t ProfilingManager::GetJobId() const { return 0; } | |||
| @@ -110,57 +114,15 @@ Status ProfilingManager::GetProfConf(const NotNull<MsprofGeOptions *> prof) { | |||
| return PROF_SUCCESS; | |||
| } | |||
| bool ProfilingManager::StartupProfiling(uint32_t device_id) { | |||
| auto is_profiling = IsProfiling(); | |||
| if (!is_profiling) { | |||
| int32_t cb_ret = MsprofInit(0XFF, nullptr, 0); | |||
| if (cb_ret != UintToInt(PROF_SUCCESS)) { | |||
| MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret; | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; | |||
| return true; | |||
| } | |||
| if (hccl_enabled_bef_profiling_enabled_) { | |||
| MS_LOG(ERROR) | |||
| << "Please check the Profiler object initialized before mindspore.context.set_auto_parallel_context() " | |||
| "and mindspore.communication.management.init(). Profiler should be initialized before these code."; | |||
| return false; | |||
| } | |||
| bool ProfilingManager::InitProfiling(const std::string &profiling_path, uint32_t device_id) { | |||
| profiling_path_ = profiling_path; | |||
| device_id_ = device_id; | |||
| struct MsprofGeOptions prof_conf = {0}; | |||
| if (GetProfConf(NOT_NULL(&prof_conf)) != PROF_SUCCESS) { | |||
| MS_LOG(ERROR) << "Get prof conf failed."; | |||
| return false; | |||
| } | |||
| if (!ProfStartUp(NOT_NULL(&prof_conf))) { | |||
| MS_LOG(ERROR) << "ProfMgrStartUp failed."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| bool ProfilingManager::ProfStartUp(const NotNull<MsprofGeOptions *> prof_conf) const { | |||
| MS_LOG(INFO) << "Prof start up. "; | |||
| bool ret = ProfRegisterCtrlCallback(); | |||
| if (ret == false) { | |||
| return ret; | |||
| } | |||
| // call profiling start up api | |||
| int32_t cb_ret = MsprofInit(static_cast<uint32_t>(MsprofCtrlCallbackType::MSPROF_CTRL_INIT_GE_OPTIONS), | |||
| static_cast<void *>(prof_conf.get()), sizeof(MsprofGeOptions)); | |||
| if (cb_ret != UintToInt(PROF_SUCCESS)) { | |||
| MS_LOG(ERROR) << "Call msprofCtrlCallback failed, ret: " << cb_ret; | |||
| return false; | |||
| } | |||
| MS_LOG(INFO) << "Start up profiling success."; | |||
| return true; | |||
| } | |||
| @@ -188,25 +150,6 @@ rtError_t CtrlCallbackHandle(uint32_t rt_type, void *data, uint32_t /* len */) { | |||
| return RT_ERROR_NONE; | |||
| } | |||
| bool ProfilingManager::StopProfiling() const { | |||
| MS_LOG(INFO) << "StopProfiling"; | |||
| if (!IsProfiling()) { | |||
| MS_LOG(INFO) << "No need profiling. please export PROFILING_MODE and in train mode."; | |||
| return true; | |||
| } | |||
| // plugin unregister | |||
| PluginUnInit(); | |||
| // stop profiling | |||
| int32_t cb_ret = MsprofFinalize(); | |||
| if (cb_ret != 0) { | |||
| MS_LOG(WARNING) << "Call MsprofFinalize failed, ret: " << cb_ret; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter_data) const { | |||
| if (prof_cb_.msprofReporterCallback == nullptr) { | |||
| MS_LOG(ERROR) << "MsprofReporterCallback callback is nullptr."; | |||
| @@ -224,6 +167,58 @@ Status ProfilingManager::CallMsprofReport(const NotNull<ReporterData *> reporter | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfilingManager::ProfHandleInit() { | |||
| MS_LOG(INFO) << "Begin to init profiling. Current profiling state is " << cur_state_; | |||
| cur_state_ = kProfilingInit; | |||
| auto cb_ret = ProfilingManager::GetInstance().PluginInit(); | |||
| if (cb_ret != PROF_SUCCESS) { | |||
| MS_LOG(ERROR) << "Failed to init profiling."; | |||
| return PROF_FAILED; | |||
| } | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfilingManager::ProfHandleStart() { | |||
| MS_LOG(INFO) << "Begin to start profiling. Current profiling state is " << cur_state_; | |||
| cur_state_ = kProfilingStart; | |||
| // Report graph data if there is any cache data. | |||
| ProfilingUtils::ReportAllGraphProfilingData(); | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfilingManager::ProfHandleStop() { | |||
| MS_LOG(INFO) << "Begin to stop profiling. Current profiling state is " << cur_state_; | |||
| cur_state_ = kProfilingStop; | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfilingManager::ProfHandleFinalize() { | |||
| MS_LOG(INFO) << "Begin to finalize profiling. Current profiling state is " << cur_state_; | |||
| cur_state_ = kProfilingFinalize; | |||
| ProfilingManager::GetInstance().PluginUnInit(); | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfilingManager::ProfCommandHandle(ProfCommandHandleType type) { | |||
| // Only need process "Init"/“Start”/“Stop”/“Finalize” | |||
| if (type == kProfCommandhandleInit) { | |||
| return ProfHandleInit(); | |||
| } else if (type == kProfCommandhandleStart) { | |||
| return ProfHandleStart(); | |||
| } else if (type == kProfCommandhandleStop) { | |||
| return ProfHandleStop(); | |||
| } else if (type == kProfCommandhandleFinalize) { | |||
| return ProfHandleFinalize(); | |||
| } | |||
| MS_LOG(ERROR) << "Receive invalid profiling type " << type << ". Current profiling state is << " << cur_state_; | |||
| return PROF_FAILED; | |||
| } | |||
| Status ProfCtrlSwitchHandle(void *data) { | |||
| if (data == nullptr) { | |||
| MS_LOG(ERROR) << "Ctrl switch handl data is nullptr."; | |||
| @@ -235,18 +230,7 @@ Status ProfCtrlSwitchHandle(void *data) { | |||
| return ProfCommandHandle(type); | |||
| } | |||
| Status ProfCommandHandle(ProfCommandHandleType type) { | |||
| MS_LOG(INFO) << "ProfCommandHandle start, type:" << type; | |||
| if (type == kProfCommandhandleInit) { | |||
| auto cb_ret = ProfilingManager::GetInstance().PluginInit(); | |||
| if (cb_ret != PROF_SUCCESS) { | |||
| MS_LOG(ERROR) << "Profiling plugin int failed."; | |||
| return PROF_FAILED; | |||
| } | |||
| } | |||
| return PROF_SUCCESS; | |||
| } | |||
| Status ProfCommandHandle(ProfCommandHandleType type) { return ProfilingManager::GetInstance().ProfCommandHandle(type); } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -28,6 +28,7 @@ | |||
| #include "toolchain/slog.h" | |||
| #include "runtime/base.h" | |||
| #include "profiler/device/profiling.h" | |||
| #include "acl/acl_prof.h" | |||
| using std::map; | |||
| using std::string; | |||
| @@ -50,19 +51,16 @@ enum ProfCommandHandleType { | |||
| kProfCommandhandleModelUnsubscribe | |||
| }; | |||
| enum ProfilingState { kProfilingInvalid, kProfilingInit, kProfilingStart, kProfilingStop, kProfilingFinalize }; | |||
| class ProfilingManager { | |||
| public: | |||
| static ProfilingManager &GetInstance(); | |||
| uint64_t GetJobId() const; | |||
| bool ProfRegisterCtrlCallback() const; | |||
| bool StartupProfiling(uint32_t device_id); | |||
| bool StopProfiling() const; | |||
| inline bool IsProfiling() const { | |||
| auto profiler_manager = profiler::ProfilerManager::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(profiler_manager); | |||
| return profiler_manager->GetProfilingEnableFlag(); | |||
| } | |||
| bool InitProfiling(const std::string &profiling_path, uint32_t device_id); | |||
| bool IsProfilingInitialized() const { return cur_state_ >= kProfilingInit; } | |||
| inline bool IsProfilingStart() const { return cur_state_ >= kProfilingStart; } | |||
| Status PluginInit() const; | |||
| void PluginUnInit() const; | |||
| Status CallMsprofReport(NotNull<ReporterData *> reporter_data) const; | |||
| @@ -71,17 +69,22 @@ class ProfilingManager { | |||
| void SetMsprofReporterCallback(MsprofReporterCallback func) { prof_cb_.msprofReporterCallback = func; } | |||
| void SetMsprofSetDeviceCallback(MsprofSetDeviceCallback func) { prof_cb_.msprofSetDeviceCallback = func; } | |||
| Status GetProfConf(NotNull<MsprofGeOptions *> prof); | |||
| void SetHcclEnabledBefProfilingEnabled() { hccl_enabled_bef_profiling_enabled_ = true; } | |||
| Status ProfCommandHandle(ProfCommandHandleType type); | |||
| Status ProfHandleInit(); | |||
| Status ProfHandleStart(); | |||
| Status ProfHandleStop(); | |||
| Status ProfHandleFinalize(); | |||
| protected: | |||
| ProfilingManager(); | |||
| ~ProfilingManager() {} | |||
| private: | |||
| bool ProfStartUp(NotNull<MsprofGeOptions *> prof_conf) const; | |||
| uint32_t device_id_; | |||
| MsprofCallback prof_cb_; | |||
| bool hccl_enabled_bef_profiling_enabled_; | |||
| aclprofConfig *acl_config_; | |||
| ProfilingState cur_state_; | |||
| std::string profiling_path_; | |||
| }; | |||
| Status ProfCommandHandle(ProfCommandHandleType type); | |||
| @@ -390,14 +390,15 @@ bool ProfilingUtils::ValidComputeGraph(const session::KernelGraph &kernel_graph) | |||
| return false; | |||
| } | |||
| void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids, | |||
| const session::KernelGraph &kernel_graph) { | |||
| if (!ValidComputeGraph(kernel_graph)) { | |||
| MS_LOG(INFO) << "Not a valid compute graph:" << kernel_graph.graph_id(); | |||
| return; | |||
| void ProfilingUtils::ReportAllGraphProfilingData() { | |||
| for (auto data : report_data_) { | |||
| ReportProfilingData(data.task_ids_, data.stream_ids_, data.graph_id_); | |||
| } | |||
| } | |||
| auto ret = graph_profiling_cnode_.find(kernel_graph.graph_id()); | |||
| void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids, | |||
| uint32_t graph_id) { | |||
| auto ret = graph_profiling_cnode_.find(graph_id); | |||
| if (ret == graph_profiling_cnode_.end()) { | |||
| MS_LOG(ERROR) << "Graph id not found"; | |||
| return; | |||
| @@ -415,7 +416,7 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, | |||
| graph_reporter.ReportData(); | |||
| // Report profiling point | |||
| auto point_iter = graph_point_.find(kernel_graph.graph_id()); | |||
| auto point_iter = graph_point_.find(graph_id); | |||
| if (point_iter == graph_point_.end()) { | |||
| MS_LOG(ERROR) << "Graph id not found in graph_point"; | |||
| return; | |||
| @@ -426,6 +427,12 @@ void ProfilingUtils::ReportProfilingData(const std::vector<uint32_t> &task_ids, | |||
| } | |||
| point_reporter.ReportData(); | |||
| } | |||
| void ProfilingUtils::SetReportProfilingData(const std::vector<uint32_t> &task_ids, | |||
| const std::vector<uint32_t> &stream_ids, uint32_t graph_id) { | |||
| GraphProfilingData report_data = {task_ids, stream_ids, graph_id}; | |||
| report_data_.emplace_back(report_data); | |||
| } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -48,6 +48,12 @@ struct ProfilingContent { | |||
| uint32_t flags; | |||
| }; | |||
// Task and stream ids of one graph together with the graph id, as cached by
// ProfilingUtils::SetReportProfilingData until the data is reported.
struct GraphProfilingData {
  std::vector<uint32_t> task_ids_;
  std::vector<uint32_t> stream_ids_;
  // Default-initialized so a default-constructed entry never carries an indeterminate id.
  uint32_t graph_id_{0};
};
| class ProfilingUtils { | |||
| public: | |||
| ProfilingUtils() = default; | |||
| @@ -69,7 +75,7 @@ class ProfilingUtils { | |||
| static void SetGraphKernelName(uint32_t graph_id, const std::vector<std::string> &kernel_names); | |||
| // Save graph information to Framework file | |||
| static void ReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids, | |||
| const session::KernelGraph &graph); | |||
| uint32_t graph_id); | |||
| // Generate profiling trace | |||
| static ProfilingTraceInfo GenerateProfilingTrace(const session::KernelGraph &kernel_graph); | |||
| @@ -81,6 +87,11 @@ class ProfilingUtils { | |||
| static std::map<uint32_t, std::vector<std::string>> graph_kernel_name() { return graph_kernel_name_; } | |||
| static void SetReportProfilingData(const std::vector<uint32_t> &task_ids, const std::vector<uint32_t> &stream_ids, | |||
| uint32_t graph_id); | |||
| static void ReportAllGraphProfilingData(); | |||
| static bool ValidComputeGraph(const session::KernelGraph &kernel_graph); | |||
| inline static constexpr char kProfiling[] = "Profiling"; | |||
| inline static constexpr char kNotify[] = "notify"; | |||
| inline static constexpr char kProfilerTraceId[] = "profiler_trace_id"; | |||
| @@ -101,7 +112,6 @@ class ProfilingUtils { | |||
| static void GetCNodeOutputRealNode(const std::string &node_name, const session::KernelGraph &kernel_graph, | |||
| NotNull<std::set<std::string> *> getnext_outputs); | |||
| static bool ValidComputeGraph(const session::KernelGraph &kernel_graph); | |||
| static void SaveProfilingPoint(uint32_t graph_id, const std::string &node_name, uint32_t point_id); | |||
| // graph id --> (kernel name list) | |||
| @@ -109,8 +119,9 @@ class ProfilingUtils { | |||
| inline static std::map<uint32_t, std::vector<std::string>> graph_kernel_name_; | |||
| inline static std::map<uint32_t, std::vector<std::shared_ptr<ProfDesc>>> graph_point_; | |||
| inline static uint32_t custom_node_index_; | |||
| inline static std::vector<GraphProfilingData> report_data_; | |||
| }; | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_ | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_DEVICE_ASCEND_PROFILING_PROFILING_UTILS_H_ | |||
| @@ -278,7 +278,7 @@ bool TaskGenerator::LaunchAllKernel(const std::vector<CNodePtr> &anf_node_list, | |||
| #ifndef ENABLE_SECURITY | |||
| ProfilingUtils::SetGraphKernelName(graph_id, kernel_name_list); | |||
| if (ProfilingManager::GetInstance().IsProfiling()) { | |||
| if (ProfilingManager::GetInstance().IsProfilingInitialized()) { | |||
| ProfilingUtils::SetGraphProfilingCNode(graph_id, profiling_cnode_list); | |||
| } | |||
| #endif | |||
| @@ -642,7 +642,7 @@ CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr<session::K | |||
| #ifndef ENABLE_SECURITY | |||
| void KernelAdjust::Profiling(NotNull<session::KernelGraph *> kernel_graph_ptr) { | |||
| if (!ascend::ProfilingManager::GetInstance().IsProfiling()) { | |||
| if (!ascend::ProfilingManager::GetInstance().IsProfilingInitialized()) { | |||
| MS_LOG(INFO) << "No need to profiling"; | |||
| return; | |||
| } | |||
| @@ -140,6 +140,8 @@ class Profiler: | |||
| self._get_devid_rankid_and_devtarget() | |||
| self._get_output_path(kwargs) | |||
| self._profile_communication = False | |||
| self._has_started = False | |||
| self.start_profile = True | |||
| # Setup and start MindData Profiling | |||
| self._md_profiler = cde.GlobalContext.profiling_manager() | |||
| @@ -174,7 +176,7 @@ class Profiler: | |||
| raise ValueError(msg) | |||
| # use context interface to open profiling, for the new mindspore version(after 2020.5.21) | |||
| self._ascend_profiler = c_expression.AscendProfiler.get_instance() | |||
| self._ascend_profiler.start(profiling_options) | |||
| self._ascend_profiler.init(self._output_path, int(self._dev_id), profiling_options) | |||
| base_profiling_container_path = os.path.join(self._output_path, "container") | |||
| container_path = os.path.join(base_profiling_container_path, self._dev_id) | |||
| data_path = os.path.join(container_path, "data") | |||
| @@ -184,8 +186,10 @@ class Profiler: | |||
| # add job id env through user input later | |||
| self._job_id_env = 0 | |||
| self._start_time = int(time.time() * 10000000) | |||
| logger.info("Profiling: profiling start time: %d", self._start_time) | |||
| self._init_time = int(time.time() * 10000000) | |||
| logger.info("Profiling: profiling init time: %d", self._init_time) | |||
| if self.start_profile: | |||
| self.start() | |||
| def _construct_profiling_options(self): | |||
| """ | |||
| @@ -225,7 +229,9 @@ class Profiler: | |||
| logger.critical(msg) | |||
| raise ValueError(msg) | |||
| self._output_path, _ = os.path.split(job_dir) | |||
| self.start_profile = kwargs.pop("start_profile", True) | |||
| if not isinstance(self.start_profile, bool): | |||
| raise TypeError("The parameter start_profile must be bool.") | |||
| self._profile_communication = kwargs.pop("profile_communication", False) | |||
| if not isinstance(self._profile_communication, bool): | |||
| raise TypeError("The parameter profile_communication must be bool.") | |||
| @@ -270,6 +276,12 @@ class Profiler: | |||
| self._rank_size = get_group_size() | |||
| release() | |||
| if (not self.start_profile) or self._has_started: | |||
| self._ascend_profiler.stop() | |||
| else: | |||
| msg = "The profiler has not start, so can not stop." | |||
| logger.info(msg) | |||
| self._ascend_profiler.finalize() | |||
| job_id = self._get_profiling_job_id() | |||
| logger.info("Profiling: job id is %s ", job_id) | |||
| @@ -377,7 +389,30 @@ class Profiler: | |||
| self._dev_id, self._rank_id, is_training_mode_flag) | |||
| logger.info("Profiling: analyzing the operation FLOPs.") | |||
| flops_parser.execute() | |||
def start(self):
    """
    Used for Ascend, start profiling.

    Raises:
        RuntimeError: If the profiler has already been started.
    """
    if self._has_started:
        msg = "The profiler has already started."
        logger.error(msg)
        raise RuntimeError(msg)

    self._ascend_profiler.start()
    # Mark the profiler as started only after the backend call returns, so a failed start
    # does not leave the object claiming that profiling is running.
    self._has_started = True
    self._start_time = int(time.time() * 10000000)
    logger.info("Profiling: start time: %d", self._start_time)
def stop(self):
    """
    Used for Ascend, stop profiling.

    Raises:
        RuntimeError: If the profiler has not been started.
    """
    if not self._has_started:
        msg = "The profiler has not start, so can not stop."
        logger.error(msg)
        raise RuntimeError(msg)

    self._ascend_profiler.stop()
    # Clear the flag only after the backend call returns, so a failed stop is not recorded
    # as a stopped profiler.
    self._has_started = False
    self._stop_time = int(time.time() * 10000000)
    logger.info("Profiling: stop time: %d", self._stop_time)
| def _gpu_analyse(self): | |||
| """Collect and analyse gpu performance data""" | |||
| @@ -573,8 +608,7 @@ class Profiler: | |||
| if int(job_start_time) < self._start_time: | |||
| logger.warning("Find profiling job path %s, but start_time(%d) is earlier than this training " | |||
| "start_time(%d), profiler will ignore this job dir.", | |||
| job_dir, job_start_time, self._start_time) | |||
| continue | |||
| job_dir, int(job_start_time), self._start_time) | |||
| job_id = dir_name | |||
| break | |||
| @@ -16,6 +16,7 @@ | |||
| #include <string> | |||
| #include "prof_mgr_core.h" | |||
| #include "prof_callback.h" | |||
| #include "acl/acl_prof.h" | |||
| namespace Msprof { | |||
| namespace Engine { | |||
| @@ -72,4 +73,20 @@ int32_t MsprofInit(uint32_t dataType, void *data, uint32_t dataLen) { return 0; | |||
| * @param NULL | |||
| * @return 0:SUCCESS, >0:FAILED | |||
| */ | |||
| int32_t MsprofFinalize() { return 0; } | |||
| int32_t MsprofFinalize() { return 0; } | |||
| ACL_FUNC_VISIBILITY aclError aclprofInit(const char *profilerResultPath, size_t length) { return ACL_SUCCESS; } | |||
| ACL_FUNC_VISIBILITY aclError aclprofStart(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; } | |||
| ACL_FUNC_VISIBILITY aclError aclprofStop(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; } | |||
| ACL_FUNC_VISIBILITY aclError aclprofFinalize() { return ACL_SUCCESS; } | |||
| ACL_FUNC_VISIBILITY aclprofConfig *aclprofCreateConfig(uint32_t *deviceIdList, uint32_t deviceNums, | |||
| aclprofAicoreMetrics aicoreMetrics, | |||
| aclprofAicoreEvents *aicoreEvents, uint64_t dataTypeConfig) { | |||
| return nullptr; | |||
| } | |||
| ACL_FUNC_VISIBILITY aclError aclprofDestroyConfig(const aclprofConfig *profilerConfig) { return ACL_SUCCESS; } | |||