| @@ -25,10 +25,12 @@ | |||
| namespace { | |||
| #ifdef ENABLE_DUMP_IR | |||
// Names of the environment variables that configure RDR.
constexpr auto ENV_RDR_ENABLE = "MS_RDR_ENABLE";
constexpr auto ENV_RDR_MODE = "MS_RDR_MODE";
constexpr auto ENV_RDR_PATH = "MS_RDR_PATH";
// JSON keys looked up when parsing the RDR settings of the config file.
// KEY_PATH is defined exactly once; a second definition in this scope would be a redefinition error.
constexpr auto KEY_RDR_SETTINGS = "rdr";
constexpr auto KEY_ENABLE = "enable";
constexpr auto KEY_MODE = "mode";
constexpr auto KEY_PATH = "path";
| #endif | |||
| constexpr auto KEY_MEM_REUSE_SETTINGS = "sys"; | |||
| constexpr auto KEY_MEM_REUSE = "mem_reuse"; | |||
| @@ -52,6 +54,22 @@ std::optional<bool> GetRdrEnableFromEnv() { | |||
| return std::nullopt; | |||
| } | |||
| std::optional<int> GetRdrModeFromEnv() { | |||
| // get environment variable to configure RDR | |||
| std::string env_mode_str = common::GetEnv(ENV_RDR_MODE); | |||
| if (!env_mode_str.empty()) { | |||
| (void)std::transform(env_mode_str.begin(), env_mode_str.end(), env_mode_str.begin(), ::tolower); | |||
| if (env_mode_str != "1" && env_mode_str != "2") { | |||
| MS_LOG(WARNING) << "The environment variable '" << ENV_RDR_MODE << "' should be 1 or 2."; | |||
| } | |||
| if (env_mode_str == "2") { | |||
| return Normal; | |||
| } | |||
| return Exceptional; | |||
| } | |||
| return std::nullopt; | |||
| } | |||
| std::optional<std::string> GetRdrPathFromEnv() { | |||
| // get environment variable to configure RDR | |||
| std::string path = common::GetEnv(ENV_RDR_PATH); | |||
| @@ -102,6 +120,11 @@ void EnvConfigParser::ParseFromEnv() { | |||
| has_rdr_setting_ = true; | |||
| rdr_enabled_ = rdr_enable_env.value(); | |||
| } | |||
| auto rdr_mode_env = GetRdrModeFromEnv(); | |||
| if (rdr_mode_env.has_value()) { | |||
| has_rdr_setting_ = true; | |||
| rdr_mode_ = rdr_mode_env.value(); | |||
| } | |||
| auto path_env = GetRdrPathFromEnv(); | |||
| if (path_env.has_value()) { | |||
| has_rdr_setting_ = true; | |||
| @@ -204,12 +227,35 @@ void EnvConfigParser::ParseRdrSetting(const nlohmann::json &content) { | |||
| ParseRdrEnable(**rdr_enable); | |||
| } | |||
| auto rdr_mode = CheckJsonKeyExist(*rdr_setting, KEY_RDR_SETTINGS, KEY_MODE); | |||
| if (rdr_mode.has_value()) { | |||
| ParseRdrMode(**rdr_mode); | |||
| } | |||
| auto rdr_path = CheckJsonKeyExist(*rdr_setting, KEY_RDR_SETTINGS, KEY_PATH); | |||
| if (rdr_path.has_value()) { | |||
| ParseRdrPath(**rdr_path); | |||
| } | |||
| } | |||
| void EnvConfigParser::ParseRdrEnable(const nlohmann::json &content) { | |||
| if (!content.is_boolean()) { | |||
| MS_LOG(WARNING) << "Json parse failed. 'enable' in " << KEY_RDR_SETTINGS << " should be boolean." | |||
| << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context."; | |||
| return; | |||
| } | |||
| rdr_enabled_ = content; | |||
| } | |||
| void EnvConfigParser::ParseRdrMode(const nlohmann::json &content) { | |||
| if (content != Exceptional && content != Normal) { | |||
| MS_LOG(WARNING) << "Json parse failed. 'mode' in " << KEY_RDR_SETTINGS << " should be 1 or 2." | |||
| << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context."; | |||
| return; | |||
| } | |||
| rdr_mode_ = content; | |||
| } | |||
| void EnvConfigParser::ParseRdrPath(const nlohmann::json &content) { | |||
| std::string err_msg = "RDR path parse failed. The RDR path will be a default value: '" + rdr_path_ + | |||
| "'. Please check the settings about '" + KEY_RDR_SETTINGS + "' in config file '" + | |||
| @@ -230,25 +276,19 @@ void EnvConfigParser::ParseRdrPath(const nlohmann::json &content) { | |||
| } | |||
| rdr_path_ = path; | |||
| } | |||
// NOTE(review): an identical definition of EnvConfigParser::ParseRdrEnable appears earlier in this
// listing. This copy looks like the pre-move version; keeping both would be a redefinition — confirm
// that only one survives.
//
// Stores the JSON boolean `content` into rdr_enabled_. For any non-boolean value a warning naming
// the config file is logged and rdr_enabled_ is left unchanged.
| void EnvConfigParser::ParseRdrEnable(const nlohmann::json &content) { | |||
| if (!content.is_boolean()) { | |||
| MS_LOG(WARNING) << "Json parse failed. 'enable' in " << KEY_RDR_SETTINGS << " should be boolean." | |||
| << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context."; | |||
| return; | |||
| } | |||
| rdr_enabled_ = content; | |||
| } | |||
| #endif | |||
| void EnvConfigParser::ConfigToString() { | |||
| std::string cur_config; | |||
| #ifdef ENABLE_DUMP_IR | |||
| cur_config.append("After parsed, rdr path: "); | |||
| cur_config.append(rdr_path_); | |||
| cur_config.append(", rdr_enable: "); | |||
| cur_config.append("After parsed, "); | |||
| cur_config.append("rdr_enable: "); | |||
| std::string rdr_enable_flag = rdr_enabled_ ? "1" : "0"; | |||
| (void)cur_config.append(rdr_enable_flag); | |||
| cur_config.append(", rdr mode: "); | |||
| cur_config.append(std::to_string(rdr_mode_)); | |||
| cur_config.append(", rdr path: "); | |||
| cur_config.append(rdr_path_); | |||
| #endif | |||
| MS_LOG(INFO) << cur_config; | |||
| } | |||
| @@ -23,6 +23,8 @@ | |||
| #include "nlohmann/json.hpp" | |||
| #include "utils/ms_utils.h" | |||
| namespace mindspore { | |||
// Export modes of RDR; the numeric values are the ones accepted for the 'mode' setting.
enum RdrModes : int {
  Exceptional = 1,  // export data only in the fault scenario
  Normal = 2        // export data in the fault scenario and the normal end scenario
};
| class EnvConfigParser { | |||
| public: | |||
| static EnvConfigParser &GetInstance() { | |||
| @@ -37,6 +39,7 @@ class EnvConfigParser { | |||
| #ifdef ENABLE_DUMP_IR | |||
| bool HasRdrSetting() const { return has_rdr_setting_; } | |||
| bool RdrEnabled() const { return rdr_enabled_; } | |||
| int RdrMode() const { return rdr_mode_; } | |||
| std::string RdrPath() const { return rdr_path_; } | |||
| #endif | |||
| bool GetSysMemreuse() { return sys_memreuse_; } | |||
| @@ -52,8 +55,9 @@ class EnvConfigParser { | |||
| #ifdef ENABLE_DUMP_IR | |||
| // rdr | |||
// Each RDR member is declared exactly once (a repeated rdr_enabled_ declaration would not compile).
bool has_rdr_setting_{false};  // set to true once an RDR option has been supplied
bool rdr_enabled_{false};      // RDR is off unless enabled by configuration
int rdr_mode_{1};              // 1 is the value of RdrModes::Exceptional
std::string rdr_path_{"./"};   // default output path
| #endif | |||
| @@ -71,6 +75,7 @@ class EnvConfigParser { | |||
| void ParseRdrSetting(const nlohmann::json &content); | |||
| void ParseRdrPath(const nlohmann::json &content); | |||
| void ParseRdrEnable(const nlohmann::json &content); | |||
| void ParseRdrMode(const nlohmann::json &content); | |||
| #endif | |||
| void ParseMemReuseSetting(const nlohmann::json &content); | |||
| void ParseSysMemReuse(const nlohmann::json &content); | |||
| @@ -26,6 +26,8 @@ void RecorderManager::UpdateRdrEnable() { | |||
| } | |||
| auto &config_parser = mindspore::EnvConfigParser::GetInstance(); | |||
| rdr_enable_ = config_parser.RdrEnabled(); | |||
| rdr_mode_ = config_parser.RdrMode(); | |||
| rdr_mode_dup_ = rdr_mode_; | |||
| if (config_parser.HasRdrSetting()) { | |||
| #ifdef __linux__ | |||
| if (!rdr_enable_) { | |||
| @@ -105,7 +107,21 @@ void RecorderManager::TriggerAll() { | |||
| if (!trigger) { | |||
| MS_LOG(WARNING) << "There is no recorder to export."; | |||
| } else { | |||
| MS_LOG(INFO) << "RDR export all recorders."; | |||
| // Prevent duplicate data export by ClearResAtexit() in exceptional scenario. | |||
| rdr_mode_ = Exceptional; | |||
| MS_LOG(INFO) << "RDR exports all recorders."; | |||
| } | |||
| } | |||
| void RecorderManager::Snapshot() { | |||
| if (rdr_mode_ != Normal) { | |||
| // Restore RDR mode value from early backup. | |||
| rdr_mode_ = rdr_mode_dup_; | |||
| return; | |||
| } | |||
| RecorderManager::TriggerAll(); | |||
| if (rdr_enable_) { | |||
| MS_LOG(INFO) << "RDR exports all recorders in normal end scenario."; | |||
| } | |||
| } | |||
| @@ -23,6 +23,7 @@ | |||
| #include <memory> | |||
| #include <mutex> | |||
| #include <utility> | |||
| #include "debug/env_config_parser.h" | |||
| namespace mindspore { | |||
| // The number is the reciprocal of the golden ratio. | |||
| @@ -68,6 +69,7 @@ class RecorderManager { | |||
| bool RecordObject(const BaseRecorderPtr &recorder); | |||
| BaseRecorderPtr GetRecorder(std::string module, std::string name); | |||
| void TriggerAll(); | |||
| void Snapshot(); | |||
| void ClearAll(); | |||
| private: | |||
| @@ -75,6 +77,8 @@ class RecorderManager { | |||
| ~RecorderManager() {} | |||
| bool rdr_enable_{false}; | |||
| int rdr_mode_{Exceptional}; | |||
| int rdr_mode_dup_{Exceptional}; | |||
| bool rdr_has_record_mem_{false}; | |||
| mutable std::mutex mtx_; | |||
| @@ -118,6 +118,8 @@ bool UpdateMemAddress(const SubModuleId module, const std::string &name, const s | |||
| void TriggerAll() { mindspore::RecorderManager::Instance().TriggerAll(); } | |||
| void Snapshot() { mindspore::RecorderManager::Instance().Snapshot(); } | |||
| void ResetRecorder() { mindspore::RecorderManager::Instance().ClearAll(); } | |||
| void ClearMemAddressInfo() { | |||
| @@ -61,6 +61,7 @@ bool RecordTaskDebugInfo(SubModuleId module, const std::string &name, | |||
| const std::vector<TaskDebugInfoPtr> &task_debug_info_list); | |||
| #endif // ENABLE_D | |||
| void TriggerAll(); | |||
| void Snapshot(); | |||
| void ResetRecorder(); | |||
| void ClearMemAddressInfo(); | |||
| } // namespace RDR | |||
| @@ -1458,6 +1458,7 @@ void ClearResAtexit() { | |||
| } | |||
| #endif | |||
| #ifdef ENABLE_DUMP_IR | |||
| mindspore::RDR::Snapshot(); | |||
| mindspore::RDR::ResetRecorder(); | |||
| #endif | |||
| session::ExecutorManager::Instance().Clear(); | |||
| @@ -637,6 +637,9 @@ def set_context(**kwargs): | |||
| - enable: controls whether the RDR is enabled to collect the key data during training and | |||
| save key data in the fault scenario. When set to true, the RDR will be turned on. | |||
| When set to false, the RDR will be turned off. | |||
| - mode: sets the mode of RDR on exporting data. When set to 1, the RDR only exports data | |||
| in the fault scenario. When set to 2, the RDR exports data in the fault scenario and the | |||
| normal end scenario. Default is 1. | |||
| - path: sets the path where RDR saves data. The current path must be absolute. | |||
| Memory reuse: | |||