Browse Source

RDR supports exporting data in normal end scenario

tags/v1.6.0
liangyongxiong 4 years ago
parent
commit
94aee10d06
8 changed files with 87 additions and 15 deletions
  1. +53
    -13
      mindspore/ccsrc/debug/env_config_parser.cc
  2. +6
    -1
      mindspore/ccsrc/debug/env_config_parser.h
  3. +17
    -1
      mindspore/ccsrc/debug/rdr/recorder_manager.cc
  4. +4
    -0
      mindspore/ccsrc/debug/rdr/recorder_manager.h
  5. +2
    -0
      mindspore/ccsrc/debug/rdr/running_data_recorder.cc
  6. +1
    -0
      mindspore/ccsrc/debug/rdr/running_data_recorder.h
  7. +1
    -0
      mindspore/ccsrc/pipeline/jit/pipeline.cc
  8. +3
    -0
      mindspore/context.py

+ 53
- 13
mindspore/ccsrc/debug/env_config_parser.cc View File

@@ -25,10 +25,12 @@
namespace {
#ifdef ENABLE_DUMP_IR
// Names of the environment variables that are read to configure RDR.
constexpr auto ENV_RDR_ENABLE = "MS_RDR_ENABLE";
constexpr auto ENV_RDR_MODE = "MS_RDR_MODE";
constexpr auto ENV_RDR_PATH = "MS_RDR_PATH";
// Name of the RDR section in the JSON config file, followed by the keys looked up inside it.
constexpr auto KEY_RDR_SETTINGS = "rdr";
constexpr auto KEY_ENABLE = "enable";
constexpr auto KEY_MODE = "mode";
// KEY_PATH must be defined exactly once; a second definition in this scope is a redefinition error.
constexpr auto KEY_PATH = "path";
#endif
constexpr auto KEY_MEM_REUSE_SETTINGS = "sys";
constexpr auto KEY_MEM_REUSE = "mem_reuse";
@@ -52,6 +54,22 @@ std::optional<bool> GetRdrEnableFromEnv() {
return std::nullopt;
}

// Read the RDR export mode from the environment variable named by ENV_RDR_MODE.
//
// Returns:
//   std::nullopt  when common::GetEnv yields an empty string,
//   Normal        when the value is exactly "2",
//   Exceptional   for every other value; a warning is logged unless the value is "1".
std::optional<int> GetRdrModeFromEnv() {
  // get environment variable to configure RDR
  const std::string env_mode_str = common::GetEnv(ENV_RDR_MODE);
  if (env_mode_str.empty()) {
    return std::nullopt;
  }
  // The accepted values are the digits "1" and "2", which have no upper/lower case, so the value
  // is compared as-is. Calling ::tolower on plain (possibly negative) char values is undefined.
  if (env_mode_str == "2") {
    return Normal;
  }
  if (env_mode_str != "1") {
    MS_LOG(WARNING) << "The environment variable '" << ENV_RDR_MODE << "' should be 1 or 2.";
  }
  return Exceptional;
}

std::optional<std::string> GetRdrPathFromEnv() {
// get environment variable to configure RDR
std::string path = common::GetEnv(ENV_RDR_PATH);
@@ -102,6 +120,11 @@ void EnvConfigParser::ParseFromEnv() {
has_rdr_setting_ = true;
rdr_enabled_ = rdr_enable_env.value();
}
auto rdr_mode_env = GetRdrModeFromEnv();
if (rdr_mode_env.has_value()) {
has_rdr_setting_ = true;
rdr_mode_ = rdr_mode_env.value();
}
auto path_env = GetRdrPathFromEnv();
if (path_env.has_value()) {
has_rdr_setting_ = true;
@@ -204,12 +227,35 @@ void EnvConfigParser::ParseRdrSetting(const nlohmann::json &content) {
ParseRdrEnable(**rdr_enable);
}

auto rdr_mode = CheckJsonKeyExist(*rdr_setting, KEY_RDR_SETTINGS, KEY_MODE);
if (rdr_mode.has_value()) {
ParseRdrMode(**rdr_mode);
}

auto rdr_path = CheckJsonKeyExist(*rdr_setting, KEY_RDR_SETTINGS, KEY_PATH);
if (rdr_path.has_value()) {
ParseRdrPath(**rdr_path);
}
}

// Take the 'enable' value of the RDR settings from the JSON config.
// A boolean value is stored in rdr_enabled_; any other JSON type is reported with a warning and
// rdr_enabled_ is left untouched.
void EnvConfigParser::ParseRdrEnable(const nlohmann::json &content) {
  if (content.is_boolean()) {
    rdr_enabled_ = content;
    return;
  }
  MS_LOG(WARNING) << "Json parse failed. 'enable' in " << KEY_RDR_SETTINGS << " should be boolean."
                  << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context.";
}

// Take the 'mode' value of the RDR settings from the JSON config.
// The value is stored in rdr_mode_ when it compares equal to Exceptional or Normal; otherwise a
// warning is logged and rdr_mode_ is left untouched.
void EnvConfigParser::ParseRdrMode(const nlohmann::json &content) {
  const bool unsupported_mode = (content != Exceptional) && (content != Normal);
  if (!unsupported_mode) {
    rdr_mode_ = content;
    return;
  }
  MS_LOG(WARNING) << "Json parse failed. 'mode' in " << KEY_RDR_SETTINGS << " should be 1 or 2."
                  << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context.";
}

void EnvConfigParser::ParseRdrPath(const nlohmann::json &content) {
std::string err_msg = "RDR path parse failed. The RDR path will be a default value: '" + rdr_path_ +
"'. Please check the settings about '" + KEY_RDR_SETTINGS + "' in config file '" +
@@ -230,25 +276,19 @@ void EnvConfigParser::ParseRdrPath(const nlohmann::json &content) {
}
rdr_path_ = path;
}

// Store the JSON 'enable' flag of the RDR settings in rdr_enabled_, warning on a non-boolean value.
// NOTE(review): an identical definition of this function appears earlier in this view (before
// ParseRdrMode); this copy looks like the pre-move location of the same function - confirm that
// only one definition remains in the real file.
void EnvConfigParser::ParseRdrEnable(const nlohmann::json &content) {
  const bool is_flag = content.is_boolean();
  if (is_flag) {
    rdr_enabled_ = content;
  } else {
    MS_LOG(WARNING) << "Json parse failed. 'enable' in " << KEY_RDR_SETTINGS << " should be boolean."
                    << " Please check the config file '" << config_file_ << "' set by 'env_config_path' in context.";
  }
}
#endif

// Build a one-line summary of the parsed configuration and log it at INFO level.
// The RDR fields are part of the summary only when ENABLE_DUMP_IR is defined; without it the
// logged string is empty.
void EnvConfigParser::ConfigToString() {
  std::string cur_config;
#ifdef ENABLE_DUMP_IR
  // Each field is written exactly once, in the order enable, mode, path. Emitting the
  // "After parsed, " prefix and the path a second time would garble the log line.
  cur_config.append("After parsed, ");
  cur_config.append("rdr_enable: ");
  cur_config.append(rdr_enabled_ ? "1" : "0");
  cur_config.append(", rdr mode: ");
  cur_config.append(std::to_string(rdr_mode_));
  cur_config.append(", rdr path: ");
  cur_config.append(rdr_path_);
#endif
  MS_LOG(INFO) << cur_config;
}


+ 6
- 1
mindspore/ccsrc/debug/env_config_parser.h View File

@@ -23,6 +23,8 @@
#include "nlohmann/json.hpp"
#include "utils/ms_utils.h"
namespace mindspore {
// Export modes of the RDR. The numeric values are the ones accepted for the 'mode' setting.
enum RdrModes : int {
  Exceptional = 1,  // export data only in the fault scenario
  Normal = 2        // export data in the fault scenario and in the normal end scenario
};

class EnvConfigParser {
public:
static EnvConfigParser &GetInstance() {
@@ -37,6 +39,7 @@ class EnvConfigParser {
#ifdef ENABLE_DUMP_IR
// Returns has_rdr_setting_, which ParseFromEnv sets when an RDR environment variable supplies a value.
bool HasRdrSetting() const { return has_rdr_setting_; }
// Returns the parsed RDR enable flag.
bool RdrEnabled() const { return rdr_enabled_; }
// Returns the parsed RDR mode as an int (see RdrModes: Exceptional = 1, Normal = 2).
int RdrMode() const { return rdr_mode_; }
// Returns a copy of the parsed RDR output path.
std::string RdrPath() const { return rdr_path_; }
#endif
bool GetSysMemreuse() { return sys_memreuse_; }
@@ -52,8 +55,9 @@ class EnvConfigParser {

#ifdef ENABLE_DUMP_IR
// rdr
// Each member is declared exactly once; declaring rdr_enabled_ twice in the class is ill-formed.
bool has_rdr_setting_{false};  // set by ParseFromEnv when an RDR environment variable supplies a value
bool rdr_enabled_{false};
int rdr_mode_{1};  // 1 is RdrModes::Exceptional
std::string rdr_path_{"./"};
#endif

@@ -71,6 +75,7 @@ class EnvConfigParser {
// Handles the RDR section of the JSON config and forwards the 'enable', 'mode' and 'path'
// entries to the per-key parsers declared below.
void ParseRdrSetting(const nlohmann::json &content);
// Parses the RDR 'path' entry; on success the result is stored in rdr_path_.
void ParseRdrPath(const nlohmann::json &content);
// Parses the RDR 'enable' entry; a non-boolean value is rejected with a warning.
void ParseRdrEnable(const nlohmann::json &content);
// Parses the RDR 'mode' entry; values other than Exceptional (1) or Normal (2) are rejected with a warning.
void ParseRdrMode(const nlohmann::json &content);
#endif
void ParseMemReuseSetting(const nlohmann::json &content);
void ParseSysMemReuse(const nlohmann::json &content);


+ 17
- 1
mindspore/ccsrc/debug/rdr/recorder_manager.cc View File

@@ -26,6 +26,8 @@ void RecorderManager::UpdateRdrEnable() {
}
auto &config_parser = mindspore::EnvConfigParser::GetInstance();
rdr_enable_ = config_parser.RdrEnabled();
rdr_mode_ = config_parser.RdrMode();
rdr_mode_dup_ = rdr_mode_;
if (config_parser.HasRdrSetting()) {
#ifdef __linux__
if (!rdr_enable_) {
@@ -105,7 +107,21 @@ void RecorderManager::TriggerAll() {
if (!trigger) {
MS_LOG(WARNING) << "There is no recorder to export.";
} else {
MS_LOG(INFO) << "RDR export all recorders.";
// Prevent duplicate data export by ClearResAtexit() in exceptional scenario.
rdr_mode_ = Exceptional;
MS_LOG(INFO) << "RDR exports all recorders.";
}
}

// Export all recorders at normal end, if the configured mode asks for it.
//
// When rdr_mode_ is Normal, TriggerAll() is run and, if RDR is enabled, an INFO line is logged.
// When rdr_mode_ is anything else (for example because an earlier TriggerAll() forced it to
// Exceptional to avoid a duplicate export), nothing is exported.
//
// In both cases rdr_mode_ is restored from rdr_mode_dup_ before returning. TriggerAll() sets
// rdr_mode_ to Exceptional after a successful export, so without the restore on the export path
// the next Snapshot() would silently skip the export even though the configured mode is Normal.
void RecorderManager::Snapshot() {
  const bool export_at_normal_end = (rdr_mode_ == Normal);
  if (export_at_normal_end) {
    RecorderManager::TriggerAll();
    if (rdr_enable_) {
      MS_LOG(INFO) << "RDR exports all recorders in normal end scenario.";
    }
  }
  // Restore RDR mode value from early backup.
  rdr_mode_ = rdr_mode_dup_;
}



+ 4
- 0
mindspore/ccsrc/debug/rdr/recorder_manager.h View File

@@ -23,6 +23,7 @@
#include <memory>
#include <mutex>
#include <utility>
#include "debug/env_config_parser.h"

namespace mindspore {
// The number is the reciprocal of the golden ratio.
@@ -68,6 +69,7 @@ class RecorderManager {
bool RecordObject(const BaseRecorderPtr &recorder);
BaseRecorderPtr GetRecorder(std::string module, std::string name);
// Exports all recorders; logs a warning when there is no recorder to export.
void TriggerAll();
// Normal-end hook: exports all recorders only when the RDR mode is Normal.
void Snapshot();
void ClearAll();

private:
@@ -75,6 +77,8 @@ class RecorderManager {
~RecorderManager() {}

// Copy of EnvConfigParser::RdrEnabled(), refreshed in UpdateRdrEnable().
bool rdr_enable_{false};
// Current export mode (RdrModes value); TriggerAll() may force it to Exceptional after an export.
int rdr_mode_{Exceptional};
// Backup of the configured mode taken in UpdateRdrEnable(); Snapshot() restores rdr_mode_ from it.
int rdr_mode_dup_{Exceptional};
bool rdr_has_record_mem_{false};

mutable std::mutex mtx_;


+ 2
- 0
mindspore/ccsrc/debug/rdr/running_data_recorder.cc View File

@@ -118,6 +118,8 @@ bool UpdateMemAddress(const SubModuleId module, const std::string &name, const s

// Forward to RecorderManager::TriggerAll() on the singleton manager.
void TriggerAll() {
  auto &manager = mindspore::RecorderManager::Instance();
  manager.TriggerAll();
}

// Forward to RecorderManager::Snapshot() on the singleton manager.
void Snapshot() {
  auto &manager = mindspore::RecorderManager::Instance();
  manager.Snapshot();
}

// Forward to RecorderManager::ClearAll() on the singleton manager.
void ResetRecorder() {
  auto &manager = mindspore::RecorderManager::Instance();
  manager.ClearAll();
}

void ClearMemAddressInfo() {


+ 1
- 0
mindspore/ccsrc/debug/rdr/running_data_recorder.h View File

@@ -61,6 +61,7 @@ bool RecordTaskDebugInfo(SubModuleId module, const std::string &name,
const std::vector<TaskDebugInfoPtr> &task_debug_info_list);
#endif // ENABLE_D
// Calls RecorderManager::TriggerAll() on the singleton manager.
void TriggerAll();
// Calls RecorderManager::Snapshot() on the singleton manager; invoked from ClearResAtexit().
void Snapshot();
// Calls RecorderManager::ClearAll() on the singleton manager.
void ResetRecorder();
void ClearMemAddressInfo();
} // namespace RDR


+ 1
- 0
mindspore/ccsrc/pipeline/jit/pipeline.cc View File

@@ -1458,6 +1458,7 @@ void ClearResAtexit() {
}
#endif
#ifdef ENABLE_DUMP_IR
mindspore::RDR::Snapshot();
mindspore::RDR::ResetRecorder();
#endif
session::ExecutorManager::Instance().Clear();


+ 3
- 0
mindspore/context.py View File

@@ -637,6 +637,9 @@ def set_context(**kwargs):
- enable: controls whether the RDR is enabled to collect the key data during training and
save key data in the fault scenario. When set to true, the RDR will be turned on.
When set to false, the RDR will be turned off.
- mode: sets the mode of RDR on exporting data. When set to 1, the RDR only exports data
in the fault scenario. When set to 2, the RDR exports data in the fault scenario and the
normal end scenario. Default is 1.
- path: sets the path where RDR saves data. The current path must be absolute.

Memory reuse:


Loading…
Cancel
Save