Merge pull request !27164 from Jimmy Qi/new-async-stat-dump (tags/v1.6.0)
| @@ -382,9 +382,9 @@ void DumpJsonParser::ParseSavedData(const nlohmann::json &content) { | |||
| << saved_data_ << ". Please set saved_data to either statistic, tensor, or full"; | |||
| } | |||
| auto context = MsContext::GetInstance(); | |||
| if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kGPUDevice) { | |||
| MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set " | |||
| "saved_data to tensor or use a GPU device"; | |||
| if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kCPUDevice) { | |||
| MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU and Ascend, please " | |||
| "set saved_data to tensor or use a GPU or Ascend device"; | |||
| } | |||
| } | |||
| @@ -19,6 +19,7 @@ | |||
| #include <unistd.h> | |||
| #include <algorithm> | |||
| #include <map> | |||
| #include <memory> | |||
| #include <set> | |||
| #include <utility> | |||
| #include <vector> | |||
| @@ -511,7 +512,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum | |||
| for (uint32_t slot = 0; slot < input_tensors.size(); slot++) { | |||
| auto in_tensor = input_tensors[slot]; | |||
| std::string in_slot_path = in_path + std::to_string(slot) + "."; | |||
| auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset); | |||
| auto succ = ConvertFormatForTensorAndDump(in_slot_path, in_tensor, data_ptr + offset, "input", slot); | |||
| if (!succ) { | |||
| MS_LOG(INFO) << "Failed to convert format for tensor " << in_slot_path; | |||
| } | |||
| @@ -524,7 +525,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum | |||
| for (uint32_t slot = 0; slot < output_tensors.size(); slot++) { | |||
| auto out_tensor = output_tensors[slot]; | |||
| std::string out_slot_path = out_path + std::to_string(slot) + "."; | |||
| auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset); | |||
| auto succ = ConvertFormatForTensorAndDump(out_slot_path, out_tensor, data_ptr + offset, "output", slot); | |||
| if (!succ) { | |||
| MS_LOG(INFO) << "Failed to convert format for tensor " << out_slot_path; | |||
| } | |||
| @@ -533,7 +534,40 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum | |||
| } | |||
| template <typename T> | |||
| bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr) { | |||
| bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io, | |||
| uint32_t slot, const ShapeVector &shape, TypeId type) { | |||
| if (!DumpJsonParser::GetInstance().IsStatisticDump()) { | |||
| return true; | |||
| } | |||
| size_t pos = dump_path.rfind("/"); | |||
| std::string file_name = dump_path.substr(pos + 1); | |||
| size_t first_dot = file_name.find("."); | |||
| size_t second_dot = file_name.find(".", first_dot + 1); | |||
| size_t third_dot = file_name.find(".", second_dot + 1); | |||
| size_t fourth_dot = file_name.find(".", third_dot + 1); | |||
| size_t fifth_dot = file_name.find(".", fourth_dot + 1); | |||
| std::string op_type = file_name.substr(0, first_dot); | |||
| std::string op_name = file_name.substr(first_dot + 1, second_dot - first_dot - 1); | |||
| std::string task_id = file_name.substr(second_dot + 1, third_dot - second_dot - 1); | |||
| std::string stream_id = file_name.substr(third_dot + 1, fourth_dot - third_dot - 1); | |||
| std::string timestamp = file_name.substr(fourth_dot + 1, fifth_dot - fourth_dot - 1); | |||
| TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, io, slot, slot); | |||
| std::shared_ptr<TensorData> data = std::make_shared<TensorData>(); | |||
| try { | |||
| data->ConvertMsToDbgType(type); | |||
| } catch (...) { | |||
| MS_LOG(ERROR) << "Data type of operator " << file_name << " is not supported by statistic dump"; | |||
| return false; | |||
| } | |||
| data->SetByteSize((size_t)tensor.size()); | |||
| data->SetShape(shape); | |||
| data->SetDataPtr(data_ptr); | |||
| return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data); | |||
| } | |||
| template <typename T> | |||
| bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr, | |||
| const std::string &io, uint32_t slot) { | |||
| // get format | |||
| auto iter_fmt = kFormatToStringMap.find(tensor.format()); | |||
| if (iter_fmt == kFormatToStringMap.end()) { | |||
| @@ -584,13 +618,21 @@ bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tens | |||
| } | |||
| } | |||
| // dump tensor data into npy file | |||
| bool dump_success = false; | |||
| bool dump_success = true; | |||
| if (trans_success) { | |||
| dump_path += host_format; | |||
| dump_success = DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type); | |||
| dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, reinterpret_cast<char *>(trans_buf.data()), io, slot, | |||
| shape_to, src_type); | |||
| if (DumpJsonParser::GetInstance().IsTensorDump()) { | |||
| dump_path += host_format; | |||
| dump_success = | |||
| DumpJsonParser::DumpToFile(dump_path, trans_buf.data(), data_size, shape_to, src_type) && dump_success; | |||
| } | |||
| } else { | |||
| dump_path += device_format; | |||
| dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type); | |||
| dump_success = DumpTensorStatsIfNeeded(dump_path, tensor, data_ptr, io, slot, shape_to, src_type); | |||
| if (DumpJsonParser::GetInstance().IsTensorDump()) { | |||
| dump_path += device_format; | |||
| dump_success = DumpJsonParser::DumpToFile(dump_path, data_ptr, data_size, shape_to, src_type) && dump_success; | |||
| } | |||
| } | |||
| return dump_success; | |||
| } | |||
| @@ -95,7 +95,8 @@ class E2eDump { | |||
| static nlohmann::json ParseOverflowInfo(char *data_ptr); | |||
| template <typename T> | |||
| static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr); | |||
| static bool ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr, | |||
| const std::string &io, uint32_t slot); | |||
| #endif | |||
| inline static unsigned int starting_graph_id = INT32_MAX; | |||
| @@ -41,7 +41,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { | |||
| } | |||
| auto file_path = Common::CreatePrefixPath(path); | |||
| if (!file_path.has_value()) { | |||
| MS_LOG(WARNING) << "CreatePrefixPath failed."; | |||
| MS_LOG(WARNING) << "CreatePrefixPath failed, skipping current statistics"; | |||
| return false; | |||
| } | |||
| // try to open file | |||
| @@ -55,7 +55,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { | |||
| file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary); | |||
| } | |||
| if (!file_.is_open()) { | |||
| MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno); | |||
| MS_LOG(WARNING) << "Open file " << file_path_value << " failed." << ErrnoToString(errno); | |||
| return false; | |||
| } | |||
| if (first_time_opening) { | |||
| @@ -63,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) { | |||
| file_.flush(); | |||
| file_path_str_ = path; | |||
| } | |||
| MS_LOG(INFO) << "Opened file: " << path; | |||
| MS_LOG(INFO) << "Opened file: " << file_path_value; | |||
| return true; | |||
| } | |||
| @@ -93,9 +93,9 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op | |||
| size_t tensor_loader_slot) | |||
| : op_type_{op_type}, | |||
| op_name_{op_name}, | |||
| task_id_{task_id}, | |||
| stream_id_{stream_id}, | |||
| timestamp_{timestamp}, | |||
| task_id_{std::to_string(task_id)}, | |||
| stream_id_{std::to_string(stream_id)}, | |||
| timestamp_{std::to_string(timestamp)}, | |||
| slot_{slot}, | |||
| tensor_loader_slot_{tensor_loader_slot} { | |||
| if (input) { | |||
| @@ -105,6 +105,22 @@ TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op | |||
| } | |||
| } | |||
| TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id, | |||
| const std::string &stream_id, const std::string ×tamp, const std::string &io, | |||
| size_t slot, size_t tensor_loader_slot) | |||
| : op_type_{op_type}, | |||
| op_name_{op_name}, | |||
| task_id_{task_id}, | |||
| stream_id_{stream_id}, | |||
| timestamp_{timestamp}, | |||
| io_{io}, | |||
| slot_{slot}, | |||
| tensor_loader_slot_{tensor_loader_slot} { | |||
| if (io_ != kInput && io_ != kOutput) { | |||
| MS_LOG(EXCEPTION) << "Cannot instantiate TensorStatDump, io needs to be either " << kInput << " or " << kOutput; | |||
| } | |||
| } | |||
| bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) { | |||
| std::string filename = dump_path + "/" + kCsvFileName; | |||
| // try to open file | |||
| @@ -125,16 +141,24 @@ bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) { | |||
| bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path, | |||
| const Debugger *debugger) { | |||
| if (!OpenStatisticsFile(dump_path)) { | |||
| return false; | |||
| } | |||
| // get tensor statistics using debugger | |||
| // get tensor data using debugger | |||
| std::string tensor_loader_name = original_kernel_name + ":" + std::to_string(tensor_loader_slot_); | |||
| std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name); | |||
| if (data == nullptr) { | |||
| MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics"; | |||
| return false; | |||
| } | |||
| return DumpTensorStatsToFile(dump_path, data); | |||
| } | |||
| bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data) { | |||
| if (data == nullptr) { | |||
| MS_LOG(WARNING) << "Tensor data is empty, skipping current statistics"; | |||
| return false; | |||
| } | |||
| if (!OpenStatisticsFile(dump_path)) { | |||
| return false; | |||
| } | |||
| const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data); | |||
| // write tensor statistics to csv file | |||
| std::ostringstream shape; | |||
| @@ -17,6 +17,7 @@ | |||
| #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ | |||
| #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ | |||
| #include <memory> | |||
| #include <string> | |||
| #include <fstream> | |||
| @@ -24,6 +25,7 @@ | |||
| namespace mindspore { | |||
| class Debugger; | |||
| class TensorData; | |||
| class CsvWriter { | |||
| public: | |||
| static CsvWriter &GetInstance() { | |||
| @@ -31,13 +33,6 @@ class CsvWriter { | |||
| return instance; | |||
| } | |||
| private: | |||
| const std::string kSeparator = ","; | |||
| const std::string kEndLine = "\n"; | |||
| std::ofstream file_; | |||
| std::string file_path_str_ = ""; | |||
| public: | |||
| CsvWriter() = default; | |||
| ~CsvWriter(); | |||
| DISABLE_COPY_AND_ASSIGN(CsvWriter) | |||
| @@ -45,28 +40,39 @@ class CsvWriter { | |||
| void CloseFile(); | |||
| template <typename T> | |||
| void WriteToCsv(const T &val, bool end_line = false); | |||
| private: | |||
| const std::string kSeparator = ","; | |||
| const std::string kEndLine = "\n"; | |||
| std::ofstream file_; | |||
| std::string file_path_str_ = ""; | |||
| }; | |||
| class TensorStatDump { | |||
| static const char CSV_HEADER[]; | |||
| static const char CSV_FILE_NAME[]; | |||
| const std::string &op_type_; | |||
| const std::string &op_name_; | |||
| uint32_t task_id_; | |||
| uint32_t stream_id_; | |||
| uint64_t timestamp_; | |||
| std::string io_; | |||
| size_t slot_; | |||
| size_t tensor_loader_slot_; | |||
| public: | |||
| static bool OpenStatisticsFile(const std::string &dump_path); | |||
| TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id, | |||
| uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_); | |||
| TensorStatDump(const std::string &op_type, const std::string &op_name, const std::string &task_id, | |||
| const std::string &stream_id, const std::string ×tamp, const std::string &io, size_t slot, | |||
| size_t tensor_loader_slot); | |||
| bool DumpTensorStatsToFile(const std::string &dump_path, std::shared_ptr<TensorData> data); | |||
| bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path, | |||
| const Debugger *debugger); | |||
| private: | |||
| static const char CSV_HEADER[]; | |||
| static const char CSV_FILE_NAME[]; | |||
| const std::string op_type_; | |||
| const std::string op_name_; | |||
| const std::string task_id_; | |||
| const std::string stream_id_; | |||
| const std::string timestamp_; | |||
| std::string io_; | |||
| size_t slot_; | |||
| size_t tensor_loader_slot_; | |||
| }; | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_ | |||
| @@ -158,11 +158,15 @@ def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data | |||
| """ | |||
| if test_key == "test_gpu_e2e_dump": | |||
| data = e2e_dump_dict | |||
| data["common_dump_settings"]["path"] = dump_path | |||
| data["common_dump_settings"]["saved_data"] = saved_data | |||
| elif test_key == "test_async_dump": | |||
| data = async_dump_dict | |||
| data["common_dump_settings"]["input_output"] = 0 | |||
| data["common_dump_settings"]["file_format"] = "npy" | |||
| else: | |||
| raise ValueError( | |||
| "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.") | |||
| data["common_dump_settings"]["path"] = dump_path | |||
| data["common_dump_settings"]["saved_data"] = saved_data | |||
| with open(json_file_name, 'w') as f: | |||
| json.dump(data, f) | |||
| @@ -409,36 +409,37 @@ def check_statistic_dump(dump_file_path): | |||
| real_path = os.path.realpath(output_path) | |||
| with open(real_path) as f: | |||
| reader = csv.DictReader(f) | |||
| input1 = next(reader) | |||
| stats = list(reader) | |||
| input1 = stats[0] | |||
| assert input1['IO'] == 'input' | |||
| assert input1['Min Value'] == '1' | |||
| assert input1['Max Value'] == '6' | |||
| input2 = next(reader) | |||
| input2 = stats[1] | |||
| assert input2['IO'] == 'input' | |||
| assert input2['Min Value'] == '7' | |||
| assert input2['Max Value'] == '12' | |||
| output = next(reader) | |||
| output = stats[2] | |||
| assert output['IO'] == 'output' | |||
| assert output['Min Value'] == '8' | |||
| assert output['Max Value'] == '18' | |||
| def check_data_dump(dump_file_path): | |||
| output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy" | |||
| output_name = "Add.Add-op*.output.0.*.npy" | |||
| output_path = glob.glob(os.path.join(dump_file_path, output_name))[0] | |||
| real_path = os.path.realpath(output_path) | |||
| output = np.load(real_path) | |||
| expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32) | |||
| assert np.array_equal(output, expect) | |||
| def run_gpu_e2e_dump(saved_data): | |||
| """Run gpu e2e dump""" | |||
| def run_saved_data_dump_test(scenario, saved_data): | |||
| """Run e2e dump on scenario, testing statistic dump""" | |||
| if sys.platform != 'linux': | |||
| return | |||
| pwd = os.getcwd() | |||
| with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir: | |||
| dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump') | |||
| dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json') | |||
| generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data) | |||
| dump_path = os.path.join(tmp_dir, 'test_saved_data') | |||
| dump_config_path = os.path.join(tmp_dir, 'test_saved_data.json') | |||
| generate_statistic_dump_json(dump_path, dump_config_path, scenario, saved_data) | |||
| os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path | |||
| dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0') | |||
| if os.path.isdir(dump_path): | |||
| @@ -473,7 +474,7 @@ def test_gpu_e2e_statistic_dump(): | |||
| Expectation: Statistics are stored in statistic.csv files | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| run_gpu_e2e_dump('statistic') | |||
| run_saved_data_dump_test('test_gpu_e2e_dump', 'statistic') | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @@ -486,7 +487,7 @@ def test_gpu_e2e_tensor_dump(): | |||
| Expectation: Tensor data are stored in npy files | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| run_gpu_e2e_dump('tensor') | |||
| run_saved_data_dump_test('test_gpu_e2e_dump', 'tensor') | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @@ -499,4 +500,46 @@ def test_gpu_e2e_full_dump(): | |||
| Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| run_gpu_e2e_dump('full') | |||
| run_saved_data_dump_test('test_gpu_e2e_dump', 'full') | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_arm_ascend_training | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.env_onecard | |||
| @security_off_wrap | |||
| def test_ascend_statistic_dump(): | |||
| """ | |||
| Feature: Ascend Statistics Dump | |||
| Description: Test Ascend statistics dump | |||
| Expectation: Statistics are stored in statistic.csv files | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") | |||
| run_saved_data_dump_test('test_async_dump', 'statistic') | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_arm_ascend_training | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.env_onecard | |||
| @security_off_wrap | |||
| def test_ascend_tensor_dump(): | |||
| """ | |||
| Feature: Ascend Tensor Dump | |||
| Description: Test Ascend tensor dump | |||
| Expectation: Tensors are stored in npy files | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") | |||
| run_saved_data_dump_test('test_async_dump', 'tensor') | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_arm_ascend_training | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.env_onecard | |||
| @security_off_wrap | |||
| def test_ascend_full_dump(): | |||
| """ | |||
| Feature: Ascend Full Dump | |||
| Description: Test Ascend full dump | |||
| Expectation: Tensors are stored in npy files and their statistics stored in statistic.csv | |||
| """ | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="Ascend") | |||
| run_saved_data_dump_test('test_async_dump', 'full') | |||