You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

dump_utils.cc 7.1 kB

4 years ago
4 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184
  1. /**
  2. * Copyright 2021-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "debug/data_dump/dump_utils.h"
  17. #include <map>
  18. #include <vector>
  19. #include <algorithm>
  20. #include "runtime/device/ms_device_shape_transfer.h"
  21. #include "utils/ms_context.h"
  22. #include "debug/anf_ir_utils.h"
  23. #include "debug/data_dump/dump_json_parser.h"
  24. #include "backend/common/session/anf_runtime_algorithm.h"
  25. #include "include/common/utils/anfalgo.h"
  26. #include "runtime/device/kernel_runtime_manager.h"
  27. #include "include/common/utils/utils.h"
  28. #include "debug/common.h"
  29. #include "runtime/graph_scheduler/device_tensor_store.h"
  30. using mindspore::runtime::DeviceTensorStore;
  31. namespace mindspore {
  32. uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
  33. auto context = MsContext::GetInstance();
  34. MS_EXCEPTION_IF_NULL(context);
  35. auto device_target = context->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  36. auto kernel_runtime = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(device_target, device_id);
  37. MS_EXCEPTION_IF_NULL(kernel_runtime);
  38. return kernel_runtime->device_id();
  39. }
  40. /*
  41. * Feature group: Dump.
  42. * Target device group: Ascend, GPU and CPU.
  43. * Runtime category: Old runtime, MindRT.
  44. * Description: Generate dir path to dump data. It will be in these formats:
  45. * 1) tensor/statistic: /dump_path/rank_{rank_id}/{net_name}/{graph_id}/{iter_num}.
  46. * 2) constant data: /dump_path/rank_{rank_id}/{net_name}/{graph_id}/constants/.
  47. */
  48. std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id, bool is_cst) {
  49. auto &dump_json_parser = DumpJsonParser::GetInstance();
  50. std::string net_name = dump_json_parser.net_name();
  51. std::string iterator = std::to_string(dump_json_parser.cur_dump_iter());
  52. std::string dump_path = dump_json_parser.path();
  53. if (dump_path.back() != '/') {
  54. dump_path += "/";
  55. }
  56. if (is_cst) {
  57. dump_path += ("rank_" + std::to_string(rank_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/constants/");
  58. } else {
  59. dump_path +=
  60. ("rank_" + std::to_string(rank_id) + "/" + net_name + "/" + std::to_string(graph_id) + "/" + iterator + "/");
  61. }
  62. return dump_path;
  63. }
  64. void GetFileKernelName(NotNull<std::string *> kernel_name) {
  65. const std::string strsrc = "/";
  66. const std::string strdst = "--";
  67. std::string::size_type pos = 0;
  68. std::string::size_type srclen = strsrc.size();
  69. std::string::size_type dstlen = strdst.size();
  70. while ((pos = kernel_name->find(strsrc, pos)) != std::string::npos) {
  71. kernel_name->replace(pos, srclen, strdst);
  72. pos += dstlen;
  73. }
  74. }
  75. /*
  76. * Feature group: Dump.
  77. * Target device group: Ascend, GPU and CPU.
  78. * Runtime category: Old runtime, MindRT.
  79. * Description: Get the actual tensor shape for dumping based on trans_flag option in configuration json file.
  80. */
  81. void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *> int_shapes, bool trans_flag) {
  82. if (trans_flag) {
  83. *int_shapes = trans::GetRuntimePaddingShape(node, index);
  84. } else {
  85. auto shape = AnfAlgo::GetOutputDeviceShape(node, index);
  86. (void)std::transform(shape.begin(), shape.end(), std::back_inserter(*int_shapes),
  87. [](size_t inner_item) { return SizeToInt(inner_item); });
  88. }
  89. }
  90. const DeviceTensorPtr GetParameterInfo(const AnfNodePtr &node, NotNull<ShapeVector *> int_shapes,
  91. NotNull<TypeId *> host_type, NotNull<TypeId *> device_type) {
  92. const auto &device_tensors = DeviceTensorStore::GetInstance().Fetch(node.get());
  93. if (device_tensors.size() < 1) {
  94. return nullptr;
  95. }
  96. auto device_addr = device_tensors[0];
  97. MS_EXCEPTION_IF_NULL(device_addr);
  98. auto &dump_json_parser = DumpJsonParser::GetInstance();
  99. bool trans_flag = dump_json_parser.trans_flag();
  100. auto ref_node = device_addr->GetNodeIndex().first;
  101. MS_EXCEPTION_IF_NULL(ref_node);
  102. GetDumpIntShape(ref_node, PARAMETER_OUTPUT_INDEX, int_shapes, trans_flag);
  103. *host_type = common::AnfAlgo::GetOutputInferDataType(ref_node, PARAMETER_OUTPUT_INDEX);
  104. *device_type = AnfAlgo::GetOutputDeviceDataType(ref_node, PARAMETER_OUTPUT_INDEX);
  105. return device_addr;
  106. }
  107. /*
  108. * Feature group: Dump.
  109. * Target device group: Ascend, CPU.
  110. * Runtime category: Old runtime, MindRT.
  111. * Description: Dump the data in memory into file path.
  112. */
  113. void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
  114. const TypeId &type, bool trans_flag) {
  115. auto format = kOpFormat_DEFAULT;
  116. auto ret = addr.DumpMemToFile(file_path, format, int_shapes, type, trans_flag);
  117. if (!ret) {
  118. MS_LOG(ERROR) << "DumpMemToFile Failed: flag:" << trans_flag << ", path:" << file_path << ", host_format:" << format
  119. << ".!";
  120. }
  121. }
  /*
   * Description: Return the current std::chrono::system_clock time as the number of microseconds since the
   * clock's epoch.
   */
  uint64_t GetTimeStamp() {
    using std::chrono::duration_cast;
    using std::chrono::microseconds;
    const auto since_epoch = std::chrono::system_clock::now().time_since_epoch();
    return static_cast<uint64_t>(duration_cast<microseconds>(since_epoch).count());
  }
  /*
   * Feature group: Dump.
   * Target device group: Ascend, GPU, CPU.
   * Runtime category: Old runtime, MindRT.
   * Description: Strip the scope from an operator name by returning everything after the last occurrence of
   * separator. The default separator is "--". An empty string is returned when separator does not occur in
   * fullname_with_scope.
   */
  std::string GetOpNameWithoutScope(const std::string &fullname_with_scope, const std::string &separator) {
    const std::size_t last_sep = fullname_with_scope.rfind(separator);
    if (last_sep == std::string::npos) {
      return std::string();
    }
    return fullname_with_scope.substr(last_sep + separator.length());
  }
  141. /*
  142. * Feature group: Dump.
  143. * Target device group: Ascend, GPU, CPU.
  144. * Runtime category: Old runtime, MindRT.
  145. * Description: Dump string content into file path. Current purpose is to save operator overflow information in json
  146. * file in ascend a+m dump mode.
  147. */
  148. void DumpToFile(const std::string &file_name, const std::string &dump_str) {
  149. if (dump_str.empty()) {
  150. MS_LOG(ERROR) << "Failed to dump empty tensor data.";
  151. return;
  152. }
  153. auto real_path = Common::CreatePrefixPath(file_name);
  154. if (!real_path.has_value()) {
  155. MS_LOG(ERROR) << "CreatePrefixPath failed.";
  156. return;
  157. }
  158. std::string real_path_str = real_path.value();
  159. ChangeFileMode(real_path_str, S_IWUSR);
  160. std::ofstream file(real_path_str, std::ofstream::out | std::ofstream::trunc);
  161. if (!file.is_open()) {
  162. MS_LOG(EXCEPTION) << "Open file " << real_path_str << "failed: " << ErrnoToString(errno);
  163. }
  164. file << dump_str;
  165. if (file.bad()) {
  166. file.close();
  167. MS_LOG(EXCEPTION) << "Dump string to file " << real_path_str << " failed: " << ErrnoToString(errno);
  168. }
  169. file.close();
  170. ChangeFileMode(real_path_str, S_IRUSR);
  171. }
  172. } // namespace mindspore