You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

e2e_dump.h 5.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. /**
  2. * Copyright 2020-2022 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
  17. #define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_
  18. #include <dirent.h>
  19. #include <map>
  20. #include <memory>
  21. #include <string>
  22. #include <vector>
  23. #include "backend/common/session/kernel_graph.h"
  24. #include "runtime/device/device_address.h"
  25. #include "debug/data_dump/dump_json_parser.h"
  26. #include "debug/data_dump/dump_utils.h"
  27. #ifdef ENABLE_D
  28. #include "proto/dump_data.pb.h"
  29. #endif
  30. #include "include/backend/visible.h"
  31. using mindspore::kernel::KernelLaunchInfo;
  32. #ifndef ENABLE_DEBUGGER
  33. class Debugger;
  34. #endif
  35. namespace mindspore {
  36. struct dump_data_t {
  37. std::string dump_file_path;
  38. char *data_ptr;
  39. mindspore::TypeId data_type;
  40. std::string format;
  41. ShapeVector device_shape;
  42. ShapeVector host_shape;
  43. size_t data_size;
  44. int32_t sub_format;
  45. std::string in_out_str;
  46. uint32_t slot;
  47. std::shared_ptr<tensor::Tensor> trans_buf{nullptr};
  48. };
  49. class E2eDump {
  50. public:
  51. E2eDump() = default;
  52. ~E2eDump() = default;
  53. static void UpdateIterMindRTDump();
  54. static void UpdateIterOldRTDump(const session::KernelGraph *graph);
  55. static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
  56. static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
  57. static void DumpConstantData(const session::KernelGraph *graph, const std::string &cst_dump_path,
  58. const Debugger *debugger = nullptr);
  59. static void DumpConstantData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
  60. static void DumpParametersData(uint32_t rank_id, const Debugger *debugger);
  61. static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id,
  62. const Debugger *debugger = nullptr, const KernelLaunchInfo *launch_info = nullptr);
  63. // Dump data when task error.
  64. static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
  65. std::string *kernel_name, const Debugger *debugger,
  66. const KernelLaunchInfo *launch_info = nullptr);
  67. static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
  68. std::string *kernel_name, const Debugger *debugger);
  69. // Dump input/output data without additional check, used for exception case only
  70. static void DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
  71. std::string *kernel_name);
  72. static void DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
  73. std::string *kernel_name);
  74. #ifdef ENABLE_D
  75. static void DumpTensorToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data, char *data_ptr);
  76. static void DumpOpDebugToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
  77. char *data_ptr);
  78. #endif
  79. private:
  80. static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
  81. static void DumpOutputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger);
  82. static void DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
  83. static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger,
  84. const KernelLaunchInfo *launch_info = nullptr);
  85. static void DumpParameters(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
  86. static void DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag,
  87. const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot,
  88. const ShapeVector &int_shapes, const TypeId &host_type);
  89. static bool IsDeviceTargetGPU();
  90. static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
  91. bool trans_flag, const Debugger *debugger);
  92. static void DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::string &dump_path, bool trans_flag,
  93. const Debugger *debugger);
  94. #ifdef ENABLE_D
  95. static nlohmann::json ParseOverflowInfo(char *data_ptr);
  96. static bool ConvertFormatForOneTensor(dump_data_t *dump_tensor_info);
  97. static void ConvertFormatForTensors(std::vector<dump_data_t> *dump_tensor_vec, uint32_t start_idx, uint32_t end_idx);
  98. static bool DumpTensorStatsIfNeeded(const dump_data_t &dump_tensor_info);
  99. static bool DumpTensorDataIfNeeded(const dump_data_t &dump_tensor_info);
  100. #endif
  101. BACKEND_EXPORT inline static unsigned int starting_graph_id = INT32_MAX;
  102. };
  103. } // namespace mindspore
  104. #endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_E_2_E_DUMP_H_