You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu_profiling.h 5.7 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_GPU_PROFILING_H
  17. #define MINDSPORE_GPU_PROFILING_H
  18. #include <cuda.h>
  19. #include <cupti.h>
  20. #include <algorithm>
  21. #include <cstdio>
  22. #include <map>
  23. #include <memory>
  24. #include <mutex>
  25. #include <string>
  26. #include <unordered_map>
  27. #include <utility>
  28. #include <vector>
  29. #include "profiler/device/gpu/gpu_profiling_utils.h"
  30. namespace mindspore {
  31. namespace profiler {
  32. namespace gpu {
  // Tags which CUPTI interface an Event record came from.
  enum class CUPTIApiType {
    kCallback = 0,  // recorded through the CUPTI callback path
    kActivity = 1   // recorded through the CUPTI activity path
  };
  // Category of a profiled GPU activity.
  // NOTE(review): the memcpy names look like they follow CUPTI's memcpy-kind
  // naming (H = host, D = device, A = device array, P2P = peer to peer);
  // confirm the exact mapping in the implementation file.
  // The numeric values are spelled out explicitly; keep them stable.
  enum class ActivityType {
    kKernel = 0,
    kMemcpyH2D = 1,
    kMemcpyD2H = 2,
    kMemcpyH2A = 3,
    kMemcpyA2H = 4,
    kMemcpyA2D = 5,
    kMemcpyD2A = 6,
    kMemcpyD2D = 7,
    kMemcpyP2P = 8,
    kMemcpyH2H = 9,
    kMemset = 10,
    kMemcpyUnknown = 11
  };
  // Payload describing a memory-copy event; stored in Event's anonymous union.
  struct MemcpyInfo {
    // Size of the transfer in bytes.
    size_t bytes;
    // Source / destination memory kind codes.
    // NOTE(review): presumably CUPTI memory-kind values — confirm at the call site.
    unsigned char src_kind;
    unsigned char dst_kind;
  };
  // Payload describing a kernel launch; stored in Event's anonymous union.
  // Field order is part of the layout and of aggregate initialisation; do not reorder.
  struct KernelInfo {
    // Per-launch resource usage.
    uint64_t registers_per_thread;
    uint64_t static_shared_memory;
    uint64_t dynamic_shared_memory;

    // Thread-block dimensions.
    uint64_t block_x;
    uint64_t block_y;
    uint64_t block_z;

    // Grid dimensions.
    uint64_t grid_x;
    uint64_t grid_y;
    uint64_t grid_z;
  };
  64. struct Event {
  65. std::string kernel_name;
  66. std::string kernel_type;
  67. CUPTIApiType api_type;
  68. ActivityType activity_type;
  69. uint64_t start_time_stamp;
  70. uint64_t end_time_stamp;
  71. std::string op_name;
  72. uint32_t device_id;
  73. uint32_t correlation_id;
  74. uint32_t thread_id;
  75. uint32_t context_id;
  76. uint32_t stream_id;
  77. CUpti_CallbackId cb_id;
  78. union {
  79. MemcpyInfo memcpy_info;
  80. KernelInfo kernel_info;
  81. };
  82. };
  // A (start time stamp, duration) pair for one execution of an operator.
  // NOTE(review): units are not stated in this header — confirm against the writer.
  struct StartDuration {
    uint64_t start_timestamp = 0;
    float duration = 0.0f;
  };

  // Aggregated statistics for one operator.
  // Every field has a default initialiser, so a default-constructed OpInfo is
  // fully defined and can be accumulated into directly.
  struct OpInfo {
    std::string op_name;

    // Accumulated times. NOTE(review): units are not stated here — confirm.
    float cupti_api_call_time = 0.0f;
    float cupti_activity_time = 0.0f;
    float op_host_cost_time = 0.0f;

    // Counters.
    int op_kernel_api_count = 0;
    int op_kernel_count = 0;
    int op_count = 0;

    // One entry per recorded execution.
    std::vector<StartDuration> start_duration;

    // Opaque stream handle associated with the operator. Initialised to
    // nullptr so that a default-constructed OpInfo never carries an
    // indeterminate pointer.
    void *stream = nullptr;
  };
  // Reference time stamps for the host and the GPU, both in nanoseconds.
  // Both default to zero.
  struct BaseTime {
    uint64_t host_start_time = 0;
    uint64_t gpu_start_time = 0;
  };
  // Scale factor of 1000 between adjacent time units.
  // NOTE(review): presumably used to convert ns <-> us or us <-> ms — confirm at the use sites.
  const float kTimeUnit = 1000.0f;
  // Abstract interface for a profiling component that can be registered with
  // the profiler. Concrete subclasses implement Init() and SaveProfilingData()
  // and are expected to set op_name_.
  class ProfilingOp {
   public:
    ProfilingOp() = default;
    virtual ~ProfilingOp() = default;

    // Persist whatever data the component has collected.
    virtual void SaveProfilingData() = 0;

    // Prepare the component for use.
    virtual void Init() = 0;

    // Returns a copy of the component's name.
    std::string Name() const {
      return op_name_;
    }

   protected:
    // Name reported by Name(); assigned by subclasses.
    std::string op_name_;
  };
  114. class GPUProfiler {
  115. public:
  116. static std::shared_ptr<GPUProfiler> GetInstance();
  117. ~GPUProfiler() { StopCUPTI(); }
  118. GPUProfiler(const GPUProfiler &) = delete;
  119. GPUProfiler &operator=(const GPUProfiler &) = delete;
  120. void Init(const std::string &profileDataPath);
  121. void Stop();
  122. void StopCUPTI();
  123. void StepProfilingEnable(const bool enable_flag);
  124. void SyncEnable(const bool enable_flag);
  125. bool GetEnableFlag() const { return enable_flag_; }
  126. bool GetSyncEnableFlag() const { return sync_enable_flag_; }
  127. void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
  128. uint64_t startTimestamp, uint64_t endTimestamp);
  129. void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  130. void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  131. void OpDataProducerBegin(const std::string op_name, void *stream);
  132. void OpDataProducerEnd();
  133. void ProcessEvents();
  134. void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
  135. void SetStepTraceOpName(ProfilingTraceInfo trace_op_name);
  136. std::string ProfileDataPath() const { return profile_data_path_; }
  137. private:
  138. GPUProfiler() = default;
  139. void OpsParser();
  140. void EventLog(const Event &event);
  141. void ClearInst();
  142. void HandleActivityRecord(CUpti_Activity *record);
  143. void AddEvent(Event &&event);
  144. void SetRunTimeData(const std::string &op_name, void *stream);
  145. void SetRunTimeData(const std::string &op_name, const float time_elapsed);
  146. void SetRunTimeData(const std::string &op_name, const uint64_t start, const float duration);
  147. void FixOpNameByCorrelationId(Event *event);
  148. static std::shared_ptr<GPUProfiler> profiler_inst_;
  149. bool enable_flag_ = false;
  150. bool sync_enable_flag_ = true;
  151. std::unordered_map<std::string, OpInfo> op_info_map_;
  152. std::unordered_map<uint32_t, std::string> op_name_map_;
  153. std::vector<Event> events_;
  154. BaseTime base_time_;
  155. std::string op_name_;
  156. void *stream_;
  157. void SaveProfileData();
  158. void SaveExtraProfileData();
  159. std::mutex event_mutex_;
  160. std::vector<CUpti_ActivityKind> activities_enable_;
  161. uint64_t cupti_callback_events_count_ = 0l;
  162. uint64_t cupti_callback_events_drop_count_ = 0l;
  163. uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  164. uint64_t cupti_activity_events_count_ = 0l;
  165. uint64_t cupti_activity_events_drop_count_ = 0l;
  166. uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;
  167. CUpti_SubscriberHandle subscriber_ = nullptr;
  168. cudaEvent_t op_event_start_;
  169. cudaEvent_t op_event_stop_;
  170. uint64_t op_host_time_start_;
  171. uint64_t op_host_time_stop_;
  172. uint64_t op_cupti_time_start_;
  173. std::string profile_data_path_;
  174. std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
  175. ProfilingTraceInfo step_trace_op_name;
  176. };
  177. } // namespace gpu
  178. } // namespace profiler
  179. } // namespace mindspore
  180. #endif // MINDSPORE_GPU_PROFILING_H