You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu_profiling.h 5.6 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_GPU_PROFILING_H
  17. #define MINDSPORE_GPU_PROFILING_H
  18. #include <cuda.h>
  19. #include <cupti.h>
  20. #include <algorithm>
  21. #include <cstdio>
  22. #include <map>
  23. #include <memory>
  24. #include <mutex>
  25. #include <string>
  26. #include <unordered_map>
  27. #include <utility>
  28. #include <vector>
  29. namespace mindspore {
  30. namespace profiler {
  31. namespace gpu {
  // Tags a record with one of two CUPTI API families. Event::api_type holds a
  // value of this type.
  enum class CUPTIApiType {
    kCallback = 0,
    kActivity = 1,
  };
  // Category of a profiled activity. Event::activity_type holds a value of this
  // type. Every enumerator has an explicitly written numeric value.
  //
  // NOTE(review): the two-letter memcpy suffixes presumably follow the CUPTI
  // memcpy-kind naming (H = host, D = device, A = array, P2P = peer to peer) --
  // confirm against the implementation file.
  enum class ActivityType : int {
    kKernel = 0,

    // Memory copies, named <source>2<destination>.
    kMemcpyH2D = 1,
    kMemcpyD2H = 2,
    kMemcpyH2A = 3,
    kMemcpyA2H = 4,
    kMemcpyA2D = 5,
    kMemcpyD2A = 6,
    kMemcpyD2D = 7,
    kMemcpyP2P = 8,
    kMemcpyH2H = 9,

    kMemset = 10,
    kMemcpyUnknown = 11
  };
  // Plain data describing one memory copy.
  //
  // Deliberately has no default member initializers: Event stores this type
  // inside an anonymous union, which requires it to stay trivially
  // constructible.
  struct MemcpyInfo {
    size_t bytes;            // size of the copy
    unsigned char src_kind;  // source memory kind (encoding not defined in this header)
    unsigned char dst_kind;  // destination memory kind (encoding not defined in this header)
  };
  // Plain data describing one kernel launch: register and shared-memory usage
  // plus the block and grid extents.
  //
  // Deliberately has no default member initializers: Event stores this type
  // inside an anonymous union, which requires it to stay trivially
  // constructible. Member order is part of the layout and must not change.
  struct KernelInfo {
    uint64_t registers_per_thread;
    uint64_t static_shared_memory;
    uint64_t dynamic_shared_memory;
    uint64_t block_x, block_y, block_z;  // block extent in x / y / z
    uint64_t grid_x, grid_y, grid_z;     // grid extent in x / y / z
  };
  63. struct Event {
  64. std::string kernel_name;
  65. std::string kernel_type;
  66. CUPTIApiType api_type;
  67. ActivityType activity_type;
  68. uint64_t start_time_stamp;
  69. uint64_t end_time_stamp;
  70. std::string op_name;
  71. uint32_t device_id;
  72. uint32_t correlation_id;
  73. uint32_t thread_id;
  74. uint32_t context_id;
  75. uint32_t stream_id;
  76. CUpti_CallbackId cb_id;
  77. union {
  78. MemcpyInfo memcpy_info;
  79. KernelInfo kernel_info;
  80. };
  81. };
  // A (start timestamp, duration) pair; both default to zero. OpInfo keeps a
  // vector of these.
  //
  // NOTE(review): units are not stated in this header -- confirm against the
  // code that fills these in.
  struct StartDuration {
    // The previous initializers used the `long` literal `0l` for both members,
    // including the float. The values are unchanged; the literals now match
    // the member types.
    uint64_t start_timestamp = 0;
    float duration = 0.0f;
  };
  86. struct OpInfo {
  87. std::string op_name;
  88. float cupti_api_call_time = 0l;
  89. float cupti_activity_time = 0l;
  90. float op_host_cost_time = 0;
  91. int op_kernel_api_count = 0;
  92. int op_kernel_count = 0;
  93. int op_count = 0;
  94. std::vector<StartDuration> start_duration;
  95. void *stream;
  96. };
  // Pair of reference timestamps, one for the host and one for the GPU.
  // Both are in nanoseconds and default to zero.
  struct BaseTime {
    uint64_t host_start_time{0};
    uint64_t gpu_start_time{0};
  };
  // Time scaling factor of 1000.
  // NOTE(review): presumably the step between adjacent time units (e.g.
  // nanoseconds to microseconds) -- confirm at the use sites.
  const float kTimeUnit = 1000.0f;
  // Abstract interface for a profiling component. GPUProfiler accepts
  // implementations through RegisterProfilingOp().
  //
  // Implementations must provide SaveProfilingData() and Init(). They may set
  // op_name_, which Name() reports.
  class ProfilingOp {
   public:
    ProfilingOp() = default;
    virtual ~ProfilingOp() = default;

    // Implemented by subclasses; no contract beyond the name is stated here.
    virtual void SaveProfilingData() = 0;

    // Implemented by subclasses; no contract beyond the name is stated here.
    virtual void Init() = 0;

    // Returns a copy of the stored name (empty unless a subclass assigns it).
    std::string Name() const {
      return op_name_;
    }

   protected:
    std::string op_name_;
  };
  113. class GPUProfiler {
  114. public:
  115. static std::shared_ptr<GPUProfiler> GetInstance();
  116. ~GPUProfiler() { StopCUPTI(); }
  117. GPUProfiler(const GPUProfiler &) = delete;
  118. GPUProfiler &operator=(const GPUProfiler &) = delete;
  119. void Init(const std::string &profileDataPath);
  120. void Stop();
  121. void StopCUPTI();
  122. void StepProfilingEnable(const bool enable_flag);
  123. void SyncEnable(const bool enable_flag);
  124. bool GetEnableFlag() const { return enable_flag_; }
  125. bool GetSyncEnableFlag() const { return sync_enable_flag_; }
  126. void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
  127. uint64_t startTimestamp, uint64_t endTimestamp);
  128. void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  129. void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  130. void OpDataProducerBegin(const std::string op_name, void *stream);
  131. void OpDataProducerEnd();
  132. void ProcessEvents();
  133. void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
  134. std::string ProfileDataPath() const { return profile_data_path_; }
  135. private:
  136. GPUProfiler() = default;
  137. void OpsParser();
  138. void EventLog(const Event &event);
  139. void ClearInst();
  140. void HandleActivityRecord(CUpti_Activity *record);
  141. void AddEvent(Event &&event);
  142. void SetRunTimeData(const std::string &op_name, void *stream);
  143. void SetRunTimeData(const std::string &op_name, const float time_elapsed);
  144. void SetRunTimeData(const std::string &op_name, const uint64_t start, const float duration);
  145. void FixOpNameByCorrelationId(Event *event);
  146. static std::shared_ptr<GPUProfiler> profiler_inst_;
  147. bool enable_flag_ = false;
  148. bool sync_enable_flag_ = true;
  149. std::unordered_map<std::string, OpInfo> op_info_map_;
  150. std::unordered_map<uint32_t, std::string> op_name_map_;
  151. std::vector<Event> events_;
  152. BaseTime base_time_;
  153. std::string op_name_;
  154. void *stream_;
  155. void SaveProfileData();
  156. void SaveExtraProfileData();
  157. std::mutex event_mutex_;
  158. std::vector<CUpti_ActivityKind> activities_enable_;
  159. uint64_t cupti_callback_events_count_ = 0l;
  160. uint64_t cupti_callback_events_drop_count_ = 0l;
  161. uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  162. uint64_t cupti_activity_events_count_ = 0l;
  163. uint64_t cupti_activity_events_drop_count_ = 0l;
  164. uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;
  165. CUpti_SubscriberHandle subscriber_ = nullptr;
  166. cudaEvent_t op_event_start_;
  167. cudaEvent_t op_event_stop_;
  168. uint64_t op_host_time_start_;
  169. uint64_t op_host_time_stop_;
  170. uint64_t op_cupti_time_start_;
  171. std::string profile_data_path_;
  172. std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
  173. };
  174. } // namespace gpu
  175. } // namespace profiler
  176. } // namespace mindspore
  177. #endif // MINDSPORE_GPU_PROFILING_H