You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu_profiling.h 4.9 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_GPU_PROFILING_H
  17. #define MINDSPORE_GPU_PROFILING_H
  18. #include <cuda.h>
  19. #include <cupti.h>
  20. #include <cstdio>
  21. #include <unordered_map>
  22. #include <string>
  23. #include <vector>
  24. #include <mutex>
  25. #include <memory>
  26. #include <algorithm>
  27. #include <utility>
  28. namespace mindspore {
  29. namespace profiler {
  30. namespace gpu {
  // Identifies which CUPTI interface an Event record was produced by.
  enum class CUPTIApiType : int {
    kCallback = 0,  // record tagged as coming from the callback API
    kActivity = 1   // record tagged as coming from the activity API
  };
  // Category of a profiled GPU activity.
  // NOTE(review): the memcpy suffixes look like source-to-destination pairs
  // (presumably H = host, D = device, A = array, P2P = peer-to-peer) — confirm
  // against the code that fills Event::activity_type.
  enum class ActivityType : int {
    kKernel = 0,
    // Memory copies, one enumerator per direction.
    kMemcpyH2D = 1,
    kMemcpyD2H = 2,
    kMemcpyH2A = 3,
    kMemcpyA2H = 4,
    kMemcpyA2D = 5,
    kMemcpyD2A = 6,
    kMemcpyD2D = 7,
    kMemcpyP2P = 8,
    kMemcpyH2H = 9,
    kMemset = 10,
    // Fallback for a copy whose direction could not be classified.
    kMemcpyUnknown = 11
  };
  // Details recorded for a memory-copy activity.
  struct MemcpyInfo {
    // Size of the transfer, in bytes.
    size_t bytes;
    // Kind code of the copy source.
    // NOTE(review): presumably a CUPTI memory-kind value — confirm with the producer.
    unsigned char src_kind;
    // Kind code of the copy destination (same encoding as src_kind).
    unsigned char dst_kind;
  };
  // Launch details recorded for a kernel activity.
  struct KernelInfo {
    // Resource usage of the launch.
    uint64_t registers_per_thread;
    uint64_t static_shared_memory;
    uint64_t dynamic_shared_memory;

    // Thread-block dimensions.
    uint64_t block_x;
    uint64_t block_y;
    uint64_t block_z;

    // Grid dimensions.
    uint64_t grid_x;
    uint64_t grid_y;
    uint64_t grid_z;
  };
  62. struct Event {
  63. std::string kernel_name;
  64. std::string kernel_type;
  65. CUPTIApiType api_type;
  66. ActivityType activity_type;
  67. uint64_t start_time_stamp;
  68. uint64_t end_time_stamp;
  69. std::string op_name;
  70. uint32_t device_id;
  71. uint32_t correlation_id;
  72. uint32_t thread_id;
  73. int64_t context_id;
  74. uint32_t stream_id;
  75. CUpti_CallbackId cb_id;
  76. union {
  77. MemcpyInfo memcpy_info;
  78. KernelInfo kernel_info;
  79. };
  80. };
  81. struct OpInfo {
  82. std::string op_name;
  83. float cupti_api_call_time = 0l;
  84. float cupti_activity_time = 0l;
  85. float op_host_cost_time = 0;
  86. int op_kernel_api_count = 0;
  87. int op_kernel_count = 0;
  88. int op_count = 0;
  89. void *stream;
  90. MemcpyInfo memcpy_info = {0};
  91. KernelInfo kernel_info = {0};
  92. };
  // Pair of reference timestamps, one taken on the host and one on the GPU.
  // Both values are in nanoseconds and start at zero.
  struct BaseTime {
    uint64_t host_start_time = 0;
    uint64_t gpu_start_time = 0;
  };
  // Scale factor between adjacent time units.
  // NOTE(review): presumably used to convert nanoseconds to microseconds — confirm at the call sites.
  const float kTimeUnit = 1000.0f;
  99. class GPUProfiler {
  100. public:
  101. static std::shared_ptr<GPUProfiler> GetInstance();
  102. ~GPUProfiler() { StopCUPTI(); }
  103. GPUProfiler(const GPUProfiler &) = delete;
  104. GPUProfiler &operator=(const GPUProfiler &) = delete;
  105. void Init(const std::string &profileDataPath);
  106. void Stop();
  107. void StopCUPTI();
  108. void StepProfilingEnable(const bool enable_flag);
  109. void SyncEnable(const bool enable_flag);
  110. bool GetEnableFlag() const { return enable_flag_; }
  111. bool GetSyncEnableFlag() const { return sync_enable_flag_; }
  112. void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
  113. uint64_t startTimestamp, uint64_t endTimestamp);
  114. void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  115. void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  116. void OpDataProducerBegin(const std::string op_name, void *stream);
  117. void OpDataProducerEnd();
  118. private:
  119. GPUProfiler() = default;
  120. void OpsParser();
  121. void EventLog(const Event &event);
  122. void HandleActivityRecord(CUpti_Activity *record);
  123. void AddEvent(Event &&event);
  124. void SetRunTimeData(const std::string &op_name, void *stream);
  125. void SetRunTimeData(const std::string &op_name, const float time_elapsed);
  126. void FixOpNameByCorrelationId(Event *event);
  127. static std::shared_ptr<GPUProfiler> profiler_inst_;
  128. bool enable_flag_ = false;
  129. bool sync_enable_flag_ = true;
  130. std::unordered_map<std::string, OpInfo> op_info_map_;
  131. std::unordered_map<uint32_t, std::string> op_name_map_;
  132. std::vector<Event> events_;
  133. BaseTime base_time_;
  134. std::string op_name_;
  135. void *stream_;
  136. void SaveProfileData();
  137. std::mutex event_mutex_;
  138. std::vector<CUpti_ActivityKind> activities_enable_;
  139. uint64_t cupti_callback_events_count_ = 0l;
  140. uint64_t cupti_callback_events_drop_count_ = 0l;
  141. uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  142. uint64_t cupti_activity_events_count_ = 0l;
  143. uint64_t cupti_activity_events_drop_count_ = 0l;
  144. uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;
  145. CUpti_SubscriberHandle subscriber_ = nullptr;
  146. cudaEvent_t op_event_start_;
  147. cudaEvent_t op_event_stop_;
  148. uint64_t op_host_time_start_;
  149. uint64_t op_host_time_stop_;
  150. std::string profile_data_path_;
  151. };
  152. } // namespace gpu
  153. } // namespace profiler
  154. } // namespace mindspore
  155. #endif // MINDSPORE_GPU_PROFILING_H