You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; can include dashes ('-'); and can be up to 35 characters long.

gpu_profiling.h 5.5 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H
  17. #define MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H
  18. #include <cuda.h>
  19. #include <cupti.h>
  20. #include <algorithm>
  21. #include <cstdio>
  22. #include <map>
  23. #include <memory>
  24. #include <mutex>
  25. #include <string>
  26. #include <unordered_map>
  27. #include <utility>
  28. #include <vector>
  29. #include "profiler/device/profiling.h"
  30. #include "profiler/device/gpu/gpu_profiling_utils.h"
  31. namespace mindspore {
  32. namespace profiler {
  33. namespace gpu {
  // Tags a profiling record with the CUPTI interface it came from.
  // NOTE(review): presumably kCallback corresponds to the CUPTI Callback API and kActivity to the
  // CUPTI Activity API — confirm against the code that fills Event::api_type.
  enum class CUPTIApiType {
    kCallback = 0,
    kActivity = 1,
  };
  // Category of a recorded GPU activity. The numeric values are spelled out explicitly and must stay
  // stable for anything that stores or compares them as integers.
  // NOTE(review): the memcpy suffixes look like they mirror CUPTI's memcpy kinds (H = host, D = device,
  // A = device array, P2P = peer to peer) — confirm against the mapping in the implementation file.
  enum class ActivityType {
    // Kernel execution.
    kKernel = 0,

    // Memory copies, one enumerator per direction.
    kMemcpyH2D = 1,
    kMemcpyD2H = 2,
    kMemcpyH2A = 3,
    kMemcpyA2H = 4,
    kMemcpyA2D = 5,
    kMemcpyD2A = 6,
    kMemcpyD2D = 7,
    kMemcpyP2P = 8,
    kMemcpyH2H = 9,

    // Memset.
    kMemset = 10,

    // Memory copy whose direction is not one of the above.
    kMemcpyUnknown = 11
  };
  // Details attached to a memory-copy record.
  // This type is a member of the anonymous union inside Event, so it is kept trivial: no default member
  // initializers, no constructors.
  struct MemcpyInfo {
    // Size of the transfer in bytes.
    size_t bytes;

    // Source and destination kind codes.
    // NOTE(review): presumably CUPTI memory-kind values — confirm where these are assigned.
    unsigned char src_kind;
    unsigned char dst_kind;
  };
  // Launch details attached to a kernel record.
  // This type is a member of the anonymous union inside Event, so it is kept trivial: no default member
  // initializers, no constructors. Field order is part of the layout and is left untouched.
  struct KernelInfo {
    // Per-launch resource figures.
    uint64_t registers_per_thread;
    uint64_t static_shared_memory;
    uint64_t dynamic_shared_memory;

    // Thread-block extent along x / y / z.
    uint64_t block_x;
    uint64_t block_y;
    uint64_t block_z;

    // Grid extent along x / y / z.
    uint64_t grid_x;
    uint64_t grid_y;
    uint64_t grid_z;
  };
  65. struct Event {
  66. std::string kernel_name;
  67. std::string kernel_type;
  68. CUPTIApiType api_type;
  69. ActivityType activity_type;
  70. uint64_t start_time_stamp;
  71. uint64_t end_time_stamp;
  72. std::string op_name;
  73. uint32_t device_id;
  74. uint32_t correlation_id;
  75. uint32_t thread_id;
  76. uint32_t context_id;
  77. uint32_t stream_id;
  78. CUpti_CallbackId cb_id;
  79. union {
  80. MemcpyInfo memcpy_info;
  81. KernelInfo kernel_info;
  82. };
  83. };
  // Reference timestamps, all zero by default. Values are in nanoseconds.
  struct BaseTime {
    uint64_t host_start_time{0};
    uint64_t host_start_monotonic_raw_time{0};
    uint64_t gpu_start_time{0};
  };
  // Scale factor of 1000 between adjacent time units.
  // NOTE(review): presumably used to convert between ns/us/ms — confirm at the use sites.
  const float kTimeUnit = 1000.0f;
  // Abstract base for a named profiling component. Subclasses must implement Init() and
  // SaveProfilingData(); this base only stores the name and exposes it.
  class ProfilingOp {
   public:
    ProfilingOp() = default;
    virtual ~ProfilingOp() = default;

    // Pure virtual hooks supplied by the concrete component.
    virtual void SaveProfilingData() = 0;
    virtual void Init() = 0;

    // Returns the component name by value (a copy of op_name_).
    std::string Name() const {
      return op_name_;
    }

   protected:
    // Name reported by Name(); empty until a subclass assigns it.
    std::string op_name_;
  };
  101. class GPUProfiler : public Profiler {
  102. public:
  103. static std::shared_ptr<GPUProfiler> &GetInstance();
  104. GPUProfiler() = default;
  105. ~GPUProfiler() { StopCUPTI(); }
  106. GPUProfiler(const GPUProfiler &) = delete;
  107. GPUProfiler &operator=(const GPUProfiler &) = delete;
  108. void Init(const std::string &profileDataPath) override;
  109. void Stop() override;
  110. void StopCUPTI();
  111. void StepProfilingEnable(const bool enable_flag) override;
  112. void SyncEnable(const bool enable_flag);
  113. bool GetEnableFlag() const { return enable_flag_; }
  114. bool GetSyncEnableFlag() const { return sync_enable_flag_; }
  115. void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
  116. uint64_t startTimestamp, uint64_t endTimestamp);
  117. void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  118. void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  119. void OpDataProducerBegin(const std::string op_name, void *stream);
  120. void OpDataProducerEnd() override;
  121. void ProcessEvents();
  122. void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
  123. void SetStepTraceOpName(ProfilingTraceInfo trace_op_name);
  124. std::string ProfileDataPath() const { return profile_data_path_; }
  125. bool IsInitialized() { return is_init_; }
  126. private:
  127. void SingleOpLaunchTimeProcess(float op_time_elapsed);
  128. void OpsParser();
  129. void EventLog(const Event &event);
  130. void ClearInst() override;
  131. void HandleActivityRecord(CUpti_Activity *record);
  132. void AddEvent(Event &&event);
  133. void SetRunTimeData(const std::string &op_name, void *stream);
  134. void FixOpNameByCorrelationId(Event *event);
  135. static std::shared_ptr<GPUProfiler> profiler_inst_;
  136. bool enable_flag_ = false;
  137. bool sync_enable_flag_ = true;
  138. std::unordered_map<uint32_t, std::string> op_name_map_;
  139. std::vector<Event> events_;
  140. BaseTime base_time_;
  141. std::string op_name_;
  142. void *stream_;
  143. void SaveProfileData() override;
  144. void SaveExtraProfileData();
  145. std::mutex event_mutex_;
  146. std::vector<CUpti_ActivityKind> activities_enable_;
  147. uint64_t cupti_callback_events_count_ = 0l;
  148. uint64_t cupti_callback_events_drop_count_ = 0l;
  149. uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  150. uint64_t cupti_activity_events_count_ = 0l;
  151. uint64_t cupti_activity_events_drop_count_ = 0l;
  152. uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;
  153. CUpti_SubscriberHandle subscriber_ = nullptr;
  154. cudaEvent_t op_event_start_;
  155. cudaEvent_t op_event_stop_;
  156. uint64_t op_host_time_start_;
  157. uint64_t op_host_time_stop_;
  158. uint64_t op_cupti_time_start_;
  159. std::string profile_data_path_;
  160. std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
  161. ProfilingTraceInfo step_trace_op_name_;
  162. bool is_init_ = false;
  163. };
  164. } // namespace gpu
  165. } // namespace profiler
  166. } // namespace mindspore
  167. #endif // MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H