You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gpu_profiling.h 5.4 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H
  17. #define MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H
  18. #include <cuda.h>
  19. #include <cupti.h>
  20. #include <algorithm>
  21. #include <cstdio>
  22. #include <map>
  23. #include <memory>
  24. #include <mutex>
  25. #include <string>
  26. #include <unordered_map>
  27. #include <utility>
  28. #include <vector>
  29. #include "profiler/device/profiling.h"
  30. #include "profiler/device/gpu/gpu_profiling_utils.h"
  31. namespace mindspore {
  32. namespace profiler {
  33. namespace gpu {
  // Identifies which CUPTI interface an Event record is associated with.
  enum class CUPTIApiType {
    kCallback = 0,
    kActivity = 1,
  };
  // Category of a recorded GPU activity.
  // NOTE(review): the kMemcpyX2Y suffixes look like source-to-destination
  // abbreviations (H = host, D = device, A = array, P2P = peer-to-peer) --
  // confirm against the code that fills Event::activity_type.
  enum class ActivityType {
    kKernel = 0,
    kMemcpyH2D = 1,
    kMemcpyD2H = 2,
    kMemcpyH2A = 3,
    kMemcpyA2H = 4,
    kMemcpyA2D = 5,
    kMemcpyD2A = 6,
    kMemcpyD2D = 7,
    kMemcpyP2P = 8,
    kMemcpyH2H = 9,
    kMemset = 10,
    kMemcpyUnknown = 11,
  };
  // Payload attached to an Event that describes a memory copy.
  struct MemcpyInfo {
    size_t bytes;            // size of the copy, in bytes
    unsigned char src_kind;  // kind code of the source (presumably a CUPTI memory kind -- confirm)
    unsigned char dst_kind;  // kind code of the destination (presumably a CUPTI memory kind -- confirm)
  };
  // Payload attached to an Event that describes a kernel launch:
  // per-thread register count, shared-memory amounts and the launch geometry.
  struct KernelInfo {
    uint64_t registers_per_thread;
    uint64_t static_shared_memory;
    uint64_t dynamic_shared_memory;
    // Block dimensions.
    uint64_t block_x;
    uint64_t block_y;
    uint64_t block_z;
    // Grid dimensions.
    uint64_t grid_x;
    uint64_t grid_y;
    uint64_t grid_z;
  };
  65. struct Event {
  66. std::string kernel_name;
  67. std::string kernel_type;
  68. CUPTIApiType api_type;
  69. ActivityType activity_type;
  70. uint64_t start_time_stamp;
  71. uint64_t end_time_stamp;
  72. std::string op_name;
  73. uint32_t device_id;
  74. uint32_t correlation_id;
  75. uint32_t thread_id;
  76. uint32_t context_id;
  77. uint32_t stream_id;
  78. CUpti_CallbackId cb_id;
  79. union {
  80. MemcpyInfo memcpy_info;
  81. KernelInfo kernel_info;
  82. };
  83. };
  // Reference timestamps, all expressed in nanoseconds and zero until set.
  struct BaseTime {
    uint64_t host_start_time = 0;
    uint64_t host_start_monotonic_raw_time = 0;
    uint64_t gpu_start_time = 0;
  };
  // Scale factor of 1000 used for time values.
  // NOTE(review): presumably converts between adjacent time units
  // (e.g. ns <-> us) -- confirm at the call sites.
  const float kTimeUnit = 1000.0f;
  // Abstract interface for a named profiling component. Implementations
  // provide Init() and SaveProfilingData(); the name is stored in op_name_.
  class ProfilingOp {
   public:
    ProfilingOp() = default;
    virtual ~ProfilingOp() = default;

    // Implemented by subclasses to persist whatever data they collected.
    virtual void SaveProfilingData() = 0;

    // Implemented by subclasses to perform their set-up.
    virtual void Init() = 0;

    // Returns a copy of the component name.
    std::string Name() const { return op_name_; }

   protected:
    std::string op_name_;  // set by subclasses; empty by default
  };
  101. class GPUProfiler : public Profiler {
  102. public:
  103. static std::shared_ptr<GPUProfiler> GetInstance();
  104. ~GPUProfiler() { StopCUPTI(); }
  105. GPUProfiler(const GPUProfiler &) = delete;
  106. GPUProfiler &operator=(const GPUProfiler &) = delete;
  107. void Init(const std::string &profileDataPath) override;
  108. void Stop() override;
  109. void StopCUPTI();
  110. void StepProfilingEnable(const bool enable_flag) override;
  111. void SyncEnable(const bool enable_flag);
  112. bool GetEnableFlag() const { return enable_flag_; }
  113. bool GetSyncEnableFlag() const { return sync_enable_flag_; }
  114. void EventHandleProcess(CUpti_CallbackId cbid, const CUpti_CallbackData *cbdata, const std::string &typestring,
  115. uint64_t startTimestamp, uint64_t endTimestamp);
  116. void CUPTIAPI AllocBuffer(uint8_t **buffer, size_t *size, size_t *maxNumRecords);
  117. void CUPTIAPI ProcessBuffer(CUcontext ctx, uint32_t streamId, uint8_t *buffer, size_t size, size_t validSize);
  118. void OpDataProducerBegin(const std::string op_name, void *stream);
  119. void OpDataProducerEnd() override;
  120. void ProcessEvents();
  121. void RegisterProfilingOp(std::shared_ptr<ProfilingOp> node);
  122. void SetStepTraceOpName(ProfilingTraceInfo trace_op_name);
  123. std::string ProfileDataPath() const { return profile_data_path_; }
  124. private:
  125. GPUProfiler() = default;
  126. void OpsParser();
  127. void EventLog(const Event &event);
  128. void ClearInst() override;
  129. void HandleActivityRecord(CUpti_Activity *record);
  130. void AddEvent(Event &&event);
  131. void SetRunTimeData(const std::string &op_name, void *stream);
  132. void FixOpNameByCorrelationId(Event *event);
  133. static std::shared_ptr<GPUProfiler> profiler_inst_;
  134. bool enable_flag_ = false;
  135. bool sync_enable_flag_ = true;
  136. std::unordered_map<uint32_t, std::string> op_name_map_;
  137. std::vector<Event> events_;
  138. BaseTime base_time_;
  139. std::string op_name_;
  140. void *stream_;
  141. void SaveProfileData() override;
  142. void SaveExtraProfileData();
  143. std::mutex event_mutex_;
  144. std::vector<CUpti_ActivityKind> activities_enable_;
  145. uint64_t cupti_callback_events_count_ = 0l;
  146. uint64_t cupti_callback_events_drop_count_ = 0l;
  147. uint64_t max_cupti_callback_events_ = 2 * 1024 * 10000;
  148. uint64_t cupti_activity_events_count_ = 0l;
  149. uint64_t cupti_activity_events_drop_count_ = 0l;
  150. uint64_t max_cupti_activity_events_ = 2 * 1024 * 10000;
  151. CUpti_SubscriberHandle subscriber_ = nullptr;
  152. cudaEvent_t op_event_start_;
  153. cudaEvent_t op_event_stop_;
  154. uint64_t op_host_time_start_;
  155. uint64_t op_host_time_stop_;
  156. uint64_t op_cupti_time_start_;
  157. std::string profile_data_path_;
  158. std::map<std::string, std::shared_ptr<ProfilingOp>> profiling_op_;
  159. ProfilingTraceInfo step_trace_op_name;
  160. };
  161. } // namespace gpu
  162. } // namespace profiler
  163. } // namespace mindspore
  164. #endif // MINDSPORE_CCSRC_PROFILER_DEVICE_GPU_GPU_PROFILING_H