You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

data_saver.cc 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. /**
  2. * Copyright 2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "profiler/device/gpu/data_saver.h"
  17. #include <fstream>
  18. #include <numeric>
  19. #include "sys/stat.h"
  20. #include "utils/log_adapter.h"
  21. #include "utils/ms_utils.h"
  22. namespace mindspore {
  23. namespace profiler {
  24. namespace gpu {
  25. OpDetailInfo::OpDetailInfo(std::shared_ptr<OpInfo> op_info, float proportion)
  26. : op_info_(op_info), proportion_(proportion) {
  27. // op_full_name is like 'xxx/xxx/{op_type}-op{node_id}'
  28. op_full_name_ = op_info->op_name;
  29. auto op_type_begin_iter = op_full_name_.rfind('/') + 1;
  30. auto op_type_end_iter = op_full_name_.rfind('-');
  31. op_type_ = op_full_name_.substr(op_type_begin_iter, op_type_end_iter - op_type_begin_iter);
  32. op_name_ = op_full_name_.substr(op_type_begin_iter);
  33. op_avg_time_ = op_info->op_host_cost_time / op_info->op_count;
  34. }
  35. ActivityData::ActivityData(std::shared_ptr<Event> data) : basic_info_(data) {
  36. grid_dim_ = basic_info_->activity_type == ActivityType::kKernel
  37. ? "\"" + std::to_string(basic_info_->kernel_info.grid_x) + ',' +
  38. std::to_string(basic_info_->kernel_info.grid_y) + ',' +
  39. std::to_string(basic_info_->kernel_info.grid_z) + "\""
  40. : "";
  41. block_dim_ = basic_info_->activity_type == ActivityType::kKernel
  42. ? "\"" + std::to_string(basic_info_->kernel_info.block_x) + ',' +
  43. std::to_string(basic_info_->kernel_info.block_y) + ',' +
  44. std::to_string(basic_info_->kernel_info.block_z) + "\""
  45. : "";
  46. count_ = 1;
  47. total_duration_ = (basic_info_->end_time_stamp - basic_info_->start_time_stamp) / kTimeUnit;
  48. avg_duration_ = total_duration_;
  49. max_duration_ = total_duration_;
  50. min_duration_ = total_duration_;
  51. start_duration.emplace_back(StartDuration({basic_info_->start_time_stamp, total_duration_}));
  52. }
  53. ActivityData &ActivityData::operator+=(const ActivityData &other) {
  54. this->count_ += other.count_;
  55. this->total_duration_ += other.total_duration_;
  56. // update max or min duration
  57. if (other.total_duration_ > this->max_duration_) {
  58. this->max_duration_ = other.total_duration_;
  59. } else if (other.max_duration_ < this->min_duration_) {
  60. this->min_duration_ = other.total_duration_;
  61. }
  62. return *this;
  63. }
  64. void DataSaver::ParseOpInfo(const OpInfoMap &op_info_maps) {
  65. op_detail_infos_.reserve(op_info_maps.size());
  66. float total_time_sum = GetTotalOpTime(op_info_maps);
  67. for (auto item : op_info_maps) {
  68. op_timestamps_map_[item.first] = item.second.start_duration;
  69. float proportion = item.second.op_host_cost_time / total_time_sum;
  70. auto op_info = std::make_shared<OpInfo>(item.second);
  71. OpDetailInfo op_detail_info = OpDetailInfo(op_info, proportion);
  72. op_detail_infos_.emplace_back(op_detail_info);
  73. AddOpDetailInfoForType(op_detail_info);
  74. }
  75. // update average time of op type
  76. for (auto &op_type : op_type_infos_) {
  77. // device_infos: <type_name, op_type_info>
  78. op_type.second.avg_time_ = op_type.second.total_time_ / op_type.second.count_;
  79. }
  80. MS_LOG(DEBUG) << "Get " << op_detail_infos_.size() << " operation items.";
  81. MS_LOG(DEBUG) << "Get " << op_type_infos_.size() << " operation type items.";
  82. }
  83. void DataSaver::AddOpDetailInfoForType(const OpDetailInfo &op_detail_info) {
  84. // Construct OpType object according to op detail info
  85. OpType op_type = OpType{op_detail_info.op_type_, op_detail_info.op_info_->op_count,
  86. op_detail_info.op_info_->op_host_cost_time, 0, op_detail_info.proportion_};
  87. // Set the OpType into op_type_infos_ map
  88. std::string type_name = op_detail_info.op_type_;
  89. auto iter = op_type_infos_.find(type_name);
  90. if (iter == op_type_infos_.end()) {
  91. op_type_infos_.emplace(type_name, op_type);
  92. } else {
  93. iter->second += op_type;
  94. }
  95. }
  96. float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) {
  97. float sum = 0;
  98. sum = std::accumulate(op_info_maps.begin(), op_info_maps.end(), sum,
  99. [](float i, auto iter) { return i + iter.second.op_host_cost_time; });
  100. MS_LOG(DEBUG) << "The total op time is " << sum;
  101. return sum;
  102. }
  103. void DataSaver::ParseEvent(const std::vector<Event> &events) {
  104. // Put Kernel activity events into activity_infos_
  105. for (const auto &event : events) {
  106. if (event.op_name.empty() || event.api_type != CUPTIApiType::kActivity ||
  107. event.activity_type != ActivityType::kKernel) {
  108. continue;
  109. }
  110. AddKernelEvent(event);
  111. }
  112. // update average time of kernel op cost
  113. for (auto &device_infos : activity_infos_) {
  114. // device_infos: <device_id, DeviceActivityInfos>
  115. for (auto &activity_info : device_infos.second) {
  116. // activity_info: <kernel_name, Activity>
  117. activity_info.second.avg_duration_ = activity_info.second.total_duration_ / activity_info.second.count_;
  118. }
  119. MS_LOG(DEBUG) << "Get " << device_infos.second.size() << " activity items for device:" << device_infos.first;
  120. }
  121. }
  122. void DataSaver::AddKernelEvent(const Event &event) {
  123. // Put kernel event to activity_infos according to device id
  124. uint32_t device_id = event.device_id;
  125. auto iter = activity_infos_.find(device_id);
  126. if (iter == activity_infos_.end()) {
  127. auto res_flag = activity_infos_.emplace(device_id, DeviceActivityInfos());
  128. AddKernelEventToDevice(event, &res_flag.first->second);
  129. } else {
  130. AddKernelEventToDevice(event, &iter->second);
  131. }
  132. }
  133. void DataSaver::AddKernelEventToDevice(const Event &event, DeviceActivityInfos *device_activity_infos) {
  134. // Combine kernel activity with same kernel name
  135. auto event_ptr = std::make_shared<Event>(event);
  136. ActivityData activity_data = ActivityData(event_ptr);
  137. std::string kernel_name = event.kernel_name;
  138. auto iter = device_activity_infos->find(kernel_name);
  139. if (iter == device_activity_infos->end()) {
  140. device_activity_infos->emplace(kernel_name, activity_data);
  141. } else {
  142. iter->second += activity_data;
  143. iter->second.start_duration.emplace_back(StartDuration({event.start_time_stamp, activity_data.total_duration_}));
  144. }
  145. }
  146. void DataSaver::WriteFile(std::string out_path_dir) {
  147. if (out_path_dir.empty()) {
  148. MS_LOG(WARNING) << "Output directory. Ignore the writing data.";
  149. return;
  150. }
  151. if (op_detail_infos_.empty() || op_type_infos_.empty() || activity_infos_.empty()) {
  152. MS_LOG(WARNING) << "No operation detail infos to write.";
  153. return;
  154. }
  155. // not support multi-device for operator info per process yet
  156. device_id_ = std::to_string(activity_infos_.begin()->first);
  157. WriteOpDetail(out_path_dir);
  158. WriteOpType(out_path_dir);
  159. WriteActivity(out_path_dir);
  160. WriteOpTimestamp(out_path_dir);
  161. }
  162. void DataSaver::WriteOpType(const std::string &saver_base_dir) {
  163. std::string file_path = saver_base_dir + "/gpu_op_type_info_" + device_id_ + ".csv";
  164. std::ofstream ofs(file_path);
  165. // check if the file is writable
  166. if (!ofs.is_open()) {
  167. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  168. return;
  169. }
  170. // write op type info into file
  171. ofs << OpType().GetHeader() << std::endl;
  172. for (auto op_type_info : op_type_infos_) {
  173. ofs << op_type_info.second << std::endl;
  174. }
  175. ofs.close();
  176. ChangeFileMode(file_path);
  177. MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path;
  178. }
  179. void DataSaver::WriteOpDetail(const std::string &saver_base_dir) {
  180. std::string file_path = saver_base_dir + "/gpu_op_detail_info_" + device_id_ + ".csv";
  181. std::ofstream ofs(file_path);
  182. if (!ofs.is_open()) {
  183. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  184. return;
  185. }
  186. // write op detail info into file
  187. ofs << OpDetailInfo().GetHeader() << std::endl;
  188. for (auto op_detail : op_detail_infos_) {
  189. ofs << op_detail << std::endl;
  190. }
  191. ofs.close();
  192. ChangeFileMode(file_path);
  193. MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path;
  194. }
  195. void DataSaver::WriteActivity(const std::string &saver_base_dir) {
  196. std::string file_path_base = saver_base_dir + "/gpu_activity_data_";
  197. std::string timestamp_file_path_base = saver_base_dir + "/activity_execute_timestamp_";
  198. for (auto device_info : activity_infos_) {
  199. // write activity result csv
  200. std::string file_path = file_path_base + std::to_string(device_info.first) + ".csv";
  201. std::ofstream ofs(file_path);
  202. if (!ofs.is_open()) {
  203. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  204. return;
  205. }
  206. // write activity timestamp txt
  207. std::string timestamp_file_path = timestamp_file_path_base + std::to_string(device_info.first) + ".txt";
  208. std::ofstream activity_timestamp_ofs(timestamp_file_path);
  209. if (!activity_timestamp_ofs.is_open()) {
  210. MS_LOG(WARNING) << "Open file '" << timestamp_file_path << "' failed!";
  211. return;
  212. }
  213. // write activity data into file
  214. ofs << ActivityData().GetHeader() << std::endl;
  215. for (auto activity_data : device_info.second) {
  216. ofs << activity_data.second << std::endl;
  217. for (auto start_duration : activity_data.second.start_duration) {
  218. activity_timestamp_ofs << activity_data.second.basic_info_->kernel_name << ";";
  219. activity_timestamp_ofs << activity_data.second.basic_info_->stream_id << ";";
  220. activity_timestamp_ofs << start_duration.start_timestamp << ";";
  221. activity_timestamp_ofs << start_duration.duration << std::endl;
  222. }
  223. }
  224. ofs.close();
  225. ChangeFileMode(file_path);
  226. activity_timestamp_ofs.close();
  227. ChangeFileMode(timestamp_file_path);
  228. MS_LOG(INFO) << "Write " << device_info.second.size() << " activity infos into file: " << file_path;
  229. }
  230. }
  231. void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) {
  232. std::string file_path = saver_base_dir + "/op_execute_timestamp_" + device_id_ + ".txt";
  233. std::ofstream ofs(file_path);
  234. // check if the file is writable
  235. if (!ofs.is_open()) {
  236. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  237. return;
  238. }
  239. // write op timestamp info into file
  240. for (const auto &op_timestamp_info : op_timestamps_map_) {
  241. ofs << op_timestamp_info.first << ";Ops;";
  242. for (auto start_end : op_timestamp_info.second) {
  243. ofs << start_end.start_timestamp << "," << start_end.duration << " ";
  244. }
  245. ofs << std::endl;
  246. }
  247. ofs.close();
  248. ChangeFileMode(file_path);
  249. }
  250. void DataSaver::ChangeFileMode(const std::string &file_path) {
  251. if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
  252. MS_LOG(INFO) << "Modify file:" << file_path << " to rw fail.";
  253. return;
  254. }
  255. }
  256. } // namespace gpu
  257. } // namespace profiler
  258. } // namespace mindspore