You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

data_saver.cc 7.2 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "profiler/device/data_saver.h"
  17. #include <fstream>
  18. #include <numeric>
  19. #include "sys/stat.h"
  20. #include "utils/ms_utils.h"
  21. #include "utils/ms_context.h"
  22. namespace mindspore {
  23. namespace profiler {
  24. OpDetailInfo::OpDetailInfo(const std::shared_ptr<OpInfo> op_info, float proportion)
  25. : op_info_(op_info), proportion_(proportion) {
  26. // op_full_name is like 'xxx/xxx/{op_type}-op{node_id}'
  27. op_full_name_ = op_info->op_name;
  28. auto op_type_begin_iter = op_full_name_.rfind('/') + 1;
  29. auto op_type_end_iter = op_full_name_.rfind('-');
  30. op_type_ = op_full_name_.substr(op_type_begin_iter, op_type_end_iter - op_type_begin_iter);
  31. op_name_ = op_full_name_.substr(op_type_begin_iter);
  32. if (op_info->op_count == 0) {
  33. MS_LOG(ERROR) << "The num of operations can not be 0.";
  34. return;
  35. }
  36. op_avg_time_ = op_info->op_host_cost_time / op_info->op_count;
  37. }
  38. void DataSaver::ParseOpInfo(const OpInfoMap &op_info_maps) {
  39. op_detail_infos_.reserve(op_info_maps.size());
  40. float total_time_sum = GetTotalOpTime(op_info_maps);
  41. for (auto item : op_info_maps) {
  42. op_timestamps_map_[item.first] = item.second.start_duration;
  43. if (total_time_sum == 0.0) {
  44. MS_LOG(ERROR) << "The total operation times can not be 0.";
  45. return;
  46. }
  47. float proportion = item.second.op_host_cost_time / total_time_sum;
  48. auto op_info = std::make_shared<OpInfo>(item.second);
  49. if (op_info == nullptr) {
  50. MS_LOG(ERROR) << "Create Operation information node failed when parse operation information.";
  51. return;
  52. }
  53. OpDetailInfo op_detail_info = OpDetailInfo(op_info, proportion);
  54. op_detail_infos_.emplace_back(op_detail_info);
  55. AddOpDetailInfoForType(op_detail_info);
  56. }
  57. // update average time of op type
  58. for (auto &op_type : op_type_infos_) {
  59. // device_infos: <type_name, op_type_info>
  60. if (op_type.second.count_ == 0) {
  61. MS_LOG(ERROR) << "The num of operation type can not be 0.";
  62. return;
  63. }
  64. op_type.second.avg_time_ = op_type.second.total_time_ / op_type.second.count_;
  65. }
  66. MS_LOG(DEBUG) << "Get " << op_detail_infos_.size() << " operation items.";
  67. MS_LOG(DEBUG) << "Get " << op_type_infos_.size() << " operation type items.";
  68. }
  69. void DataSaver::AddOpDetailInfoForType(const OpDetailInfo &op_detail_info) {
  70. // Construct OpType object according to op detail info
  71. OpType op_type = OpType{op_detail_info.op_type_,
  72. op_detail_info.op_info_->op_count,
  73. op_detail_info.op_info_->op_count,
  74. op_detail_info.op_info_->op_host_cost_time,
  75. 0,
  76. op_detail_info.proportion_};
  77. // Set the OpType into op_type_infos_ map
  78. std::string type_name = op_detail_info.op_type_;
  79. auto iter = op_type_infos_.find(type_name);
  80. if (iter == op_type_infos_.end()) {
  81. op_type_infos_.emplace(type_name, op_type);
  82. } else {
  83. iter->second += op_type;
  84. }
  85. }
  86. float DataSaver::GetTotalOpTime(const OpInfoMap &op_info_maps) const {
  87. float sum = 0;
  88. sum = std::accumulate(op_info_maps.begin(), op_info_maps.end(), sum,
  89. [](float i, auto iter) { return i + iter.second.op_host_cost_time; });
  90. MS_LOG(DEBUG) << "The total op time is " << sum;
  91. return sum;
  92. }
  93. void DataSaver::WriteOpType(const std::string &saver_base_dir) const {
  94. std::string file_path = saver_base_dir + "/" + op_side_ + "_op_type_info_" + device_id_ + ".csv";
  95. std::ofstream ofs(file_path);
  96. // check if the file is writable
  97. if (!ofs.is_open()) {
  98. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  99. return;
  100. }
  101. try {
  102. // write op type info into file
  103. if (op_side_ == "cpu") {
  104. ofs << OpType().GetCpuHeader() << std::endl;
  105. for (auto op_type_info : op_type_infos_) {
  106. op_type_info.second.OutputCpuOpTypeInfo(ofs);
  107. }
  108. }
  109. if (op_side_ == "gpu") {
  110. ofs << OpType().GetGpuHeader() << std::endl;
  111. for (auto op_type_info : op_type_infos_) {
  112. op_type_info.second.OutputGpuOpTypeInfo(ofs);
  113. }
  114. }
  115. } catch (const std::exception &e) {
  116. MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
  117. }
  118. ofs.close();
  119. ChangeFileMode(file_path);
  120. MS_LOG(INFO) << "Write " << op_type_infos_.size() << " op type infos into file: " << file_path;
  121. }
  122. void DataSaver::WriteOpDetail(const std::string &saver_base_dir) const {
  123. std::string file_path = saver_base_dir + "/" + op_side_ + "_op_detail_info_" + device_id_ + ".csv";
  124. std::ofstream ofs(file_path);
  125. if (!ofs.is_open()) {
  126. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  127. return;
  128. }
  129. try {
  130. // write op detail info into file
  131. if (op_side_ == "cpu") {
  132. ofs << OpDetailInfo().GetCpuHeader() << std::endl;
  133. for (auto op_detail : op_detail_infos_) {
  134. op_detail.OutputCpuOpDetailInfo(ofs);
  135. }
  136. }
  137. if (op_side_ == "gpu") {
  138. ofs << OpDetailInfo().GetGpuHeader() << std::endl;
  139. for (auto op_detail : op_detail_infos_) {
  140. op_detail.OutputGpuOpDetailInfo(ofs);
  141. }
  142. }
  143. } catch (const std::exception &e) {
  144. MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
  145. }
  146. ofs.close();
  147. ChangeFileMode(file_path);
  148. MS_LOG(INFO) << "Write " << op_detail_infos_.size() << " op detail infos into file: " << file_path;
  149. }
  150. void DataSaver::WriteOpTimestamp(const std::string &saver_base_dir) const {
  151. std::string file_path = saver_base_dir + "/" + op_side_ + "_op_execute_timestamp_" + device_id_ + ".txt";
  152. std::ofstream ofs(file_path);
  153. // check if the file is writable
  154. if (!ofs.is_open()) {
  155. MS_LOG(WARNING) << "Open file '" << file_path << "' failed!";
  156. return;
  157. }
  158. try {
  159. // write op timestamp info into file
  160. for (const auto &op_timestamp_info : op_timestamps_map_) {
  161. if (op_side_ == "cpu") {
  162. ofs << op_timestamp_info.first << ";HostCpuOps;";
  163. } else {
  164. ofs << op_timestamp_info.first << ";GpuOps;";
  165. }
  166. for (auto start_end : op_timestamp_info.second) {
  167. ofs << start_end.start_timestamp << "," << start_end.duration << " ";
  168. }
  169. ofs << std::endl;
  170. }
  171. } catch (const std::exception &e) {
  172. MS_LOG(ERROR) << "Write " << file_path << "failed: " << e.what();
  173. }
  174. ofs.close();
  175. ChangeFileMode(file_path);
  176. }
  177. void DataSaver::ChangeFileMode(const std::string &file_path) const {
  178. if (chmod(common::SafeCStr(file_path), S_IRUSR | S_IWUSR) == -1) {
  179. MS_LOG(WARNING) << "Modify file: " << file_path << " to rw fail.";
  180. return;
  181. }
  182. }
  183. } // namespace profiler
  184. } // namespace mindspore