You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

tensor_summary.cc 10 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. /**
  2. * Copyright 2020-2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <cmath>
  17. #include <algorithm>
  18. #include <limits>
  19. #include <memory>
  20. #include <bitset>
  21. #include <tuple>
  22. #include "debug/debugger/tensor_summary.h"
  23. #ifdef OFFLINE_DBG_MODE
  24. #include "Eigen/Core"
  25. #include "Eigen/src/Core/arch/CUDA/Half.h"
  26. using float16 = Eigen::half;
  27. #include "offline_debug/offline_logger.h"
  28. #endif
  29. #ifdef ONLINE_DBG_MODE
  30. namespace mindspore {
  31. #endif
  32. using CONDITION_TYPE = DebugServices::CONDITION_TYPE;
  33. RangeCountCalculator::RangeCountCalculator()
  34. : range_start_inclusive(-std::numeric_limits<double>::infinity()),
  35. range_end_inclusive(std::numeric_limits<double>::infinity()),
  36. count(0),
  37. total(0) {}
  38. void RangeCountCalculator::ProcessElement(double element) {
  39. count += (element >= range_start_inclusive && element <= range_end_inclusive);
  40. total += 1;
  41. }
  42. double RangeCountCalculator::GetPercentInRange() const {
  43. if (total == 0) {
  44. return 0.0;
  45. }
  46. const double factor = 100.0;
  47. return factor * count / total;
  48. }
  49. AllCloseCalculator::AllCloseCalculator() : atol(1.0e-8), rtol(1.0e-5), result(true) {}
  50. void AllCloseCalculator::ProcessElement(double current, double previous) {
  51. result = result && (std::abs(current - previous) <= (atol + rtol * std::abs(previous)));
  52. }
  53. bool AllCloseCalculator::IsAllClose() const { return result; }
  54. MeanCalculator::MeanCalculator() : mean(0.0), count(0) {}
  55. void MeanCalculator::ProcessElement(double value) {
  56. count += 1;
  57. double delta = value - mean;
  58. mean += delta / count;
  59. }
  60. double MeanCalculator::GetMean() const { return mean; }
  61. VarianceAndMeanCalculator::VarianceAndMeanCalculator() : mean(0.0), count(0), m2(0.0) {}
  62. void VarianceAndMeanCalculator::ProcessElement(double value) {
  63. count += 1;
  64. double delta = value - mean;
  65. mean += delta / count;
  66. m2 += delta * (value - mean);
  67. }
  68. double VarianceAndMeanCalculator::GetMean() const { return mean; }
  69. double VarianceAndMeanCalculator::GetVariance() const {
  70. if (count > 1) {
  71. return m2 / (count - 1);
  72. } else {
  73. return 0.0;
  74. }
  75. }
  76. double VarianceAndMeanCalculator::GetStandardDeviation() { return sqrt(GetVariance()); }
  77. template <typename T>
  78. TensorSummary<T>::TensorSummary(void *current_tensor_ptr, void *const previous_tensor_ptr, uint32_t num_elements)
  79. : current_tensor_ptr(reinterpret_cast<T *>(current_tensor_ptr)),
  80. prev_tensor_ptr(reinterpret_cast<T *>(previous_tensor_ptr)),
  81. num_elements(num_elements),
  82. min(std::numeric_limits<double>::max()),
  83. max(std::numeric_limits<double>::lowest()),
  84. inf_count(0),
  85. nan_count(0),
  86. zero_count(0),
  87. epsilon(1.0e-9),
  88. mean_sd_cal_enabled(false) {}
  89. template <typename T>
  90. void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
  91. InitCalculators(wps);
  92. for (size_t i = 0; i < num_elements; ++i) {
  93. auto current_value = static_cast<double>(current_tensor_ptr[i]);
  94. double previous_value =
  95. prev_tensor_ptr ? static_cast<double>(prev_tensor_ptr[i]) : std::numeric_limits<double>::quiet_NaN();
  96. inf_count += std::isinf(current_value);
  97. nan_count += std::isnan(current_value);
  98. zero_count += (current_value == 0);
  99. max = std::max(max, current_value);
  100. min = std::min(min, current_value);
  101. if (mean_sd_cal_enabled) {
  102. current_mean_variance.ProcessElement(current_value);
  103. }
  104. for (auto &it : all_close) {
  105. it.second->ProcessElement(current_value, previous_value);
  106. }
  107. for (auto &range_count : range_counts) {
  108. range_count.second->ProcessElement(current_value);
  109. }
  110. for (auto &mean : means) {
  111. if (mean.first == "curr_prev_diff_mean") {
  112. mean.second->ProcessElement(std::abs(current_value - previous_value));
  113. } else if (mean.first == "abs_prev_mean") {
  114. mean.second->ProcessElement(std::abs(previous_value));
  115. } else if (mean.first == "abs_current_mean") {
  116. mean.second->ProcessElement(std::abs(current_value));
  117. }
  118. }
  119. }
  120. }
  121. template <typename T>
  122. std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
  123. DebugServices::watchpoint_t wp) {
  124. auto parameter_list = wp.parameter_list;
  125. bool hit = false;
  126. const uint8_t bit_size = 32;
  127. std::bitset<bit_size> error_code;
  128. CONDITION_TYPE type = wp.condition.type;
  129. // bit 0 denotes presence of nan
  130. error_code.set(0, nan_count > 0);
  131. // bit 1 denotes presence of inf
  132. error_code.set(1, inf_count > 0);
  133. if (type == CONDITION_TYPE::HAS_NAN) {
  134. error_code.reset();
  135. hit = nan_count > 0;
  136. } else if (type == CONDITION_TYPE::HAS_INF) {
  137. error_code.reset();
  138. hit = inf_count > 0;
  139. } else if (type == CONDITION_TYPE::GENERAL_OVERFLOW) {
  140. error_code.reset();
  141. hit = (nan_count + inf_count) > 0;
  142. } else if (type == CONDITION_TYPE::NOT_CHANGED && prev_tensor_ptr && error_code.none()) {
  143. hit = all_close[wp.id]->IsAllClose();
  144. } else if ((type == CONDITION_TYPE::NOT_CHANGED || type == CONDITION_TYPE::CHANGE_TOO_LARGE ||
  145. type == CONDITION_TYPE::CHANGE_TOO_SMALL) &&
  146. !prev_tensor_ptr) {
  147. // bit 2 denotes absence of previous tensor
  148. error_code.set(2, true);
  149. }
  150. if (error_code.none()) {
  151. for (auto &parameter : parameter_list) {
  152. if (parameter.disabled || error_code.any()) {
  153. continue;
  154. }
  155. // extract inequality type from watchpoint for backward compatibility
  156. std::string inequality_type;
  157. if (wp.is_gt_wp()) {
  158. inequality_type = "gt";
  159. } else if (wp.is_lt_wp()) {
  160. inequality_type = "lt";
  161. }
  162. parameter.Evaluate(StatLookup(parameter.name, wp), inequality_type);
  163. hit = hit || parameter.hit;
  164. }
  165. }
  166. return std::make_tuple(hit, static_cast<int32_t>(error_code.to_ulong()), parameter_list);
  167. }
  168. template <typename T>
  169. double_t TensorSummary<T>::StatLookup(const std::string &parameter_name, const DebugServices::watchpoint_t &wp) {
  170. if (parameter_name == "param") return StatLookup(wp);
  171. std::string param_type;
  172. auto pos = parameter_name.find_last_of('_');
  173. if (pos != std::string::npos) {
  174. param_type = parameter_name.substr(0, pos);
  175. }
  176. if (param_type == "max") {
  177. return max;
  178. } else if (param_type == "min") {
  179. return min;
  180. } else if (param_type == "max_min") {
  181. return max - min;
  182. } else if (param_type == "mean") {
  183. return current_mean_variance.GetMean();
  184. } else if (param_type == "sd") {
  185. return current_mean_variance.GetStandardDeviation();
  186. } else if (param_type == "abs_mean") {
  187. if (means.find("abs_current_mean") != means.end()) {
  188. return means["abs_current_mean"]->GetMean();
  189. }
  190. } else if (param_type == "abs_mean_update_ratio" && prev_tensor_ptr) {
  191. if (means.find("curr_prev_diff_mean") != means.end() && means.find("abs_prev_mean") != means.end()) {
  192. return means["curr_prev_diff_mean"]->GetMean() / (means["abs_prev_mean"]->GetMean() + epsilon);
  193. }
  194. } else if (param_type == "range_percentage") {
  195. if (range_counts.find(wp.id) != range_counts.end()) {
  196. return range_counts[wp.id]->GetPercentInRange();
  197. }
  198. } else if (param_type == "zero_percentage") {
  199. return GetZeroValPercent();
  200. }
  201. return std::numeric_limits<double_t>::quiet_NaN();
  202. }
  203. template <typename T>
  204. double_t TensorSummary<T>::StatLookup(const DebugServices::watchpoint_t &wp) {
  205. CONDITION_TYPE type = wp.condition.type;
  206. if (type == CONDITION_TYPE::MAX_LT || type == CONDITION_TYPE::MAX_GT) {
  207. return max;
  208. } else if (type == CONDITION_TYPE::MIN_LT || type == CONDITION_TYPE::MIN_GT) {
  209. return min;
  210. } else if (type == CONDITION_TYPE::MEAN_LT || type == CONDITION_TYPE::MEAN_GT) {
  211. return current_mean_variance.GetMean();
  212. } else if (type == CONDITION_TYPE::SD_LT || type == CONDITION_TYPE::SD_GT) {
  213. return current_mean_variance.GetStandardDeviation();
  214. } else if (type == CONDITION_TYPE::MAX_MIN_GT || type == CONDITION_TYPE::MAX_MIN_LT) {
  215. return max - min;
  216. }
  217. return std::numeric_limits<double_t>::quiet_NaN();
  218. }
  219. template <typename T>
  220. double_t TensorSummary<T>::GetZeroValPercent() {
  221. if (num_elements == 0) {
  222. return 0;
  223. }
  224. return (zero_count * 100.0) / num_elements;
  225. }
  226. template <typename T>
  227. void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoint_t> &wps) {
  228. for (auto &wp : wps) {
  229. auto wp_id = wp.id;
  230. mean_sd_cal_enabled = mean_sd_cal_enabled || wp.mean_sd_enabled();
  231. if (wp.allclose_enabled() && prev_tensor_ptr) {
  232. all_close[wp_id] = std::make_unique<AllCloseCalculator>();
  233. if (!wp.parameter_list[0].disabled) {
  234. all_close[wp_id]->set_atol(wp.parameter_list[0].value);
  235. }
  236. if (!wp.parameter_list[1].disabled) {
  237. all_close[wp_id]->set_rtol(wp.parameter_list[1].value);
  238. }
  239. } else if (wp.range_enabled()) {
  240. range_counts[wp_id] = std::make_unique<RangeCountCalculator>();
  241. if (!wp.parameter_list[0].disabled) {
  242. range_counts[wp_id]->set_range_start_inclusive(wp.parameter_list[0].value);
  243. }
  244. if (!wp.parameter_list[1].disabled) {
  245. range_counts[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
  246. }
  247. } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr) {
  248. means.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
  249. means.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
  250. } else if (wp.abs_mean_enabled()) {
  251. means.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
  252. }
  253. }
  254. }
  255. template class TensorSummary<uint8_t>;
  256. template class TensorSummary<int8_t>;
  257. template class TensorSummary<uint16_t>;
  258. template class TensorSummary<int16_t>;
  259. template class TensorSummary<uint32_t>;
  260. template class TensorSummary<int32_t>;
  261. template class TensorSummary<uint64_t>;
  262. template class TensorSummary<int64_t>;
  263. template class TensorSummary<float16>;
  264. template class TensorSummary<float>;
  265. template class TensorSummary<double>;
  266. template class TensorSummary<bool>;
  267. #ifdef ONLINE_DBG_MODE
  268. } // namespace mindspore
  269. #endif