You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

hcom_util.cc 7.0 kB

5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "kernel/hccl/hcom_util.h"
  17. #include <memory>
  18. #include "kernel/common_utils.h"
  19. #include "session/anf_runtime_algorithm.h"
  20. #include "utils/utils.h"
  21. namespace mindspore {
  22. bool HcomUtil::GetKernelInputShape(const AnfNodePtr &anf_node, vector<vector<size_t>> *hccl_kernel_intput_shape_list) {
  23. MS_EXCEPTION_IF_NULL(anf_node);
  24. MS_EXCEPTION_IF_NULL(hccl_kernel_intput_shape_list);
  25. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  26. std::vector<size_t> shape_i = AnfAlgo::GetInputDeviceShape(anf_node, i);
  27. hccl_kernel_intput_shape_list->emplace_back(shape_i);
  28. }
  29. return true;
  30. }
  31. bool HcomUtil::GetKernelOutputShape(const AnfNodePtr &anf_node, vector<vector<size_t>> *hccl_kernel_output_shape_list) {
  32. MS_EXCEPTION_IF_NULL(anf_node);
  33. MS_EXCEPTION_IF_NULL(hccl_kernel_output_shape_list);
  34. for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(anf_node); ++i) {
  35. std::vector<size_t> shape_i = AnfAlgo::GetOutputDeviceShape(anf_node, i);
  36. hccl_kernel_output_shape_list->emplace_back(shape_i);
  37. }
  38. return true;
  39. }
  40. bool HcomUtil::GetHcomDataType(const AnfNodePtr &anf_node, vector<hcclDataType_t> *data_type_list) {
  41. MS_EXCEPTION_IF_NULL(anf_node);
  42. MS_EXCEPTION_IF_NULL(data_type_list);
  43. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  44. auto type_ptr = AnfAlgo::GetPrevNodeOutputDeviceDataType(anf_node, i);
  45. auto iter = CONST_OP_HCOM_DATA_TYPE_MAP.find(type_ptr);
  46. if (iter == CONST_OP_HCOM_DATA_TYPE_MAP.end()) {
  47. MS_LOG(EXCEPTION) << "HcomDataType cann't support Current Ascend Data Type : " << type_ptr;
  48. }
  49. data_type_list->emplace_back(iter->second);
  50. }
  51. auto type_base = *(std::begin(*data_type_list));
  52. if (std::any_of(data_type_list->begin(), data_type_list->end(),
  53. [&type_base](hcclDataType_t type) { return type != type_base; })) {
  54. MS_LOG(ERROR) << "hccl have different data type";
  55. return false;
  56. }
  57. return true;
  58. }
  59. bool HcomUtil::GetHcclOpSize(const hcclDataType_t &data_type, const vector<size_t> &shape, size_t *size) {
  60. MS_EXCEPTION_IF_NULL(size);
  61. size_t tmp_size = 1;
  62. uint32_t type_size = 4;
  63. for (size_t i = 0; i < shape.size(); i++) {
  64. tmp_size = SizetMulWithOverflowCheck(tmp_size, shape[i]);
  65. }
  66. if (!GetHcomTypeSize(data_type, &type_size)) {
  67. return false;
  68. }
  69. *size = SizetMulWithOverflowCheck(tmp_size, type_size);
  70. MS_LOG(INFO) << "size[" << *size << "]";
  71. return true;
  72. }
  73. bool HcomUtil::GetHcomTypeSize(const hcclDataType_t &data_type, uint32_t *size) {
  74. MS_EXCEPTION_IF_NULL(size);
  75. auto iter = CONST_OP_HCOM_DATA_TYPE_SIZE_MAP.find(data_type);
  76. if (iter == CONST_OP_HCOM_DATA_TYPE_SIZE_MAP.end()) {
  77. MS_LOG(ERROR) << "HcomUtil::HcomDataTypeSize, No DataTypeSize!";
  78. return false;
  79. }
  80. *size = iter->second;
  81. return true;
  82. }
  83. bool HcomUtil::GetHcomCount(const AnfNodePtr &anf_node, const vector<hcclDataType_t> &data_type_list,
  84. const vector<vector<size_t>> &shape_list, uint64_t *total_count) {
  85. MS_EXCEPTION_IF_NULL(anf_node);
  86. MS_EXCEPTION_IF_NULL(total_count);
  87. const uint32_t align_size = 512;
  88. const uint32_t filled_size = 32;
  89. uint64_t total_size = 0;
  90. uint64_t block_size;
  91. size_t input_size;
  92. uint32_t type_size = 4;
  93. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  94. if (!GetHcomTypeSize(data_type_list[i], &type_size)) {
  95. return false;
  96. }
  97. if (!GetHcclOpSize(data_type_list[i], shape_list[i], &input_size)) {
  98. MS_LOG(ERROR) << "Get GetHcclOpSize failed";
  99. return false;
  100. }
  101. if (AnfAlgo::GetCNodeName(anf_node) == kReduceScatterOpName) {
  102. int32_t rank_size;
  103. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  104. MS_EXCEPTION_IF_NULL(primitive);
  105. if (primitive->GetAttr("rank_size") != nullptr) {
  106. rank_size = GetValue<int32_t>(primitive->GetAttr("rank_size"));
  107. } else {
  108. MS_LOG(ERROR) << "Get rank size failed";
  109. return false;
  110. }
  111. block_size = input_size / IntToSize(rank_size);
  112. total_size = total_size + block_size;
  113. } else {
  114. if (AnfAlgo::GetCNodeName(anf_node) == kAllGatherOpName) {
  115. block_size = input_size;
  116. } else {
  117. block_size = (input_size + align_size - 1 + filled_size) / align_size * align_size;
  118. }
  119. total_size = total_size + block_size;
  120. }
  121. }
  122. if (type_size == 0 || total_size % type_size != 0) {
  123. MS_LOG(ERROR) << "Total_size[" << total_size << "],Type_size[" << type_size << "] != 0, fail!";
  124. return false;
  125. }
  126. *total_count = total_size / type_size;
  127. return true;
  128. }
  129. bool HcomUtil::GetHcomOperationType(const AnfNodePtr &anf_node, hcclRedOp_t *op_type) {
  130. MS_EXCEPTION_IF_NULL(anf_node);
  131. MS_EXCEPTION_IF_NULL(op_type);
  132. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  133. MS_EXCEPTION_IF_NULL(primitive);
  134. if (primitive->GetAttr("op") == nullptr) {
  135. MS_LOG(ERROR) << "Get HCOM_ATTR_REDUCE_TYPE fail, not support!";
  136. return false;
  137. }
  138. auto hcom_op_type_get = GetValue<const char *>(primitive->GetAttr("op"));
  139. string hcom_op_type(hcom_op_type_get);
  140. if (hcom_op_type == "min") {
  141. *op_type = HCCL_REP_OP_MIN;
  142. } else if (hcom_op_type == "max") {
  143. *op_type = HCCL_REP_OP_MAX;
  144. } else if (hcom_op_type == "prod") {
  145. *op_type = HCCL_REP_OP_PROD;
  146. } else if (hcom_op_type == "sum") {
  147. *op_type = HCCL_REP_OP_SUM;
  148. } else {
  149. MS_LOG(ERROR) << "HcomUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [" << hcom_op_type << "] not support!";
  150. return false;
  151. }
  152. return true;
  153. }
  154. bool HcomUtil::GetHcomRootId(const AnfNodePtr &anf_node, uint32_t *root_id) {
  155. MS_EXCEPTION_IF_NULL(anf_node);
  156. MS_EXCEPTION_IF_NULL(root_id);
  157. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  158. MS_EXCEPTION_IF_NULL(primitive);
  159. if (primitive->GetAttr("root_rank") != nullptr) {
  160. *root_id = (uint32_t)GetValue<int>(primitive->GetAttr("root_rank"));
  161. } else {
  162. MS_LOG(ERROR) << "HcomUtil::Get HCOM_ATTR_ROOT_INDEX fail, not support!";
  163. return false;
  164. }
  165. return true;
  166. }
  167. void HcomUtil::GetHcomGroup(NotNull<const AnfNodePtr &> anf_node, NotNull<std::string *> group) {
  168. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  169. MS_EXCEPTION_IF_NULL(primitive);
  170. auto attr = primitive->GetAttr("group");
  171. if (attr != nullptr) {
  172. *group = GetValue<std::string>(attr);
  173. } else {
  174. MS_LOG(EXCEPTION) << "Get Hcom Group Attr of Op:" << anf_node->fullname_with_scope() << " failed";
  175. }
  176. }
  177. } // namespace mindspore