You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

hcom_util.cc 7.0 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "kernel/hccl/hcom_util.h"
  17. #include <memory>
  18. #include "kernel/common_utils.h"
  19. #include "session/anf_runtime_algorithm.h"
  20. #include "utils/utils.h"
  21. namespace mindspore {
  22. bool HcomUtil::GetKernelInputShape(const AnfNodePtr &anf_node, vector<vector<size_t>> *hccl_kernel_intput_shape_list) {
  23. MS_EXCEPTION_IF_NULL(anf_node);
  24. MS_EXCEPTION_IF_NULL(hccl_kernel_intput_shape_list);
  25. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  26. std::vector<size_t> shape_i = AnfAlgo::GetInputDeviceShape(anf_node, i);
  27. hccl_kernel_intput_shape_list->emplace_back(shape_i);
  28. }
  29. return true;
  30. }
  31. bool HcomUtil::GetKernelOutputShape(const AnfNodePtr &anf_node, vector<vector<size_t>> *hccl_kernel_output_shape_list) {
  32. MS_EXCEPTION_IF_NULL(anf_node);
  33. MS_EXCEPTION_IF_NULL(hccl_kernel_output_shape_list);
  34. for (size_t i = 0; i < AnfAlgo::GetOutputTensorNum(anf_node); ++i) {
  35. std::vector<size_t> shape_i = AnfAlgo::GetOutputDeviceShape(anf_node, i);
  36. hccl_kernel_output_shape_list->emplace_back(shape_i);
  37. }
  38. return true;
  39. }
  40. bool HcomUtil::GetHcomDataType(const AnfNodePtr &anf_node, vector<hcclDataType_t> *data_type_list) {
  41. MS_EXCEPTION_IF_NULL(anf_node);
  42. MS_EXCEPTION_IF_NULL(data_type_list);
  43. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  44. auto type_ptr = AnfAlgo::GetPrevNodeOutputDeviceDataType(anf_node, i);
  45. auto iter = CONST_OP_HCOM_DATA_TYPE_MAP.find(type_ptr);
  46. if (iter == CONST_OP_HCOM_DATA_TYPE_MAP.end()) {
  47. MS_LOG(EXCEPTION) << "HcomDataType cann't support Current Ascend Data Type : " << type_ptr;
  48. }
  49. data_type_list->emplace_back(iter->second);
  50. }
  51. auto type_base = *(std::begin(*data_type_list));
  52. if (std::any_of(data_type_list->begin(), data_type_list->end(),
  53. [&type_base](hcclDataType_t type) { return type != type_base; })) {
  54. MS_LOG(ERROR) << "hccl have different data type";
  55. return false;
  56. }
  57. return true;
  58. }
  59. bool HcomUtil::GetHcclOpSize(const hcclDataType_t &data_type, const vector<size_t> &shape, size_t *size) {
  60. int tmp_size = 1;
  61. uint32_t type_size = 4;
  62. for (size_t i = 0; i < shape.size(); i++) {
  63. IntMulWithOverflowCheck(tmp_size, SizeToInt(shape[i]), &tmp_size);
  64. }
  65. if (!GetHcomTypeSize(data_type, &type_size)) {
  66. return false;
  67. }
  68. IntMulWithOverflowCheck(tmp_size, UintToInt(type_size), &tmp_size);
  69. *size = IntToSize(tmp_size);
  70. MS_LOG(INFO) << "size[" << *size << "]";
  71. return true;
  72. }
  73. bool HcomUtil::GetHcomTypeSize(const hcclDataType_t &data_type, uint32_t *size) {
  74. auto iter = CONST_OP_HCOM_DATA_TYPE_SIZE_MAP.find(data_type);
  75. if (iter == CONST_OP_HCOM_DATA_TYPE_SIZE_MAP.end()) {
  76. MS_LOG(ERROR) << "HcomUtil::HcomDataTypeSize, No DataTypeSize!";
  77. return false;
  78. }
  79. *size = iter->second;
  80. return true;
  81. }
  82. bool HcomUtil::GetHcomCount(const AnfNodePtr &anf_node, const vector<hcclDataType_t> &data_type_list,
  83. const vector<vector<size_t>> &shape_list, uint64_t *total_count) {
  84. MS_EXCEPTION_IF_NULL(anf_node);
  85. MS_EXCEPTION_IF_NULL(total_count);
  86. const uint32_t align_size = 512;
  87. const uint32_t filled_size = 32;
  88. uint64_t total_size = 0;
  89. uint64_t block_size;
  90. size_t input_size;
  91. uint32_t type_size = 4;
  92. for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(anf_node); ++i) {
  93. if (!GetHcomTypeSize(data_type_list[i], &type_size)) {
  94. return false;
  95. }
  96. if (!GetHcclOpSize(data_type_list[i], shape_list[i], &input_size)) {
  97. MS_LOG(ERROR) << "Get GetHcclOpSize failed";
  98. return false;
  99. }
  100. if (AnfAlgo::GetCNodeName(anf_node) == kReduceScatterOpName) {
  101. int32_t rank_size;
  102. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  103. MS_EXCEPTION_IF_NULL(primitive);
  104. if (primitive->GetAttr("rank_size") != nullptr) {
  105. rank_size = GetValue<int32_t>(primitive->GetAttr("rank_size"));
  106. } else {
  107. MS_LOG(ERROR) << "Get rank size failed";
  108. return false;
  109. }
  110. block_size = input_size / IntToSize(rank_size);
  111. total_size = total_size + block_size;
  112. } else {
  113. if (AnfAlgo::GetCNodeName(anf_node) == kAllGatherOpName) {
  114. block_size = input_size;
  115. } else {
  116. block_size = (input_size + align_size - 1 + filled_size) / align_size * align_size;
  117. }
  118. total_size = total_size + block_size;
  119. }
  120. }
  121. if (type_size == 0 || total_size % type_size != 0) {
  122. MS_LOG(ERROR) << "Total_size[" << total_size << "],Type_size[" << type_size << "] != 0, fail!";
  123. return false;
  124. }
  125. *total_count = total_size / type_size;
  126. return true;
  127. }
  128. bool HcomUtil::GetHcomOperationType(const AnfNodePtr &anf_node, hcclRedOp_t *op_type) {
  129. MS_EXCEPTION_IF_NULL(anf_node);
  130. MS_EXCEPTION_IF_NULL(op_type);
  131. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  132. MS_EXCEPTION_IF_NULL(primitive);
  133. if (primitive->GetAttr("op") == nullptr) {
  134. MS_LOG(ERROR) << "Get HCOM_ATTR_REDUCE_TYPE fail, not support!";
  135. return false;
  136. }
  137. auto hcom_op_type_get = GetValue<const char *>(primitive->GetAttr("op"));
  138. string hcom_op_type(hcom_op_type_get);
  139. if (hcom_op_type == "min") {
  140. *op_type = HCCL_REP_OP_MIN;
  141. } else if (hcom_op_type == "max") {
  142. *op_type = HCCL_REP_OP_MAX;
  143. } else if (hcom_op_type == "prod") {
  144. *op_type = HCCL_REP_OP_PROD;
  145. } else if (hcom_op_type == "sum") {
  146. *op_type = HCCL_REP_OP_SUM;
  147. } else {
  148. MS_LOG(ERROR) << "HcomUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [" << hcom_op_type << "] not support!";
  149. return false;
  150. }
  151. return true;
  152. }
  153. bool HcomUtil::GetHcomRootId(const AnfNodePtr &anf_node, uint32_t *root_id) {
  154. MS_EXCEPTION_IF_NULL(anf_node);
  155. MS_EXCEPTION_IF_NULL(root_id);
  156. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  157. MS_EXCEPTION_IF_NULL(primitive);
  158. if (primitive->GetAttr("root_rank") != nullptr) {
  159. *root_id = (uint32_t)GetValue<int>(primitive->GetAttr("root_rank"));
  160. } else {
  161. MS_LOG(ERROR) << "HcomUtil::Get HCOM_ATTR_ROOT_INDEX fail, not support!";
  162. return false;
  163. }
  164. return true;
  165. }
  166. void HcomUtil::GetHcomGroup(NotNull<const AnfNodePtr &> anf_node, NotNull<std::string *> group) {
  167. auto primitive = AnfAlgo::GetCNodePrimitive(anf_node);
  168. MS_EXCEPTION_IF_NULL(primitive);
  169. auto attr = primitive->GetAttr("group");
  170. if (attr != nullptr) {
  171. *group = GetValue<std::string>(attr);
  172. } else {
  173. MS_LOG(EXCEPTION) << "Get Hcom Group Attr of Op:" << anf_node->fullname_with_scope() << " failed";
  174. }
  175. }
  176. } // namespace mindspore