You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

comm_manager.cc 3.8 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "utils/comm_manager.h"
  17. #include "utils/convert_utils.h"
  18. #ifndef NO_DLIB
  19. #include "hccl/hcom.h"
  20. #endif
  21. namespace mindspore {
  22. CommManager &CommManager::GetInstance() noexcept {
  23. static CommManager instance("hccl");
  24. return instance;
  25. }
  26. #ifndef NO_DLIB
  // Evaluates the HCCL call `op` exactly once. If its result is not
  // tagHcclResult::HCCL_SUCCESS, logs "<op_name> failed: #<group>#" at ERROR
  // level and makes the *enclosing function* return false, so the macro may
  // only be used inside a function whose return type accepts `false`.
  //
  // Every macro argument is parenthesized at its point of use so that compound
  // expressions (for example a ternary passed as `op_name`) expand with the
  // intended precedence instead of being absorbed into the `<<` chain.
  #define HCCL_RUN_CHECK(op_name, group, op)                          \
    do {                                                              \
      auto hccl_result = (op);                                        \
      if (hccl_result != tagHcclResult::HCCL_SUCCESS) {               \
        MS_LOG(ERROR) << (op_name) << " failed: #" << (group) << "#"; \
        return false;                                                 \
      }                                                               \
    } while (0)
  // Rejects an empty group name: when `group` is an empty string, logs an
  // ERROR and makes the *enclosing function* return false; otherwise does
  // nothing. Must be used inside a function whose return type accepts `false`.
  //
  // `group` is parenthesized so that an expression argument such as `a + b`
  // is tested as a whole rather than expanding to `a + b.length()`.
  #define HCCL_GROUP_CHECK_EMPTY(group)                              \
    do {                                                             \
      if ((group).empty()) {                                         \
        MS_LOG(ERROR) << "The length of group name should not be 0"; \
        return false;                                                \
      }                                                              \
    } while (0)
  // Rejects the reserved name "hccl_world_group": when `group` compares equal
  // to it, logs an ERROR and makes the *enclosing function* return false;
  // otherwise does nothing. Must be used inside a function whose return type
  // accepts `false`.
  //
  // `group` is parenthesized so that a lower-precedence expression argument
  // (for example `cond ? a : b`) is compared as a whole.
  #define HCCL_GROUP_CHECK_IS_WORLD(group)                                \
    do {                                                                  \
      if ((group) == "hccl_world_group") {                                \
        MS_LOG(ERROR) << "The group name should not be hccl_world_group"; \
        return false;                                                     \
      }                                                                   \
    } while (0)
  49. bool CommManager::CreateGroupSync(const string &group, const vector<unsigned int> &rank_id_list) const {
  50. auto rank_size = rank_id_list.size();
  51. HCCL_GROUP_CHECK_EMPTY(group);
  52. HCCL_GROUP_CHECK_IS_WORLD(group);
  53. HCCL_RUN_CHECK(string("create communicate group"), group,
  54. hcom_create_group(group.c_str(), UlongToUint(rank_size), vector<unsigned int>(rank_id_list).data()));
  55. return true;
  56. }
  57. bool CommManager::GetRankID(const string &group, unsigned int *rank_id) const {
  58. HCCL_GROUP_CHECK_EMPTY(group);
  59. HCCL_RUN_CHECK(string("get rank_id"), group, hcom_get_rank_id(group.c_str(), rank_id));
  60. return true;
  61. }
  62. bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) const {
  63. HCCL_GROUP_CHECK_EMPTY(group);
  64. HCCL_RUN_CHECK(string("get rank size"), group, hcom_get_rank_size(group.c_str(), rank_size));
  65. return true;
  66. }
  67. bool CommManager::DestroyGroup(const string &group) const {
  68. HCCL_GROUP_CHECK_EMPTY(group);
  69. HCCL_GROUP_CHECK_IS_WORLD(group);
  70. HCCL_RUN_CHECK(string("destroy communicate group"), group, hcom_destroy_group(group.c_str()));
  71. return true;
  72. }
  73. #else
  74. bool CommManager::CreateGroupSync(const string &, const vector<unsigned int> &) const { return true; }
  75. bool CommManager::GetRankID(const string &group, unsigned int *rank_id) const { return true; }
  76. bool CommManager::GetRankSize(const string &group, unsigned int *rank_size) const {
  77. *rank_size = NO_COMM_DLIB_RANK_SIZE;
  78. return true;
  79. }
  80. bool CommManager::DestroyGroup(const string &group) const { return true; }
  81. #endif
  82. } // namespace mindspore