
mpi_wrapper.cc 2.9 kB
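This file implements MPIWrapper, a process-wide singleton that bootstraps distributed GPU training: it initializes MPI, determines the global rank id, world size, and node-local rank id, and broadcasts the NCCL unique id from rank 0 over MPI so that all ranks can join the same NCCL communicator.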

/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "device/gpu/distribution/mpi_wrapper.h"
#include <unistd.h>  // gethostname
#include <cuda_runtime_api.h>
#include <functional>  // std::hash
#include <string>
#include <vector>
#include "device/gpu/distribution/nccl_wrapper.h"

namespace mindspore {
namespace device {
namespace gpu {
MPIWrapper::MPIWrapper() : rank_id_(0), rank_size_(0), local_rank_id_(0) { Init(); }

MPIWrapper::~MPIWrapper() {
  // Finalize MPI only if it has not already been finalized elsewhere.
  int finalized;
  MPI_Finalized(&finalized);
  if (finalized == 0) {
    MPI_Finalize();
  }
}

MPIWrapper &MPIWrapper::instance() {
  static MPIWrapper instance;
  return instance;
}

int MPIWrapper::local_rank_id() const { return local_rank_id_; }

void MPIWrapper::Init() {
  // Initialize MPI if the launcher (e.g. mpirun) has not done so already.
  int initialized;
  CHECK_RET(MPI_Initialized(&initialized), MPI_SUCCESS, "Failed to check mpi initialization status.");
  if (initialized == 0) {
    CHECK_RET(MPI_Init(nullptr, nullptr), MPI_SUCCESS, "Failed to init mpi.");
  }

  // Query the global rank id and world size, and hand them to the NCCL wrapper.
  CHECK_RET(MPI_Comm_rank(MPI_COMM_WORLD, &rank_id_), MPI_SUCCESS, "Failed to init mpi rank id.");
  CHECK_RET(MPI_Comm_size(MPI_COMM_WORLD, &rank_size_), MPI_SUCCESS, "Failed to init mpi rank size.");
  NCCLWrapper::instance().set_rank(rank_id_, rank_size_);
  AssignLocalRankId();

  // Rank 0 generates the NCCL unique id and broadcasts it so that every rank
  // can join the same NCCL communicator.
  ncclUniqueId unique_id;
  if (rank_id_ == 0) {
    unique_id = NCCLWrapper::instance().nccl_unique_id();
  }
  CHECK_RET(MPI_Bcast(reinterpret_cast<void *>(&unique_id), sizeof(unique_id), MPI_BYTE, 0, MPI_COMM_WORLD),
            MPI_SUCCESS, "Failed to broadcast nccl unique id.");
  NCCLWrapper::instance().set_nccl_unique_id(unique_id);
}

void MPIWrapper::AssignLocalRankId() {
  // Hash the host name so that ranks on the same node produce the same value.
  char host_name[MAX_HOSTNAME_LEN] = {0};
  CHECK_RET(gethostname(host_name, MAX_HOSTNAME_LEN), 0, "Getting host name failed.");
  size_t host_hash = std::hash<std::string>()(host_name);

  // Gather every rank's host hash. rank_size_ is only known at run time, so a
  // vector is used instead of a non-standard variable-length array.
  std::vector<size_t> all_host_hashs(rank_size_);
  all_host_hashs[rank_id_] = host_hash;
  CHECK_RET(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_host_hashs.data(), sizeof(size_t), MPI_BYTE,
                          MPI_COMM_WORLD),
            MPI_SUCCESS, "MPI_Allgather host hashs failed.");

  // The local rank id is the number of lower global ranks running on the same host.
  for (int global_rank = 0; global_rank < rank_size_; global_rank++) {
    if (global_rank == rank_id_) {
      break;
    }
    if (all_host_hashs[global_rank] == all_host_hashs[rank_id_]) {
      local_rank_id_++;
    }
  }
}
}  // namespace gpu
}  // namespace device
}  // namespace mindspore
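
For context, a minimal sketch of how a caller might use this wrapper. The node-local rank id computed by AssignLocalRankId() is the natural choice for selecting a CUDA device when several ranks share one machine. The function below is a hypothetical usage pattern, not code from this file; only MPIWrapper::instance() and local_rank_id() come from the wrapper itself.

// Hypothetical caller sketch (not part of mpi_wrapper.cc): bind each rank to
// a GPU using the node-local rank id.
#include <cuda_runtime_api.h>
#include "device/gpu/distribution/mpi_wrapper.h"

void BindDeviceForThisRank() {
  // instance() triggers Init() on first use: MPI setup, rank discovery,
  // and the NCCL unique-id broadcast all happen there.
  int local_rank = mindspore::device::gpu::MPIWrapper::instance().local_rank_id();
  // One GPU per process: local rank 0 takes device 0, local rank 1 takes device 1, ...
  cudaSetDevice(local_rank);
}

Because the broadcast in Init() gives every rank the same ncclUniqueId, each process can subsequently initialize its NCCL communicator with its global rank id and the shared id, which is exactly the state this wrapper hands to NCCLWrapper.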