You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

init.cc 3.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687
  1. /**
  2. * Copyright 2021 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include "distributed/init.h"
  17. #include <vector>
  18. #include <string>
  19. #include "runtime/recovery/recovery_context.h"
  20. namespace mindspore {
  21. namespace distributed {
  22. using runtime::recovery::RecoveryContext;
  23. bool Initialize() {
  24. if (!InitializeCluster()) {
  25. MS_LOG(ERROR) << "Failed to initialize cluster.";
  26. return false;
  27. }
  28. #if ((defined ENABLE_CPU) && (!defined _WIN32))
  29. if (cluster::ClusterContext::instance()->initialized()) {
  30. // Server and Scheduler don't use collective communication library.
  31. auto node = cluster::ClusterContext::instance()->node();
  32. MS_EXCEPTION_IF_NULL(node);
  33. if (node->role() != ps::core::NodeRole::SCHEDULER && node->role() != ps::core::NodeRole::SERVER) {
  34. // Global rank id and size should be manually set if cluster is initialized by MindSpore communication framework.
  35. auto abstract_node =
  36. std::dynamic_pointer_cast<ps::core::AbstractNode>(cluster::ClusterContext::instance()->node());
  37. MS_EXCEPTION_IF_NULL(abstract_node);
  38. collective::CollectiveManager::instance()->set_global_rank_id(abstract_node->rank_id());
  39. collective::CollectiveManager::instance()->set_global_rank_size(abstract_node->worker_num());
  40. if (RecoveryContext::GetInstance()->enable_recovery()) {
  41. cluster::ClusterContext::instance()->WaitForClusterReady();
  42. }
  43. if (!InitializeCollective()) {
  44. MS_LOG(ERROR) << "Failed to initialize collective communication.";
  45. return false;
  46. }
  47. if (RecoveryContext::GetInstance()->enable_recovery()) {
  48. RecoveryContext::GetInstance()->set_global_rank_id(abstract_node->rank_id());
  49. RecoveryContext::GetInstance()->set_global_rank_size(abstract_node->worker_num());
  50. RecoveryContext::GetInstance()->ObtainGlobalLatestCkptInfo();
  51. }
  52. }
  53. }
  54. #endif
  55. return true;
  56. }
  57. bool Finalize() {
  58. if (!FinalizeCollective()) {
  59. MS_LOG(ERROR) << "Failed to finalize collective communication.";
  60. return false;
  61. }
  62. if (!FinalizeCluster()) {
  63. MS_LOG(ERROR) << "Failed to finalize cluster.";
  64. return false;
  65. }
  66. return true;
  67. }
  68. bool InitializeCluster() { return cluster::ClusterContext::instance()->Initialize(); }
  69. bool FinalizeCluster() { return cluster::ClusterContext::instance()->Finalize(); }
  70. bool InitializeCollective() { return collective::CollectiveManager::instance()->Initialize(); }
  71. bool FinalizeCollective() { return collective::CollectiveManager::instance()->Finalize(); }
  72. } // namespace distributed
  73. } // namespace mindspore