From 246f1bcd06ecbea60905ff91da4e271b4a28cf36 Mon Sep 17 00:00:00 2001 From: ZPaC Date: Wed, 5 Jan 2022 15:19:09 +0800 Subject: [PATCH] Fix get rank size func in alltoall fusion --- .../backend/optimizer/gpu/alltoall_fusion.cc | 16 +--------------- .../ccsrc/distributed/cluster/cluster_context.cc | 9 +++++++-- .../ccsrc/distributed/cluster/cluster_context.h | 7 +++++++ .../python/mindspore/communication/management.py | 2 +- 4 files changed, 16 insertions(+), 18 deletions(-) diff --git a/mindspore/ccsrc/backend/optimizer/gpu/alltoall_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/alltoall_fusion.cc index ace626e196..250ca29926 100644 --- a/mindspore/ccsrc/backend/optimizer/gpu/alltoall_fusion.cc +++ b/mindspore/ccsrc/backend/optimizer/gpu/alltoall_fusion.cc @@ -37,20 +37,6 @@ inline int64_t NormalizeDim(const std::vector &shape, int64_t dim) { return dim < 0 ? SizeToLong(shape.size()) + dim : dim; } -uint32_t GetRankSize(const std::string &group) { - uint32_t rank_size; - const void *collective_handle_ = device::gpu::CollectiveInitializer::instance().collective_handle(); - MS_EXCEPTION_IF_NULL(collective_handle_); - - // Get group size - auto get_group_size_funcptr = - reinterpret_cast(dlsym(const_cast(collective_handle_), "GetGroupRanks")); - MS_EXCEPTION_IF_NULL(get_group_size_funcptr); - std::vector group_ranks = (*get_group_size_funcptr)(group); - rank_size = group_ranks.size(); - return rank_size; -} - CNodePtr CreateSplitNode(const FuncGraphPtr &graph, const CNodePtr &all_to_all) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(all_to_all); @@ -113,7 +99,7 @@ CNodePtr CreateAllToAllvNode(const FuncGraphPtr &graph, const CNodePtr &all_to_a std::vector dtypes(split_count, single_type); std::vector> shapes(split_count, single_shape); AnfAlgo::SetOutputInferTypeAndShape(dtypes, shapes, all_to_all_v.get()); - uint32_t rank_size = GetRankSize(group); + uint32_t rank_size = device::gpu::CollectiveInitializer::instance().GetGroupSize(group); std::vector 
rank_ids(rank_size, 0); for (uint32_t i = 0; i < rank_size; ++i) { rank_ids[i] = static_cast(i); diff --git a/mindspore/ccsrc/distributed/cluster/cluster_context.cc b/mindspore/ccsrc/distributed/cluster/cluster_context.cc index f7235c4de7..5a6654462d 100644 --- a/mindspore/ccsrc/distributed/cluster/cluster_context.cc +++ b/mindspore/ccsrc/distributed/cluster/cluster_context.cc @@ -156,7 +156,7 @@ bool ClusterContext::BuildCluster() { void ClusterContext::InitNodeRole() { node_role_ = common::GetEnv(kEnvRole); if (kValidRoleName.count(node_role_) == 0) { - MS_LOG(EXCEPTION) << "Role name " << node_role_ << " is invalid."; + MS_LOG(EXCEPTION) << "Role name '" << node_role_ << "' is invalid. " << kDetailedFailureReason; return; } @@ -177,7 +177,12 @@ void ClusterContext::InitNodeRole() { } } -void ClusterContext::InitSchedulerIp() { scheduler_host_ = common::GetEnv(kEnvSchedulerHost); } +void ClusterContext::InitSchedulerIp() { + scheduler_host_ = common::GetEnv(kEnvSchedulerHost); + if (scheduler_host_.empty()) { + MS_LOG(EXCEPTION) << kEnvSchedulerHost << " is empty. " << kDetailedFailureReason; + } +} void ClusterContext::InitSchedulerPort() { TRY_AND_CATCH_WITH_EXCEPTION((scheduler_port_ = static_cast(std::stoi(common::GetEnv(kEnvSchedulerPort)))), diff --git a/mindspore/ccsrc/distributed/cluster/cluster_context.h b/mindspore/ccsrc/distributed/cluster/cluster_context.h index c36ae69fe2..bb0a4f37d8 100644 --- a/mindspore/ccsrc/distributed/cluster/cluster_context.h +++ b/mindspore/ccsrc/distributed/cluster/cluster_context.h @@ -37,6 +37,13 @@ namespace mindspore { namespace distributed { namespace cluster { +// The detailed reason of failing to run 'mindspore.communication.init()' with ClusterContext. +constexpr char kDetailedFailureReason[] = + "Maybe you are trying to call 'mindspore.communication.init()' without using 'mpirun', which will make MindSpore " + "load several environment variables and check their validation. 
Please use 'mpirun' to launch this process to fix " + "this issue, or refer to this link if you want to run distributed training without using 'mpirun': " + "https://www.mindspore.cn/docs/programming_guide/zh-CN/master/distributed_training_gpu.html#openmpi."; + // Node role based cluster built by MindSpore communication framework. class ClusterContext { public: diff --git a/mindspore/python/mindspore/communication/management.py b/mindspore/python/mindspore/communication/management.py index 63e3bc9364..f004f6ad52 100755 --- a/mindspore/python/mindspore/communication/management.py +++ b/mindspore/python/mindspore/communication/management.py @@ -90,7 +90,7 @@ def init(backend_name=None): The full name of HCCL is Huawei Collective Communication Library. The full name of NCCL is NVIDIA Collective Communication Library. This method should be used after set_context. The user needs to preset communication environment variables - before running the following example, please see the docstring of the mindspore.managerment. + before running the following example, please see the docstring of the mindspore.management. Args: backend_name (str): Backend, using HCCL/NCCL. If the `backend_name` is None, system will recognize