From 0c76dcb721763d91a4ceaf40f2cd8c6c9f259ab3 Mon Sep 17 00:00:00 2001 From: caifubi Date: Tue, 23 Mar 2021 21:18:33 +0800 Subject: [PATCH] add RANK_TABLE_FILE for PyNative Hccl --- .../kernel_compiler/hccl/hccl_context.cc | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_context.cc b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_context.cc index 58bceb4753..abf501ba0f 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_context.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/hccl/hccl_context.cc @@ -19,6 +19,7 @@ #include "hccl/hccl.h" constexpr auto kHcclConfigFile = "MINDSPORE_HCCL_CONFIG_PATH"; +constexpr auto kHcclConfigFileOld = "RANK_TABLE_FILE"; namespace mindspore { namespace kernel { @@ -37,10 +38,25 @@ bool HcclContext::InitHccl() { } auto config_file = std::getenv(kHcclConfigFile); if (config_file == nullptr) { - MS_LOG(ERROR) << "Get hccl config file failed"; + config_file = std::getenv(kHcclConfigFileOld); + if (config_file == nullptr) { + MS_LOG(ERROR) << "Get hccl rank table file failed. Please export MINDSPORE_HCCL_CONFIG_PATH or RANK_TABLE_FILE"; + return false; + } + } + + auto rank_id = GetRankId(); + try { + rank_id_ = std::stoi(rank_id); + } catch (std::invalid_argument &e) { + MS_LOG(ERROR) << "Invalid rankd id env:" << rank_id; + return false; + } + + if (rank_id_ < 0 || rank_id_ > 7) { + MS_LOG(ERROR) << "rank_id needs to be between 0-7"; return false; } - rank_id_ = std::stoi(GetRankId()); auto hccl_result = HcclCommInitClusterInfo(config_file, rank_id_, &hccl_comm_); if (hccl_result != HCCL_SUCCESS) {