From 5ebd8fd3915d6ff3dfff458981a6566350306108 Mon Sep 17 00:00:00 2001 From: chendongsheng Date: Wed, 17 Mar 2021 16:35:44 +0800 Subject: [PATCH] fixed assign rank id --- mindspore/ccsrc/ps/core/abstract_node.cc | 3 +++ mindspore/ccsrc/ps/core/node_manager.cc | 10 ++++++++++ mindspore/ccsrc/ps/core/scheduler_node.cc | 2 +- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/mindspore/ccsrc/ps/core/abstract_node.cc b/mindspore/ccsrc/ps/core/abstract_node.cc index 4de1604b01..01f8edf32d 100644 --- a/mindspore/ccsrc/ps/core/abstract_node.cc +++ b/mindspore/ccsrc/ps/core/abstract_node.cc @@ -50,6 +50,9 @@ void AbstractNode::ProcessRegisterResp(std::shared_ptr meta, const << " is not match the current node id:" << node_info_.node_id_; } + if (register_resp_message.rank_id() < 0) { + MS_LOG(EXCEPTION) << "The rank id is wrong."; + } node_info_.rank_id_ = register_resp_message.rank_id(); MS_LOG(INFO) << "The node id is:" << node_info_.node_id_ << ", and the rank id is:" << node_info_.rank_id_ diff --git a/mindspore/ccsrc/ps/core/node_manager.cc b/mindspore/ccsrc/ps/core/node_manager.cc index ed570b0ca2..ab3dfee8de 100644 --- a/mindspore/ccsrc/ps/core/node_manager.cc +++ b/mindspore/ccsrc/ps/core/node_manager.cc @@ -39,6 +39,11 @@ int NodeManager::NextRankId(const RegisterMessage &register_message) { uint32_t port = register_message.port(); rank_id = ++next_server_rank_id_; + if (IntToUint(rank_id) >= ClusterMetadata::instance()->total_server_num()) { + MS_LOG(WARNING) << "The rank id is greater than the number of servers."; + rank_id = -1; + --next_server_rank_id_; + } NodeInfo node_info; node_info.node_role_ = NodeRole::SERVER; node_info.node_id_ = node_id; @@ -50,6 +55,11 @@ int NodeManager::NextRankId(const RegisterMessage &register_message) { << " assign rank id:" << rank_id; } else if (register_message.role() == NodeRole::WORKER) { rank_id = ++next_worker_rank_id_; + if (IntToUint(rank_id) >= ClusterMetadata::instance()->total_worker_num()) { + MS_LOG(WARNING) << 
"The rank id is greater than the number of workers."; + rank_id = -1; + --next_worker_rank_id_; + } NodeInfo node_info; node_info.node_role_ = NodeRole::WORKER; node_info.node_id_ = node_id; diff --git a/mindspore/ccsrc/ps/core/scheduler_node.cc b/mindspore/ccsrc/ps/core/scheduler_node.cc index 200f468b8a..378e8399ca 100644 --- a/mindspore/ccsrc/ps/core/scheduler_node.cc +++ b/mindspore/ccsrc/ps/core/scheduler_node.cc @@ -120,7 +120,7 @@ void SchedulerNode::ProcessRegister(std::shared_ptr server, std::shar // assign worker node and server node rank id int rank_id = node_manager_.NextRankId(register_message); if (rank_id < 0) { - MS_LOG(EXCEPTION) << "The rank id is wrong!"; + MS_LOG(WARNING) << "The rank id is wrong!"; } const std::string &node_id = register_message.node_id(); node_manager_.UpdateHeartbeat(node_id);