浏览代码

!13482 fixed assign rank id

From: @anancds
Reviewed-by: @limingqi107
Signed-off-by:
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 4 年前
父节点
当前提交
69bbf161e9
共有 3 个文件被更改,包括 14 次插入和 1 次删除
  1. +3
    -0
      mindspore/ccsrc/ps/core/abstract_node.cc
  2. +10
    -0
      mindspore/ccsrc/ps/core/node_manager.cc
  3. +1
    -1
      mindspore/ccsrc/ps/core/scheduler_node.cc

+ 3
- 0
mindspore/ccsrc/ps/core/abstract_node.cc 查看文件

@@ -50,6 +50,9 @@ void AbstractNode::ProcessRegisterResp(std::shared_ptr<MessageMeta> meta, const
<< " is not match the current node id:" << node_info_.node_id_;
}

+ if (register_resp_message.rank_id() < 0) {
+ MS_LOG(EXCEPTION) << "The rank id is wrong.";
+ }
node_info_.rank_id_ = register_resp_message.rank_id();

MS_LOG(INFO) << "The node id is:" << node_info_.node_id_ << ", and the rank id is:" << node_info_.rank_id_


+ 10
- 0
mindspore/ccsrc/ps/core/node_manager.cc 查看文件

@@ -39,6 +39,11 @@ int NodeManager::NextRankId(const RegisterMessage &register_message) {
uint32_t port = register_message.port();

rank_id = ++next_server_rank_id_;
+ if (IntToUint(rank_id) >= ClusterMetadata::instance()->total_server_num()) {
+ MS_LOG(WARNING) << "The rank id is greater than the number of servers.";
+ rank_id = -1;
+ --next_server_rank_id_;
+ }
NodeInfo node_info;
node_info.node_role_ = NodeRole::SERVER;
node_info.node_id_ = node_id;
@@ -50,6 +55,11 @@ int NodeManager::NextRankId(const RegisterMessage &register_message) {
<< " assign rank id:" << rank_id;
} else if (register_message.role() == NodeRole::WORKER) {
rank_id = ++next_worker_rank_id_;
+ if (IntToUint(rank_id) >= ClusterMetadata::instance()->total_worker_num()) {
+ MS_LOG(WARNING) << "The rank id is greater than the number of workers.";
+ rank_id = -1;
+ --next_worker_rank_id_;
+ }
NodeInfo node_info;
node_info.node_role_ = NodeRole::WORKER;
node_info.node_id_ = node_id;


+ 1
- 1
mindspore/ccsrc/ps/core/scheduler_node.cc 查看文件

@@ -120,7 +120,7 @@ void SchedulerNode::ProcessRegister(std::shared_ptr<TcpServer> server, std::shar
// assign worker node and server node rank id
int rank_id = node_manager_.NextRankId(register_message);
if (rank_id < 0) {
- MS_LOG(EXCEPTION) << "The rank id is wrong!";
+ MS_LOG(WARNING) << "The rank id is wrong!";
}
const std::string &node_id = register_message.node_id();
node_manager_.UpdateHeartbeat(node_id);


正在加载...
取消
保存