From 1f5d8102a5e2e00df1c00a5e3e83d46cdcfcb879 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9Cxujincai=E2=80=9D?= <“xujincai@huawei.com”> Date: Thu, 25 Feb 2021 17:03:07 +0800 Subject: [PATCH] fix in hccl config --- .../distributed_process.cc | 4 - .../distributed_servable.cc | 14 ++-- .../ut/cpp/tests/test_agent_config_acquire.cc | 27 +------ .../cpp/tests/test_init_config_on_start_up.cc | 73 +++++++++++++++---- 4 files changed, 64 insertions(+), 54 deletions(-) diff --git a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc index aec7388..d136867 100644 --- a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc +++ b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc @@ -72,10 +72,6 @@ grpc::Status MSDistributedImpl::AgentConfigAcquire(grpc::ServerContext *context, MSI_LOG(ERROR) << "Get distributed servable config failed"; return grpc::Status::CANCELLED; } - if (agent_config.rank_list.empty()) { - MSI_LOG(ERROR) << "Get distributed servable config failed, config_ not init"; - return grpc::Status::CANCELLED; - } MSI_LOG(INFO) << "Begin to set DistributedServableConfig info in reply message"; // set reply message:AgentConfigAcquireReply, parameter:rank_table_content diff --git a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc index ae2e30e..f472a85 100644 --- a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc +++ b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc @@ -233,12 +233,12 @@ Status DistributedServable::StartServable(const std::string &servable_directory, MSI_LOG_ERROR << "Init with rank table on start up failed"; return status; } - config_loaded_ = true; status = CheckRankConfig(); if (status != SUCCESS) { MSI_LOG_ERROR << "Check rank config failed"; return status; } + config_loaded_ = true; status = WaitAgentsReady(wait_agents_time_in_seconds); if (status != SUCCESS) { MSI_LOG_ERROR << "Waiting for ready of agents failed"; @@ -382,10 +382,7 @@ Status DistributedServable::ParserRankTableWithGroupList(const std::string &rank MSI_LOG_ERROR << "Convert rank_id from string to int failed"; return status; } - if (temp_device_id > temp_rank_id) { - return INFER_STATUS_LOG_ERROR(INVALID_INPUTS) - << "device_id large than rank_id in" << rank_table_json_file.c_str(); - } + if (rank_id != temp_rank_id) { return INFER_STATUS_LOG_ERROR(INVALID_INPUTS) << "device size not match rank_id in" << rank_table_json_file.c_str(); @@ -462,10 +459,6 @@ Status DistributedServable::ParserRankTableWithServerList(const std::string &ran return status; } - if (temp_device_id > temp_rank_id) { - return INFER_STATUS_LOG_ERROR(INVALID_INPUTS) - << "device_id large than rank_id in" << rank_table_json_file.c_str(); - } if (rank_id != temp_rank_id) { return INFER_STATUS_LOG_ERROR(INVALID_INPUTS) << "device size not match rank_id in" << rank_table_json_file.c_str(); @@ -615,6 +608,9 @@ Status DistributedServable::CheckRankConfig() { return INFER_STATUS_LOG_ERROR(FAILED) << "Check rank table config failed, device id repeatedly used by rank " << i << " in device ip " << item.ip; } + if (item.device_id >= card_count_per_machine) { + return INFER_STATUS_LOG_ERROR(FAILED) << "Check rank table config failed, device id cannot larger than 8"; + } device_id_list.emplace(item.device_id); } } else { diff --git a/tests/ut/cpp/tests/test_agent_config_acquire.cc b/tests/ut/cpp/tests/test_agent_config_acquire.cc index 02b7645..08a129c 100644 --- a/tests/ut/cpp/tests/test_agent_config_acquire.cc +++ b/tests/ut/cpp/tests/test_agent_config_acquire.cc @@ -78,32 +78,7 @@ TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_success) { TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_not_load_config_failed) { std::shared_ptr servable = std::make_shared(); - servable->config_loaded_ = true; - const std::string server_address = "any_addr"; - MSDistributedImpl mSDistributedImpl(servable, server_address); - grpc::ServerContext context; - const proto::AgentConfigAcquireRequest request; - proto::AgentConfigAcquireReply reply; - const grpc::Status status = mSDistributedImpl.AgentConfigAcquire(&context, &request, &reply); - ASSERT_EQ(status.error_code(), 1); -} - -TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_not_init_config_failed) { - std::shared_ptr servable = std::make_shared(); - std::string rank_table_content = "rank table content"; - CommonServableMeta commonServableMeta; - commonServableMeta.servable_name = "servable_name"; - commonServableMeta.outputs_count = 1; - commonServableMeta.inputs_count = 1; - commonServableMeta.with_batch_dim = false; - commonServableMeta.without_batch_dim_inputs.push_back(8); - DistributedServableMeta distributedServableMeta; - distributedServableMeta.stage_size = 8; - distributedServableMeta.rank_size = 8; - servable->config_.rank_table_content = rank_table_content; - servable->config_.common_meta = commonServableMeta; - servable->config_.distributed_meta = distributedServableMeta; - servable->config_loaded_ = true; + servable->config_loaded_ = false; const std::string server_address = "any_addr"; MSDistributedImpl mSDistributedImpl(servable, server_address); grpc::ServerContext context; diff --git a/tests/ut/cpp/tests/test_init_config_on_start_up.cc b/tests/ut/cpp/tests/test_init_config_on_start_up.cc index fcbc7e9..470c542 100644 --- a/tests/ut/cpp/tests/test_init_config_on_start_up.cc +++ b/tests/ut/cpp/tests/test_init_config_on_start_up.cc @@ -233,7 +233,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_failed ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_failed) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_failed) { nlohmann::json rank_table_server_list = R"( { "version": "1.0", @@ -241,8 +241,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_fai "server_list": [ { "server_id": "10.155.111.140", - "device": [ - {"device_ip": "192.1.27.6","rank_id": "0"}], + "device": "dsfds", "host_nic_ip": "reserve" } ], @@ -254,16 +253,36 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_fai ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_empty_device_failed) { nlohmann::json rank_table_server_list = R"( { "version": "1.0", "server_count": "1", "server_list": [ { - "server_id": "", + "server_id": "10.155.111.140", + "device": [], + "host_nic_ip": "reserve" + } + ], + "status": "completed" + } + )"_json; + auto servable = std::make_shared(); + auto status = servable->ParserRankTableWithServerList("rank_table_file", rank_table_server_list); + ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); +} + +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_failed) { + nlohmann::json rank_table_server_list = R"( + { + "version": "1.0", + "server_count": "1", + "server_list": [ + { + "server_id": "10.155.111.140", "device": [ - {"device_id": "1wdb","device_ip": "192.1.27.6","rank_id": "0"}], + {"device_ip": "192.1.27.6","rank_id": "0"}], "host_nic_ip": "reserve" } ], @@ -275,7 +294,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed2) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed) { nlohmann::json rank_table_server_list = R"( { "version": "1.0", @@ -284,7 +303,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id { "server_id": "", "device": [ - {"device_id": "8","device_ip": "192.1.27.6","rank_id": "0"}], + {"device_id": "1wdb","device_ip": "192.1.27.6","rank_id": "0"}], "host_nic_ip": "reserve" } ], @@ -603,7 +622,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_device_id_failed) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_devices_failed) { nlohmann::json rank_table_server_list = R"( { "board_id": "0x0000", @@ -612,7 +631,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de "instance_count": "1", "instance_list": [ { - "devices": [{"device_ip": "192.1.27.6"}], + "devices": "rtrt", "rank_id": "0", "server_id": "10.155.111.140" } @@ -627,7 +646,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_empty_devices_failed) { nlohmann::json rank_table_server_list = R"( { "board_id": "0x0000", @@ -636,7 +655,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali "instance_count": "1", "instance_list": [ { - "devices": [{"device_id": "wd1gt2", "device_ip": "192.1.27.6"}], + "devices": [], "rank_id": "0", "server_id": "10.155.111.140" } @@ -651,7 +670,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); } -TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed2) { +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_device_id_failed) { nlohmann::json rank_table_server_list = R"( { "board_id": "0x0000", @@ -660,7 +679,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali "instance_count": "1", "instance_list": [ { - "devices": [{"device_id": "8", "device_ip": "192.1.27.6"}], + "devices": [{"device_ip": "192.1.27.6"}], "rank_id": "0", "server_id": "10.155.111.140" } @@ -671,7 +690,31 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali } )"_json; auto servable = std::make_shared(); - auto status = servable->ParserRankTableWithServerList("rank_table_file", rank_table_server_list); + auto status = servable->ParserRankTableWithGroupList("rank_table_file", rank_table_server_list); + ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); +} + +TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed) { + nlohmann::json rank_table_server_list = R"( + { + "board_id": "0x0000", + "group_list": [ + { + "instance_count": "1", + "instance_list": [ + { + "devices": [{"device_id": "wd1gt2", "device_ip": "192.1.27.6"}], + "rank_id": "0", + "server_id": "10.155.111.140" + } + ] + } + ], + "status": "completed" + } + )"_json; + auto servable = std::make_shared(); + auto status = servable->ParserRankTableWithGroupList("rank_table_file", rank_table_server_list); ASSERT_EQ(status.StatusCode(), INVALID_INPUTS); }