Browse Source

fix in hccl config

tags/v1.2.0
“xujincai” 5 years ago
parent
commit
1f5d8102a5
4 changed files with 64 additions and 54 deletions
  1. +0
    -4
      mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc
  2. +5
    -9
      mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc
  3. +1
    -26
      tests/ut/cpp/tests/test_agent_config_acquire.cc
  4. +58
    -15
      tests/ut/cpp/tests/test_init_config_on_start_up.cc

+ 0
- 4
mindspore_serving/ccsrc/worker/distributed_worker/distributed_process/distributed_process.cc View File

@@ -72,10 +72,6 @@ grpc::Status MSDistributedImpl::AgentConfigAcquire(grpc::ServerContext *context,
MSI_LOG(ERROR) << "Get distributed servable config failed";
return grpc::Status::CANCELLED;
}
if (agent_config.rank_list.empty()) {
MSI_LOG(ERROR) << "Get distributed servable config failed, config_ not init";
return grpc::Status::CANCELLED;
}

MSI_LOG(INFO) << "Begin to set DistributedServableConfig info in reply message";
// set reply message:AgentConfigAcquireReply, parameter:rank_table_content


+ 5
- 9
mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc View File

@@ -233,12 +233,12 @@ Status DistributedServable::StartServable(const std::string &servable_directory,
MSI_LOG_ERROR << "Init with rank table on start up failed";
return status;
}
config_loaded_ = true;
status = CheckRankConfig();
if (status != SUCCESS) {
MSI_LOG_ERROR << "Check rank config failed";
return status;
}
config_loaded_ = true;
status = WaitAgentsReady(wait_agents_time_in_seconds);
if (status != SUCCESS) {
MSI_LOG_ERROR << "Waiting for ready of agents failed";
@@ -382,10 +382,7 @@ Status DistributedServable::ParserRankTableWithGroupList(const std::string &rank
MSI_LOG_ERROR << "Convert rank_id from string to int failed";
return status;
}
if (temp_device_id > temp_rank_id) {
return INFER_STATUS_LOG_ERROR(INVALID_INPUTS)
<< "device_id large than rank_id in" << rank_table_json_file.c_str();
}

if (rank_id != temp_rank_id) {
return INFER_STATUS_LOG_ERROR(INVALID_INPUTS)
<< "device size not match rank_id in" << rank_table_json_file.c_str();
@@ -462,10 +459,6 @@ Status DistributedServable::ParserRankTableWithServerList(const std::string &ran
return status;
}

if (temp_device_id > temp_rank_id) {
return INFER_STATUS_LOG_ERROR(INVALID_INPUTS)
<< "device_id large than rank_id in" << rank_table_json_file.c_str();
}
if (rank_id != temp_rank_id) {
return INFER_STATUS_LOG_ERROR(INVALID_INPUTS)
<< "device size not match rank_id in" << rank_table_json_file.c_str();
@@ -615,6 +608,9 @@ Status DistributedServable::CheckRankConfig() {
return INFER_STATUS_LOG_ERROR(FAILED) << "Check rank table config failed, device id repeatedly used by rank "
<< i << " in device ip " << item.ip;
}
if (item.device_id >= card_count_per_machine) {
return INFER_STATUS_LOG_ERROR(FAILED) << "Check rank table config failed, device id cannot larger than 8";
}
device_id_list.emplace(item.device_id);
}
} else {


+ 1
- 26
tests/ut/cpp/tests/test_agent_config_acquire.cc View File

@@ -78,32 +78,7 @@ TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_success) {

TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_not_load_config_failed) {
std::shared_ptr<DistributedServable> servable = std::make_shared<DistributedServable>();
servable->config_loaded_ = true;
const std::string server_address = "any_addr";
MSDistributedImpl mSDistributedImpl(servable, server_address);
grpc::ServerContext context;
const proto::AgentConfigAcquireRequest request;
proto::AgentConfigAcquireReply reply;
const grpc::Status status = mSDistributedImpl.AgentConfigAcquire(&context, &request, &reply);
ASSERT_EQ(status.error_code(), 1);
}

TEST_F(TestAgentConfigAcquire, test_agent_config_acquire_not_init_config_failed) {
std::shared_ptr<DistributedServable> servable = std::make_shared<DistributedServable>();
std::string rank_table_content = "rank table content";
CommonServableMeta commonServableMeta;
commonServableMeta.servable_name = "servable_name";
commonServableMeta.outputs_count = 1;
commonServableMeta.inputs_count = 1;
commonServableMeta.with_batch_dim = false;
commonServableMeta.without_batch_dim_inputs.push_back(8);
DistributedServableMeta distributedServableMeta;
distributedServableMeta.stage_size = 8;
distributedServableMeta.rank_size = 8;
servable->config_.rank_table_content = rank_table_content;
servable->config_.common_meta = commonServableMeta;
servable->config_.distributed_meta = distributedServableMeta;
servable->config_loaded_ = true;
servable->config_loaded_ = false;
const std::string server_address = "any_addr";
MSDistributedImpl mSDistributedImpl(servable, server_address);
grpc::ServerContext context;


+ 58
- 15
tests/ut/cpp/tests/test_init_config_on_start_up.cc View File

@@ -233,7 +233,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_failed
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_failed) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_failed) {
nlohmann::json rank_table_server_list = R"(
{
"version": "1.0",
@@ -241,8 +241,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_fai
"server_list": [
{
"server_id": "10.155.111.140",
"device": [
{"device_ip": "192.1.27.6","rank_id": "0"}],
"device": "dsfds",
"host_nic_ip": "reserve"
}
],
@@ -254,16 +253,36 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_fai
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_empty_device_failed) {
nlohmann::json rank_table_server_list = R"(
{
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "",
"server_id": "10.155.111.140",
"device": [],
"host_nic_ip": "reserve"
}
],
"status": "completed"
}
)"_json;
auto servable = std::make_shared<DistributedServable>();
auto status = servable->ParserRankTableWithServerList("rank_table_file", rank_table_server_list);
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_not_device_id_failed) {
nlohmann::json rank_table_server_list = R"(
{
"version": "1.0",
"server_count": "1",
"server_list": [
{
"server_id": "10.155.111.140",
"device": [
{"device_id": "1wdb","device_ip": "192.1.27.6","rank_id": "0"}],
{"device_ip": "192.1.27.6","rank_id": "0"}],
"host_nic_ip": "reserve"
}
],
@@ -275,7 +294,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed2) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id_failed) {
nlohmann::json rank_table_server_list = R"(
{
"version": "1.0",
@@ -284,7 +303,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_invalid_device_id
{
"server_id": "",
"device": [
{"device_id": "8","device_ip": "192.1.27.6","rank_id": "0"}],
{"device_id": "1wdb","device_ip": "192.1.27.6","rank_id": "0"}],
"host_nic_ip": "reserve"
}
],
@@ -603,7 +622,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_device_id_failed) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_devices_failed) {
nlohmann::json rank_table_server_list = R"(
{
"board_id": "0x0000",
@@ -612,7 +631,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de
"instance_count": "1",
"instance_list": [
{
"devices": [{"device_ip": "192.1.27.6"}],
"devices": "rtrt",
"rank_id": "0",
"server_id": "10.155.111.140"
}
@@ -627,7 +646,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_de
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_empty_devices_failed) {
nlohmann::json rank_table_server_list = R"(
{
"board_id": "0x0000",
@@ -636,7 +655,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali
"instance_count": "1",
"instance_list": [
{
"devices": [{"device_id": "wd1gt2", "device_ip": "192.1.27.6"}],
"devices": [],
"rank_id": "0",
"server_id": "10.155.111.140"
}
@@ -651,7 +670,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed2) {
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_not_device_id_failed) {
nlohmann::json rank_table_server_list = R"(
{
"board_id": "0x0000",
@@ -660,7 +679,7 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali
"instance_count": "1",
"instance_list": [
{
"devices": [{"device_id": "8", "device_ip": "192.1.27.6"}],
"devices": [{"device_ip": "192.1.27.6"}],
"rank_id": "0",
"server_id": "10.155.111.140"
}
@@ -671,7 +690,31 @@ TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invali
}
)"_json;
auto servable = std::make_shared<DistributedServable>();
auto status = servable->ParserRankTableWithServerList("rank_table_file", rank_table_server_list);
auto status = servable->ParserRankTableWithGroupList("rank_table_file", rank_table_server_list);
ASSERT_EQ(status.StatusCode(), INVALID_INPUTS);
}

// Feeds ParserRankTableWithGroupList a group_list rank table whose single device entry
// has the device_id "wd1gt2" and checks that the returned status code is INVALID_INPUTS.
TEST_F(TestParseRankTableFile, test_parse_rank_table_file_with_group_list_invalid_device_id_failed) {
  // The raw literal below is parsed as-is; keep its contents untouched.
  nlohmann::json group_list_rank_table = R"(
{
"board_id": "0x0000",
"group_list": [
{
"instance_count": "1",
"instance_list": [
{
"devices": [{"device_id": "wd1gt2", "device_ip": "192.1.27.6"}],
"rank_id": "0",
"server_id": "10.155.111.140"
}
]
}
],
"status": "completed"
}
)"_json;
  const auto distributed_servable = std::make_shared<DistributedServable>();
  auto parse_result =
    distributed_servable->ParserRankTableWithGroupList("rank_table_file", group_list_rank_table);
  ASSERT_EQ(parse_result.StatusCode(), INVALID_INPUTS);
}



Loading…
Cancel
Save