Browse Source

Serving, bugfix 0309

tags/v1.2.0
xuyongfei 5 years ago
parent
commit
8fc38595bb
3 changed files with 9 additions and 8 deletions
  1. +4
    -3
      mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc
  2. +4
    -4
      mindspore_serving/worker/distributed/agent_startup.py
  3. +1
    -1
      mindspore_serving/worker/distributed/distributed_worker.py

+ 4
- 3
mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc View File

@@ -529,7 +529,7 @@ Status DistributedServable::CheckAgentsInfosAndInitTensorInfos() {
auto stage_size = config_.distributed_meta.stage_size;
auto parallel_count = rank_size / stage_size;
MSI_LOG_INFO << "Check agents infos, rank size :" << rank_size << ", stage size: " << stage_size
<< ", parallel count: " << parallel_count;
<< ", parallel count(rank size/stage size): " << parallel_count;
if (agent_spec_map_.size() != rank_size) {
return INFER_STATUS_LOG_ERROR(FAILED)
<< "Registered agents size " << agent_spec_map_.size() << " not match rank size " << rank_size;
@@ -626,8 +626,9 @@ Status DistributedServable::CheckRankConfig() {
}
if (parallel_count % card_count_per_machine != 0) {
return INFER_STATUS_LOG_ERROR(FAILED)
<< "Parallel count " << parallel_count << " in one stage must be N * " << card_count_per_machine
<< "(card count of one machine), rank size: " << rank_size << ", stage size: " << stage_size;
<< "Parallel count(rank size/stage size) " << parallel_count << " in one stage must be N * "
<< card_count_per_machine << "(card count of one machine) when rank size >= 8, rank size: " << rank_size
<< ", stage size: " << stage_size;
}
for (size_t i = 0; i < rank_size; i += card_count_per_machine) {
const auto &first_item = config_.rank_list[i];


+ 4
- 4
mindspore_serving/worker/distributed/agent_startup.py View File

@@ -73,7 +73,7 @@ def _update_model_files_path(model_files, group_config_files):
logger.info(f"input group config files: {group_config_files}")
model_files_temp = []
for item in model_files:
file_name = os.path.join(script_dir, item)
file_name = os.path.realpath(os.path.join(script_dir, item))
if not os.access(file_name, os.R_OK):
raise RuntimeError(f"Cannot access model file '{file_name}'")
model_files_temp.append(file_name)
@@ -81,7 +81,7 @@ def _update_model_files_path(model_files, group_config_files):
if group_config_files is not None:
group_files_temp = []
for item in group_config_files:
file_name = os.path.join(script_dir, item)
file_name = os.path.realpath(os.path.join(script_dir, item))
if not os.access(file_name, os.R_OK):
raise RuntimeError(f"Cannot access group config file '{file_name}'")
group_files_temp.append(file_name)
@@ -335,9 +335,9 @@ def _startup_agents(common_meta, worker_ip, worker_port,
f"rank table file: {rank_table_file}, model files: {model_files}, group config files: {group_config_files}"
if not ret:
WorkerAgent_.notify_failed(worker_ip, worker_port)
logger.info(f"Failed to start agents, {msg}")
logger.error(f"Failed to start agents, {msg}")
print(f"Failed to start agents, {msg}")
return
raise RuntimeError("Failed to start agents")

logger.info(f"Success to start agents, {msg}")
print(f"Success to start agents, {msg}")


+ 1
- 1
mindspore_serving/worker/distributed/distributed_worker.py View File

@@ -27,7 +27,7 @@ def _get_rank_table_abs_path(rank_table_json_file):
"""Get absolute path of rank table file"""
script_dir = os.path.dirname(os.path.realpath(sys.argv[0]))
logger.info(f"input rank table file: {rank_table_json_file}")
rank_table_json_file = os.path.join(script_dir, rank_table_json_file)
rank_table_json_file = os.path.realpath(os.path.join(script_dir, rank_table_json_file))
logger.info(f"absolute path of rank table file: {rank_table_json_file}")
return rank_table_json_file



Loading…
Cancel
Save