From 8fc38595bb73194c875583ddeb485b9bd38ebdb6 Mon Sep 17 00:00:00 2001 From: xuyongfei Date: Tue, 9 Mar 2021 17:46:38 +0800 Subject: [PATCH] Serving, bugfix 0309 --- .../worker/distributed_worker/distributed_servable.cc | 7 ++++--- mindspore_serving/worker/distributed/agent_startup.py | 8 ++++---- .../worker/distributed/distributed_worker.py | 2 +- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc index 3e6f463..54dc975 100644 --- a/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc +++ b/mindspore_serving/ccsrc/worker/distributed_worker/distributed_servable.cc @@ -529,7 +529,7 @@ Status DistributedServable::CheckAgentsInfosAndInitTensorInfos() { auto stage_size = config_.distributed_meta.stage_size; auto parallel_count = rank_size / stage_size; MSI_LOG_INFO << "Check agents infos, rank size :" << rank_size << ", stage size: " << stage_size - << ", parallel count: " << parallel_count; + << ", parallel count(rank size/stage size): " << parallel_count; if (agent_spec_map_.size() != rank_size) { return INFER_STATUS_LOG_ERROR(FAILED) << "Registered agents size " << agent_spec_map_.size() << " not match rank size " << rank_size; @@ -626,8 +626,9 @@ Status DistributedServable::CheckRankConfig() { } if (parallel_count % card_count_per_machine != 0) { return INFER_STATUS_LOG_ERROR(FAILED) - << "Parallel count " << parallel_count << " in one stage must be N * " << card_count_per_machine - << "(card count of one machine), rank size: " << rank_size << ", stage size: " << stage_size; + << "Parallel count(rank size/stage size) " << parallel_count << " in one stage must be N * " + << card_count_per_machine << "(card count of one machine) when rank size >= 8, rank size: " << rank_size + << ", stage size: " << stage_size; } for (size_t i = 0; i < rank_size; i += card_count_per_machine) { const auto &first_item = config_.rank_list[i]; diff --git a/mindspore_serving/worker/distributed/agent_startup.py b/mindspore_serving/worker/distributed/agent_startup.py index 80faa96..5ddca78 100644 --- a/mindspore_serving/worker/distributed/agent_startup.py +++ b/mindspore_serving/worker/distributed/agent_startup.py @@ -73,7 +73,7 @@ def _update_model_files_path(model_files, group_config_files): logger.info(f"input group config files: {group_config_files}") model_files_temp = [] for item in model_files: - file_name = os.path.join(script_dir, item) + file_name = os.path.realpath(os.path.join(script_dir, item)) if not os.access(file_name, os.R_OK): raise RuntimeError(f"Cannot access model file '{file_name}'") model_files_temp.append(file_name) @@ -81,7 +81,7 @@ def _update_model_files_path(model_files, group_config_files): if group_config_files is not None: group_files_temp = [] for item in group_config_files: - file_name = os.path.join(script_dir, item) + file_name = os.path.realpath(os.path.join(script_dir, item)) if not os.access(file_name, os.R_OK): raise RuntimeError(f"Cannot access group config file '{file_name}'") group_files_temp.append(file_name) @@ -335,9 +335,9 @@ def _startup_agents(common_meta, worker_ip, worker_port, f"rank table file: {rank_table_file}, model files: {model_files}, group config files: {group_config_files}" if not ret: WorkerAgent_.notify_failed(worker_ip, worker_port) - logger.info(f"Failed to start agents, {msg}") + logger.error(f"Failed to start agents, {msg}") print(f"Failed to start agents, {msg}") - return + raise RuntimeError("Failed to start agents") logger.info(f"Success to start agents, {msg}") print(f"Success to start agents, {msg}") diff --git a/mindspore_serving/worker/distributed/distributed_worker.py b/mindspore_serving/worker/distributed/distributed_worker.py index 4b87e1c..c27f962 100644 --- a/mindspore_serving/worker/distributed/distributed_worker.py +++ b/mindspore_serving/worker/distributed/distributed_worker.py @@ -27,7 +27,7 @@ def _get_rank_table_abs_path(rank_table_json_file): """Get absolute path of rank table file""" script_dir = os.path.dirname(os.path.realpath(sys.argv[0])) logger.info(f"input rank table file: {rank_table_json_file}") - rank_table_json_file = os.path.join(script_dir, rank_table_json_file) + rank_table_json_file = os.path.realpath(os.path.join(script_dir, rank_table_json_file)) logger.info(f"absolute path of rank table file: {rank_table_json_file}") return rank_table_json_file