Browse Source

!5933 Fix pserver error and optimize worker and server log.

Merge pull request !5933 from ZPaC/master-fix-error-when-pserver-finish-training
tags/v1.0.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
fc8bd0dd03
3 changed files with 12 additions and 1 deletion
  1. +4
    -0
      mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
  2. +5
    -1
      mindspore/ccsrc/frontend/parallel/ps/worker.h
  3. +3
    -0
      model_zoo/official/recommend/wide_and_deep/src/callbacks.py

+ 4
- 0
mindspore/ccsrc/frontend/parallel/ps/parameter_server.h View File

@@ -736,7 +736,9 @@ void ParameterServer<T>::SyncEmbeddingTables() {
template <typename T>
void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
MS_LOG(INFO) << "PServer starts connecting to scheduler and workers...";
::ps::Start(0);
MS_LOG(INFO) << "PServer connected successfully.";
if (!::ps::IsServer()) {
std::cout << "This is not ther Server" << std::endl;
return;
@@ -744,7 +746,9 @@ void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
Init(func_graph);
PSContext::instance()->SetPSRankId(rank_id_);
thread_->join();
MS_LOG(INFO) << "PServer finished updating models, starts finalizing...";
::ps::Finalize(0, true);
MS_LOG(INFO) << "PServer finalized successfully.";
}
} // namespace ps
} // namespace parallel


+ 5
- 1
mindspore/ccsrc/frontend/parallel/ps/worker.h View File

@@ -86,7 +86,9 @@ void Worker<T>::Run() {
MS_LOG(INFO) << "'Worker is already running.";
return;
}
MS_LOG(INFO) << "Worker starts connecting to scheduler and server...";
::ps::Start(0);
MS_LOG(INFO) << "Worker connected successfully.";
if (!::ps::IsWorker()) {
MS_LOG(EXCEPTION) << "The role is not worker.";
}
@@ -176,9 +178,11 @@ void Worker<T>::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const :
template <typename T>
void Worker<T>::Finalize() {
  // Nothing to tear down if this worker was never started (or already finalized).
  if (!running_) {
    return;
  }
  MS_LOG(INFO) << "Worker starts finalizing...";
  // Shut down the underlying key-value worker, then release it so repeated
  // Finalize() calls are harmless no-ops.
  kv_worker_->Finalize();
  kv_worker_.reset();
  running_ = false;
  MS_LOG(INFO) << "Worker finalized successfully.";
}

@@ -315,7 +319,7 @@ void Worker<T>::InitPSParamAndOptim(const std::string &param_name, tensor::Tenso

size_t param_key = GetParamKey(param_name);
if (param_key == kInvalidKey) {
MS_LOG(INFO) << "Parameter " << param_name << " has no key assigned.";
MS_LOG(DEBUG) << "Parameter " << param_name << " has no key assigned.";
return;
}
bool init_in_server = false;


+ 3
- 0
model_zoo/official/recommend/wide_and_deep/src/callbacks.py View File

@@ -36,6 +36,7 @@ class LossCallBack(Callback):

Note:
If per_print_times is 0, do NOT print loss.
If this process is MS_PSERVER role, do not run callbacks.

Args:
per_print_times (int): Print loss every times. Default: 1.
@@ -50,6 +51,8 @@ class LossCallBack(Callback):
def step_end(self, run_context):
"""Monitor the loss in training."""
cb_params = run_context.original_args()
if cb_params.net_outputs is None:
return
wide_loss, deep_loss = cb_params.net_outputs[0].asnumpy(), cb_params.net_outputs[1].asnumpy()
cur_step_in_epoch = (cb_params.cur_step_num - 1) % cb_params.batch_num + 1
cur_num = cb_params.cur_step_num


Loading…
Cancel
Save