Browse Source

Fix core dump: catch exceptions in the scheduler-client message callback and simplify TcpClient::Stop

pull/13773/head
chendongsheng 4 years ago
parent
commit
54da7f28d2
5 changed files with 18 additions and 25 deletions
  1. +11
    -7
      mindspore/ccsrc/ps/core/abstract_node.cc
  2. +1
    -0
      mindspore/ccsrc/ps/core/abstract_node.h
  3. +1
    -0
      mindspore/ccsrc/ps/core/server_node.cc
  4. +3
    -11
      mindspore/ccsrc/ps/core/tcp_client.cc
  5. +2
    -7
      mindspore/ccsrc/ps/core/worker_node.cc

+ 11
- 7
mindspore/ccsrc/ps/core/abstract_node.cc View File

@@ -418,14 +418,18 @@ bool AbstractNode::InitClientToScheduler() {
client_to_scheduler_ = std::make_shared<TcpClient>(scheduler_host, scheduler_port);
client_to_scheduler_->SetMessageCallback(
[&](std::shared_ptr<MessageMeta> meta, const Protos &protos, const void *data, size_t size) {
if (handlers_.count(meta->cmd()) == 0) {
MS_LOG(EXCEPTION) << "The cmd:" << meta->cmd() << " is not supported!";
}
if (handlers_[meta->cmd()] != nullptr) {
const auto &handler_ptr = handlers_[meta->cmd()];
(this->*handler_ptr)(meta, data, size);
try {
if (handlers_.count(meta->cmd()) == 0) {
MS_LOG(EXCEPTION) << "The cmd:" << meta->cmd() << " is not supported!";
}
if (handlers_[meta->cmd()] != nullptr) {
const auto &handler_ptr = handlers_[meta->cmd()];
(this->*handler_ptr)(meta, data, size);
}
NotifyMessageArrival(meta);
} catch (const std::exception &e) {
MsException::Instance().SetException();
}
NotifyMessageArrival(meta);
});

client_to_scheduler_->Init();


+ 1
- 0
mindspore/ccsrc/ps/core/abstract_node.h View File

@@ -26,6 +26,7 @@

#include "ps/core/node.h"
#include "ps/core/message.h"
#include "utils/ms_exception.h"

namespace mindspore {
namespace ps {


+ 1
- 0
mindspore/ccsrc/ps/core/server_node.cc View File

@@ -40,6 +40,7 @@ bool ServerNode::Start(const uint32_t &timeout) {
FetchServers(client_to_scheduler_);
MS_LOG(INFO) << "Server node get all the servers address successful!";
}
MsException::Instance().CheckException();
MS_LOG(INFO) << "Start the node is successful!";
return true;
}


+ 3
- 11
mindspore/ccsrc/ps/core/tcp_client.cc View File

@@ -141,17 +141,9 @@ void TcpClient::StartWithDelay(int seconds) {
void TcpClient::Stop() {
std::lock_guard<std::mutex> lock(connection_mutex_);
MS_LOG(INFO) << "Stop tcp client!";
if (event_base_got_break(event_base_)) {
MS_LOG(DEBUG) << "The event base has stopped!";
is_stop_ = true;
return;
}
if (!is_stop_.load()) {
is_stop_ = true;
int ret = event_base_loopbreak(event_base_);
if (ret != 0) {
MS_LOG(ERROR) << "Event base loop break failed!";
}
int ret = event_base_loopbreak(event_base_);
if (ret != 0) {
MS_LOG(ERROR) << "Event base loop break failed!";
}
}



+ 2
- 7
mindspore/ccsrc/ps/core/worker_node.cc View File

@@ -41,6 +41,7 @@ bool WorkerNode::Start(const uint32_t &timeout) {
FetchServers(client_to_scheduler_);
MS_LOG(INFO) << "Worker node get all the servers address successful!";
}
MsException::Instance().CheckException();
MS_LOG(INFO) << "The Worker node has successfully started.";
return true;
}
@@ -59,23 +60,17 @@ void WorkerNode::Initialize() {
}

bool WorkerNode::Stop() {
MS_LOG(INFO) << "Stop worker node!";
if (!is_already_stopped_.load()) {
MS_LOG(INFO) << "Stop worker node!";
is_ready_ = true;
is_timeout_ = true;
is_finish_ = true;
if (heart_beat_thread_->joinable()) {
heart_beat_thread_->join();
}
client_to_scheduler_->Stop();
if (!connected_nodes_.empty()) {
for (auto &connected_node : connected_nodes_) {
connected_node.second->Stop();
}
}
if (client_to_scheduler_thread_->joinable()) {
client_to_scheduler_thread_->join();
}
is_already_stopped_ = true;
}
return true;


Loading…
Cancel
Save