Browse Source

fix: tdt hangs when the device is occupied

tags/v0.6.0-beta
jonyguo 5 years ago
parent
commit
0d375bbaa3
1 changed file with 7 additions and 5 deletions
  1. +7
    -5
      mindspore/ccsrc/utils/context/ms_context.cc

+ 7
- 5
mindspore/ccsrc/utils/context/ms_context.cc View File

@@ -194,17 +194,19 @@ bool MsContext::OpenTsd() {
}

MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";

TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}

int32_t initStatus = tdt::TdtHostInit(device_id);
if (initStatus != TDT_OK_CODE) {
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
return false;
}
tdt_print_ = std::thread(TensorPrint());
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
tsd_ref_++;
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
return true;


Loading…
Cancel
Save