Browse Source

Fix GPU rank_id problem

tags/v1.2.0-rc1
xiefangqi 5 years ago
parent
commit
93d7642374
4 changed files with 24 additions and 15 deletions
  1. +21
    -9
      mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
  2. +1
    -0
      mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
  3. +1
    -1
      mindspore/core/utils/status.cc
  4. +1
    -5
      mindspore/dataset/core/config.py

+ 21
- 9
mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc View File

@@ -240,10 +240,24 @@ Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
#endif

#ifdef ENABLE_GPUQUE
// Bind the calling thread to the GPU that matches this op's rank_id_.
//
// Every thread that issues CUDA calls must select its device first: without
// cudaSetDevice, CUDA memory is allocated on GPU:0 by default, which
// overloads device 0 in distributed (multi-rank) scenarios.
//
// Returns Status::OK() on success, or kMDUnexpectedError carrying the CUDA
// error code and its text (via cudaGetErrorString) when cudaSetDevice fails.
Status DeviceQueueOp::SetThreadDevice() {
  auto ret = cudaSetDevice(rank_id_);
  if (ret != cudaSuccess) {
    // Build a diagnostic that includes both the numeric cudaError_t value
    // and the human-readable CUDA error message.
    std::string err;
    err += "cudaSetDevice failed, ret[";
    err += std::to_string(static_cast<int>(ret));
    err += "], ";
    err += cudaGetErrorString(ret);
    return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__, err);
  }
  return Status::OK();
}

Status DeviceQueueOp::LaunchParallelCopyThread() {
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
// CircularPool may not safe under multi-threads scenario, so one worker with one pool
for (int i = 0; i < num_workers_; i++) {
std::shared_ptr<MemoryPool> pool;
@@ -262,9 +276,8 @@ Status DeviceQueueOp::LaunchParallelCopyThread() {
}

Status DeviceQueueOp::PushDataToGPU() {
// Without cudaSetDevice cuda memory will allocate on GPU:0 as default
// and will overload in distribute scenario, so don't remove this line
cudaSetDevice(rank_id_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
TaskManager::FindMe()->Post();
uint64_t batch_start_time = 0;
uint64_t end_time = 0;
@@ -357,9 +370,8 @@ Status DeviceQueueOp::PushDataToGPU() {

// WorkEntry of DeviceQueueOp just do multi_threads memcpy for performance optimization.
Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
// Without cudaSetDevice cuda memory will allocate on GPU:0 as default
// and will overload in distribute scenario, so don't remove this line
cudaSetDevice(rank_id_);
// Every thread use cuda api should SetThreadDevice
RETURN_IF_NOT_OK(SetThreadDevice());
TaskManager::FindMe()->Post();
std::unique_ptr<DataBuffer> current_buffer;
uint32_t batch_num = 0;


+ 1
- 0
mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h View File

@@ -196,6 +196,7 @@ class DeviceQueueOp : public PipelineOp {
Status LaunchParallelCopyThread();
Status PushDataToGPU();
Status WorkerEntry(int32_t worker_id);
Status SetThreadDevice();

QueueList<std::unique_ptr<DataBuffer>> receive_queues_;
std::vector<std::shared_ptr<MemoryPool>> pool_;


+ 1
- 1
mindspore/core/utils/status.cc View File

@@ -51,7 +51,7 @@ std::string Status::CodeAsString(enum StatusCode c) {
{kCoreFailed, "Common error code."},
// MD
{kMDOutOfMemory, "Out of memory"},
{kMDShapeMisMatch, "Shape is incorrect."},
{kMDShapeMisMatch, "Shape is incorrect"},
{kMDInterrupted, "Interrupted system call"},
{kMDNoSpace, "No space left on device"},
{kMDPyFuncException, "Exception thrown from PyFunc"},


+ 1
- 5
mindspore/dataset/core/config.py View File

@@ -46,11 +46,7 @@ def _init_device_info():
rank_id = _get_global_rank()
parallel_mode = auto_parallel_context().get_parallel_mode()
if parallel_mode == "stand_alone":
cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
if cuda_device_info:
cuda_id = int(cuda_device_info.split(",")[0].strip())
if cuda_id != rank_id:
rank_id = cuda_id
rank_id = context.get_context("device_id")
_config.set_rank_id(rank_id)
elif context.get_context("device_target") == "Ascend":
# Ascend is a special scenario, we'd better get rank info from env


Loading…
Cancel
Save