fix gpu rank_id problem

5 years ago · 93d7642374
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc
@@ -240,10 +240,24 @@ Status DeviceQueueOp::GetDataInfo(DATA_INFO *data_info) {
 #endif

 #ifdef ENABLE_GPUQUE
 Status DeviceQueueOp::LaunchParallelCopyThread() {
 // Binds the calling thread to the GPU identified by rank_id_.
 //
 // Without cudaSetDevice, CUDA memory is allocated on GPU:0 by default, which
 // overloads that device in distributed scenarios. The device is set exactly
 // once here and the result is checked, so a failure is never silently ignored.
 //
 // Returns Status::OK() on success; otherwise a kMDUnexpectedError Status whose
 // message carries the numeric CUDA error code and its description.
 Status DeviceQueueOp::SetThreadDevice() {
  const auto ret = cudaSetDevice(rank_id_);
  if (ret != cudaSuccess) {
    // Report both the raw error code and the string from cudaGetErrorString.
    std::string err;
    err += "cudaSetDevice failed, ret[";
    err += std::to_string(static_cast<int>(ret));
    err += "], ";
    err += cudaGetErrorString(ret);
    return Status(StatusCode::kMDUnexpectedError, __LINE__, __FILE__, err);
  }
  return Status::OK();
 }

 Status DeviceQueueOp::LaunchParallelCopyThread() {
  // Every thread that uses the CUDA API must call SetThreadDevice first
  RETURN_IF_NOT_OK(SetThreadDevice());
  // CircularPool may not be safe in multi-threaded scenarios, so each worker gets its own pool
  for (int i = 0; i < num_workers_; i++) {
    std::shared_ptr<MemoryPool> pool;
@@ -262,9 +276,8 @@ Status DeviceQueueOp::LaunchParallelCopyThread() {
 }

 Status DeviceQueueOp::PushDataToGPU() {
  // Without cudaSetDevice cuda memory will allocate on GPU:0 as default
  // and will overload in distribute scenario, so don't remove this line
  cudaSetDevice(rank_id_);
  // Every thread that uses the CUDA API must call SetThreadDevice first
  RETURN_IF_NOT_OK(SetThreadDevice());
  TaskManager::FindMe()->Post();
  uint64_t batch_start_time = 0;
  uint64_t end_time = 0;
@@ -357,9 +370,8 @@ Status DeviceQueueOp::PushDataToGPU() {

 // WorkerEntry of DeviceQueueOp just performs multi-threaded memcpy for performance optimization.
 Status DeviceQueueOp::WorkerEntry(int32_t worker_id) {
  // Without cudaSetDevice cuda memory will allocate on GPU:0 as default
  // and will overload in distribute scenario, so don't remove this line
  cudaSetDevice(rank_id_);
  // Every thread that uses the CUDA API must call SetThreadDevice first
  RETURN_IF_NOT_OK(SetThreadDevice());
  TaskManager::FindMe()->Post();
  std::unique_ptr<DataBuffer> current_buffer;
  uint32_t batch_num = 0;
--- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
+++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.h
@@ -196,6 +196,7 @@ class DeviceQueueOp : public PipelineOp {
  Status LaunchParallelCopyThread();
  Status PushDataToGPU();
  Status WorkerEntry(int32_t worker_id);
  Status SetThreadDevice();

  QueueList<std::unique_ptr<DataBuffer>> receive_queues_;
  std::vector<std::shared_ptr<MemoryPool>> pool_;
--- a/mindspore/core/utils/status.cc
+++ b/mindspore/core/utils/status.cc
@@ -51,7 +51,7 @@ std::string Status::CodeAsString(enum StatusCode c) {
                                                            {kCoreFailed, "Common error code."},
                                                            // MD
                                                            {kMDOutOfMemory, "Out of memory"},
                                                            {kMDShapeMisMatch, "Shape is incorrect."},
                                                            {kMDShapeMisMatch, "Shape is incorrect"},
                                                            {kMDInterrupted, "Interrupted system call"},
                                                            {kMDNoSpace, "No space left on device"},
                                                            {kMDPyFuncException, "Exception thrown from PyFunc"},
--- a/mindspore/dataset/core/config.py
+++ b/mindspore/dataset/core/config.py
@@ -46,11 +46,7 @@ def _init_device_info():
        rank_id = _get_global_rank()
        parallel_mode = auto_parallel_context().get_parallel_mode()
        if parallel_mode == "stand_alone":
            cuda_device_info = os.getenv("CUDA_VISIBLE_DEVICES")
            if cuda_device_info:
                cuda_id = int(cuda_device_info.split(",")[0].strip())
                if cuda_id != rank_id:
                    rank_id = cuda_id
            rank_id = context.get_context("device_id")
        _config.set_rank_id(rank_id)
    elif context.get_context("device_target") == "Ascend":
        # Ascend is a special scenario, we'd better get rank info from env