From f644e9da46e9fb0f8ac5f07c0eac6e0ded1d80ca Mon Sep 17 00:00:00 2001 From: xiefangqi Date: Fri, 18 Dec 2020 10:49:14 +0800 Subject: [PATCH] Fix devicequeue multi-machine problem --- .../minddata/dataset/engine/datasetops/device_queue_op.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc index 6025e4c02c..6c2c600e6a 100644 --- a/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc +++ b/mindspore/ccsrc/minddata/dataset/engine/datasetops/device_queue_op.cc @@ -45,8 +45,14 @@ DeviceQueueOp::DeviceQueueOp(std::string channel_name, DeviceType device_type, i total_batch_(total_batch), create_data_info_queue_(create_data_info_queue) { #ifdef ENABLE_GPUQUE + // Get the total device num of current machine + int32_t device_count = 0; + cudaGetDeviceCount(&device_count); std::shared_ptr cfg = GlobalContext::config_manager(); rank_id_ = cfg->rank_id(); // Get the current rank_id + if (device_count > 0) { + rank_id_ = rank_id_ % device_count; + } // Be careful when try to modified these num_workers_ and queue_capacity_, // and we suggest num_workers_ * queue_capacity_ not greater than 16, because // one worker one circular_pool with 1G pin memory, so num_workers_ * queue_capacity_