Browse Source

!9929 Fix GPU all-reduce memory allocation

From: @limingqi107
Reviewed-by: @kisnwang,@zhoufeng54,@cristoval
Signed-off-by: @cristoval
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
316298ee58
7 changed files with 60 additions and 13 deletions
  1. +10
    -0
      mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc
  2. +2
    -0
      mindspore/ccsrc/runtime/device/gpu/cuda_driver.h
  3. +6
    -0
      mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc
  4. +2
    -0
      mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.h
  5. +2
    -12
      mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
  6. +35
    -0
      mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc
  7. +3
    -1
      mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h

+ 10
- 0
mindspore/ccsrc/runtime/device/gpu/cuda_driver.cc View File

@@ -110,6 +110,16 @@ bool CudaDriver::CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMem
return true;
}

bool CudaDriver::CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const DeviceMemPtr &src, size_t size,
DeviceStream stream) {
auto ret = cudaMemcpyAsync(dst, src, size, cudaMemcpyDeviceToDevice, (cudaStream_t)stream);
if (ret != cudaSuccess) {
MS_LOG(ERROR) << "cudaMemcpyAsync failed, ret[" << static_cast<int>(ret) << "], " << cudaGetErrorString(ret);
return false;
}
return true;
}

size_t CudaDriver::total_mem_size() {
size_t free;
size_t total;


+ 2
- 0
mindspore/ccsrc/runtime/device/gpu/cuda_driver.h View File

@@ -42,6 +42,8 @@ class CudaDriver {
static bool CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream = 0);
static bool CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size,
DeviceStream stream = 0);
static bool CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const DeviceMemPtr &src, size_t size,
DeviceStream stream = 0);
static size_t total_mem_size();
static size_t free_mem_size();


+ 6
- 0
mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.cc View File

@@ -108,6 +108,12 @@ bool GPUDeviceManager::CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const v
DeviceStream stream) const {
return CudaDriver::CopyHostMemToDeviceAsync(dst, src, size, stream);
}
// Thin wrapper that forwards an asynchronous device-to-device copy request
// to CudaDriver; see CudaDriver::CopyDeviceMemToDeviceAsync for semantics.
bool GPUDeviceManager::CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const DeviceMemPtr &src, size_t size,
DeviceStream stream) const {
return CudaDriver::CopyDeviceMemToDeviceAsync(dst, src, size, stream);
}
} // namespace gpu
} // namespace device
} // namespace mindspore

+ 2
- 0
mindspore/ccsrc/runtime/device/gpu/gpu_device_manager.h View File

@@ -51,6 +51,8 @@ class GPUDeviceManager {
bool CopyDeviceMemToHostAsync(const HostMemPtr &dst, const DeviceMemPtr &src, size_t size, DeviceStream stream) const;
bool CopyHostMemToDeviceAsync(const DeviceMemPtr &dst, const void *src, size_t size, DeviceStream stream) const;
bool CopyDeviceMemToDeviceAsync(const DeviceMemPtr &dst, const DeviceMemPtr &src, size_t size,
DeviceStream stream) const;
static GPUDeviceManager &GetInstance() {
static GPUDeviceManager instance;


+ 2
- 12
mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc View File

@@ -1053,22 +1053,12 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
AllocCommunicationOpMemory(is_need_alloc_memory, is_need_free_memory, addr_list, total_size, size_list);
}

void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool is_need_free_memory,
const DeviceAddressPtrList addr_list, size_t total_size,
std::vector<size_t> size_list) {
void GPUKernelRuntime::AllocCommunicationOpMemory(bool is_need_alloc_memory, bool, const DeviceAddressPtrList addr_list,
size_t total_size, std::vector<size_t> size_list) {
MS_EXCEPTION_IF_NULL(mem_manager_);
if (!is_need_alloc_memory) {
return;
}
if (is_need_free_memory) {
for (const auto &iter : addr_list) {
MS_EXCEPTION_IF_NULL(iter);
// Free the inputs/outputs of communication kernel which are not released.
if (iter->ptr_ != nullptr) {
mem_manager_->FreeMemFromMemPool(iter);
}
}
}
auto ret = mem_manager_->MallocContinuousMemFromMemPool(addr_list, total_size, size_list);
if (!ret) {
MS_LOG(EXCEPTION) << "Malloc device memory failed.";


+ 35
- 0
mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.cc View File

@@ -19,6 +19,8 @@
#include "utils/ms_context.h"
#include "utils/convert_utils.h"
#include "ps/ps_cache/ps_cache_manager.h"
#include "runtime/device/gpu/gpu_device_manager.h"
#include "runtime/device/gpu/gpu_common.h"
namespace mindspore {
namespace device {
namespace gpu {
@@ -34,6 +36,39 @@ std::vector<void *> GPUMemoryManager::MallocContinuousMemFromMemPool(size_t tota
return GPUMemoryAllocator::GetInstance().AllocContinuousTensorMem(total_size, size_list);
}

// Allocates one contiguous device region covering all addresses in
// `addr_list` (communication ops such as all-reduce require their
// inputs/outputs to be contiguous). For any address that already holds
// device memory, its current contents are copied asynchronously into the
// corresponding slot of the new contiguous region and the old block is
// returned to the pool, so live data is preserved across the re-allocation.
//
// Returns false if the contiguous pool allocation fails; throws (MS_LOG
// EXCEPTION / MS_EXCEPTION_IF_NULL) on internal inconsistencies. When at
// least one copy was issued, returns the result of synchronizing the stream.
bool GPUMemoryManager::MallocContinuousMemFromMemPool(const DeviceAddressPtrList addr_list, size_t total_size,
                                                      std::vector<size_t> size_list) {
  auto device_ptr_list = MallocContinuousMemFromMemPool(total_size, size_list);
  if (device_ptr_list.empty()) {
    return false;
  }
  if (addr_list.size() != device_ptr_list.size()) {
    MS_LOG(EXCEPTION) << "The size of device list is not equal to the size of address list.";
  }
  auto &stream = GPUDeviceManager::GetInstance().default_stream();
  MS_EXCEPTION_IF_NULL(stream);
  bool need_sync_stream = false;
  for (size_t i = 0; i < addr_list.size(); i++) {
    MS_EXCEPTION_IF_NULL(addr_list[i]);
    auto old_addr = addr_list[i]->ptr_;
    auto new_addr = device_ptr_list[i];
    MS_EXCEPTION_IF_NULL(new_addr);
    if (old_addr != nullptr) {
      need_sync_stream = true;
      // Migrate the existing device data into the new contiguous block.
      // Fixed: the failure message previously named the wrong API
      // (copyHostMemToDeviceAsync) — the call is a device-to-device copy.
      CHECK_OP_RET_WITH_EXCEPT(
        GPUDeviceManager::GetInstance().CopyDeviceMemToDeviceAsync(new_addr, old_addr, size_list[i], stream),
        "Failed to CopyDeviceMemToDeviceAsync.");
      // NOTE(review): the old block is released while the async copy may still
      // be queued on `stream`; presumably the pool only reuses memory in
      // stream order — confirm against the allocator's guarantees.
      FreeMemFromMemPool(old_addr);
    }
    addr_list[i]->ptr_ = new_addr;
    addr_list[i]->from_mem_pool_ = true;
  }
  // Only pay for a stream synchronization when at least one copy was issued.
  if (need_sync_stream) {
    return GPUDeviceManager::GetInstance().SyncStream(stream);
  }
  return true;
}

void GPUMemoryManager::MallocDeviceMemory() {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);


+ 3
- 1
mindspore/ccsrc/runtime/device/gpu/gpu_memory_manager.h View File

@@ -31,7 +31,9 @@ class GPUMemoryManager : public MemoryManager {

void *MallocMemFromMemPool(size_t size) override;
void FreeMemFromMemPool(void *device_ptr) override;
std::vector<void *> MallocContinuousMemFromMemPool(size_t total_size, std::vector<size_t> size_list);
std::vector<void *> MallocContinuousMemFromMemPool(size_t total_size, std::vector<size_t> size_list) override;
bool MallocContinuousMemFromMemPool(const DeviceAddressPtrList addr_list, size_t total_size,
std::vector<size_t> size_list) override;

protected:
uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;


Loading…
Cancel
Save