diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index c94f0f71dc..e96cadb4a9 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -101,12 +101,26 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind)
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->SetContext();
-  auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
-  if (ret_rt_memcpy != RT_ERROR_NONE) {
-    MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+
+  // Only apply asynchronous copy in Pynative && RT_MEMCPY_HOST_TO_DEVICE mode
+  if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) {
+    // SyncHostToDevice no longer syncs the stream itself, so wait for queued work here before the blocking copy.
+    if (!runtime_instance->SyncStream()) {
+      MS_EXCEPTION(DeviceProcessError) << "SyncStream failed";
+    }
+    auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
+    if (ret_rt_memcpy != RT_ERROR_NONE) {
+      MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+    }
+  } else {
+    auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind));
+    if (!ret) {
+      MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed";
+    }
   }
 }
 
@@ -527,7 +541,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
   if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
     return true;
   }
-  SyncStream();
+
   bool sync_ok = false;
   std::vector<size_t> host_shape;
   (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 13556fedf7..0432b77ba3 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -718,6 +718,27 @@ bool AscendKernelRuntime::SyncStream() {
     MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error.";
     return false;
   }
+  FreeAndClearBufferPtrs();
+  return true;
+}
+
+bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  InnerSetContext();
+  if (stream_ == nullptr) {
+    MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr";
+    return false;
+  }
+
+  // Copy src into a buffer owned by the runtime; it is kept in buffer_ptrs_ until SyncStream() succeeds.
+  // No value-initialization: every byte is overwritten by the std::copy below.
+  std::shared_ptr<char[]> buffer(new char[size]);
+  MS_EXCEPTION_IF_NULL(buffer);
+  std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+  if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) {
+    MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error.";
+    return false;
+  }
   return true;
 }
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index 1288670414..ccbf2de596 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
                           const std::vector<CNodePtr> &execution_order) override;
   void ClearGlobalIdleMem() override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
   void SetContext() override;
   void CreateContext() override;
   void *context() const override { return rt_context_; }
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
index 2998841162..6eebb13809 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
@@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime {
 
  protected:
   bool SyncStream() override { return true; };
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; };
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
 
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 4893ebdc38..a5e79b20a4 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -18,7 +18,9 @@
 #include <vector>
 #include <memory>
 #include "runtime/device/gpu/gpu_device_manager.h"
+#include "runtime/device/kernel_runtime_manager.h"
 #include "utils/log_adapter.h"
+#include "utils/ms_context.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
 #include "ir/tensor.h"
 #ifdef ENABLE_DEBUGGER
@@ -62,11 +64,23 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
     // nccl kernel input and output device address is aligned, may lead to host size is not equal to device size
     MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_;
   }
-  if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
-    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
-    return false;
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
+  if (execution_mode != kPynativeMode) {
+    if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
+      MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+      return false;
+    }
+    return GPUDeviceManager::GetInstance().SyncStream(stream);
+  } else {
+    auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    // The GPU runtime ignores the kind argument (its copy is always host to device), so 0 is a placeholder.
+    return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0);
   }
-  return GPUDeviceManager::GetInstance().SyncStream(stream);
 }
 
 void GPUDeviceAddress::ClearDeviceMemory() {
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 8579dcbbe0..510161b50e 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet;
 using mindspore::device::memswap::MemSwapManager;
 using mindspore::device::memswap::SwapKind;
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
-bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
+
+bool GPUKernelRuntime::SyncStream() {
+  if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
+    MS_LOG(ERROR) << "Call SyncStream error.";
+    return false;
+  }
+  FreeAndClearBufferPtrs();
+  return true;
+}
 
 bool GPUKernelRuntime::Init() {
   auto context_ptr = MsContext::GetInstance();
@@ -183,6 +191,24 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
 }
 }  // namespace
 
+bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  // `kind` is not used here: the copy below is always host to device.
+  // Copy src into a buffer owned by the runtime; it is kept in buffer_ptrs_ until SyncStream() succeeds.
+  std::shared_ptr<char[]> buffer(new char[size]);
+  MS_EXCEPTION_IF_NULL(buffer);
+  std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+
+  auto &stream = GPUDeviceManager::GetInstance().default_stream();
+  MS_EXCEPTION_IF_NULL(stream);
+  auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream);
+  if (!ret) {
+    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+    return false;
+  }
+  return true;
+}
+
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                        TypeId type_id) {
   return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
index 83b80ce0b0..1813c23b66 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime {
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
 
  private:
   GPUKernelRuntime(const GPUKernelRuntime &);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index 54f7fb2052..df12b002aa 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -75,6 +75,7 @@ class KernelRuntime {
                                   const std::unordered_set<ValueNodePtr> &value_nodes,
                                   const std::vector<CNodePtr> &execution_order);
   virtual bool SyncStream() = 0;
+  virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
   virtual void ClearGlobalIdleMem() {}
   virtual void CreateContext() {}
   virtual void SetContext() {}
@@ -101,6 +102,9 @@ class KernelRuntime {
 
   virtual void PreInit() {}
   virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
+  // Keep a staging buffer alive until FreeAndClearBufferPtrs() drops every buffer registered so far.
+  void AddBufferPtr(const std::shared_ptr<char[]> &ptr) { buffer_ptrs_.push_back(ptr); }
+  void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }
 
  protected:
   virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -149,6 +153,8 @@ class KernelRuntime {
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
   std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_;
+  // Buffers registered through AddBufferPtr(); released by FreeAndClearBufferPtrs().
+  std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {};
 };
 using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
 }  // namespace device