| @@ -101,12 +101,22 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind) | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||
| auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| runtime_instance->SetContext(); | |||
| auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind); | |||
| if (ret_rt_memcpy != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed"; | |||
| // Only apply asynchronous copy in Pynative && RT_MEMCPY_HOST_TO_DEVICE mode | |||
| if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) { | |||
| auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind); | |||
| if (ret_rt_memcpy != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed"; | |||
| } | |||
| } else { | |||
| auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind)); | |||
| if (!ret) { | |||
| MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed"; | |||
| } | |||
| } | |||
| } | |||
| @@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size | |||
| if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { | |||
| return true; | |||
| } | |||
| SyncStream(); | |||
| bool sync_ok = false; | |||
| std::vector<size_t> host_shape; | |||
| (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize); | |||
| @@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() { | |||
| MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error."; | |||
| return false; | |||
| } | |||
| FreeAndClearBufferPtrs(); | |||
| return true; | |||
| } | |||
| bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) { | |||
| InnerSetContext(); | |||
| if (stream_ == nullptr) { | |||
| MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr"; | |||
| return false; | |||
| } | |||
| std::shared_ptr<char[]> buffer(new char[size]()); | |||
| MS_EXCEPTION_IF_NULL(buffer); | |||
| std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get()); | |||
| AddBufferPtr(buffer); | |||
| if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) { | |||
| MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| const std::vector<CNodePtr> &execution_order) override; | |||
| void ClearGlobalIdleMem() override; | |||
| bool SyncStream() override; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override; | |||
| void SetContext() override; | |||
| void CreateContext() override; | |||
| void *context() const override { return rt_context_; } | |||
| @@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime { | |||
| protected: | |||
| bool SyncStream() override { return true; }; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; }; | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| @@ -18,7 +18,9 @@ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "runtime/device/gpu/gpu_device_manager.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/gpu/gpu_memory_allocator.h" | |||
| #include "ir/tensor.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| @@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId | |||
| // nccl kernel input and output device address is aligned, may lead to host size is not equal to device size | |||
| MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_; | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| // NOTE(review): declared `bool` but assigned an int execution mode; the comparison below | |||
| // only behaves correctly if kPynativeMode happens to equal 1 — should be `auto`/`int`. TODO confirm and fix. | |||
| bool execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| if (execution_mode != kPynativeMode) { | |||
| // Non-PyNative path: copy on the caller-provided stream, then block until it drains. | |||
| if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) { | |||
| MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed"; | |||
| return false; | |||
| } | |||
| return GPUDeviceManager::GetInstance().SyncStream(stream); | |||
| } else { | |||
| // PyNative path: defer to the runtime's MemcpyAsync, which stages the host data in an | |||
| // owned buffer and releases it on the next SyncStream(). | |||
| auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0); | |||
| } | |||
| // NOTE(review): unreachable — both branches above return; this looks like a leftover pre-patch line. | |||
| return GPUDeviceManager::GetInstance().SyncStream(stream); | |||
| } | |||
| void GPUDeviceAddress::ClearDeviceMemory() { | |||
| @@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet; | |||
| using mindspore::device::memswap::MemSwapManager; | |||
| using mindspore::device::memswap::SwapKind; | |||
| static const size_t PARAMETER_OUTPUT_INDEX = 0; | |||
| bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } | |||
| bool GPUKernelRuntime::SyncStream() { | |||
| if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) { | |||
| MS_LOG(ERROR) << "Call SyncStream error."; | |||
| return false; | |||
| } | |||
| FreeAndClearBufferPtrs(); | |||
| return true; | |||
| } | |||
| bool GPUKernelRuntime::Init() { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| @@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||
| } | |||
| } // namespace | |||
| bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) { | |||
| std::shared_ptr<char[]> buffer(new char[size]()); | |||
| MS_EXCEPTION_IF_NULL(buffer); | |||
| std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get()); | |||
| AddBufferPtr(buffer); | |||
| auto &stream = GPUDeviceManager::GetInstance().default_stream(); | |||
| MS_EXCEPTION_IF_NULL(stream); | |||
| auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed"; | |||
| return false; | |||
| } | |||
| return ret; | |||
| } | |||
| DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) { | |||
| return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id); | |||
| @@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime { | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| bool SyncStream() override; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override; | |||
| private: | |||
| GPUKernelRuntime(const GPUKernelRuntime &); | |||
| @@ -75,6 +75,7 @@ class KernelRuntime { | |||
| const std::unordered_set<ValueNodePtr> &value_nodes, | |||
| const std::vector<CNodePtr> &execution_order); | |||
| virtual bool SyncStream() = 0; | |||
| virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0; | |||
| virtual void ClearGlobalIdleMem() {} | |||
| virtual void CreateContext() {} | |||
| virtual void SetContext() {} | |||
| @@ -101,6 +102,8 @@ class KernelRuntime { | |||
| virtual void PreInit() {} | |||
| virtual uint64_t GetAvailableMemMaxSize() const { return 0; } | |||
| void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); } | |||
| // Drops all staged host buffers. Callers invoke this only after the stream has been | |||
| // synchronized (see SyncStream), so no in-flight async copy still references them. | |||
| void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); } | |||
| protected: | |||
| virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| @@ -149,6 +152,7 @@ class KernelRuntime { | |||
| void *stream_ = nullptr; | |||
| std::shared_ptr<MemoryManager> mem_manager_{nullptr}; | |||
| std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_; | |||
| std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {}; | |||
| }; | |||
| using KernelRuntimePtr = std::shared_ptr<KernelRuntime>; | |||
| } // namespace device | |||