@@ -101,12 +101,22 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind)
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->SetContext();
-  auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
-  if (ret_rt_memcpy != RT_ERROR_NONE) {
-    MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+  // Apply the asynchronous copy only in PyNative mode and only for RT_MEMCPY_HOST_TO_DEVICE.
+  if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) {
+    auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
+    if (ret_rt_memcpy != RT_ERROR_NONE) {
+      MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+    }
+  } else {
+    auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind));
+    if (!ret) {
+      MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed";
+    }
   }
 }
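Note: the branch above decides between the blocking and the deferred copy path. In PyNative mode each operator is launched eagerly, so a blocking rtMemcpy on every host-to-device transfer serializes the Python frontend against the device; deferring those copies through MemcpyAsync lets the frontend run ahead and pays the wait once at the next SyncStream. Below is a minimal, self-contained C++ sketch of that contract; the names are toys, not MindSpore or Ascend runtime APIs, and std::async stands in for the device stream.

#include <cstring>
#include <future>
#include <iostream>

struct ToyStream {
  std::future<void> pending;
  void Sync() {
    if (pending.valid()) pending.get();  // stands in for SyncStream()
  }
};

void ToySyncMemory(void *dst, const void *src, size_t size, bool pynative, ToyStream *stream) {
  if (!pynative) {
    std::memcpy(dst, src, size);  // blocking path, like rtMemcpy
    return;
  }
  // Deferred path, like rtMemcpyAsync: returns before the copy necessarily runs.
  stream->pending = std::async(std::launch::async, [=] { std::memcpy(dst, src, size); });
}

int main() {
  char src[] = "tensor bytes";
  char dst[sizeof(src)] = {};
  ToyStream stream;
  ToySyncMemory(dst, src, sizeof(src), /*pynative=*/true, &stream);
  stream.Sync();  // data is only guaranteed visible after the sync
  std::cout << dst << std::endl;
  return 0;
}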
@@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
   if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
     return true;
   }
+  SyncStream();
   bool sync_ok = false;
   std::vector<size_t> host_shape;
   (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
@@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() {
     MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error.";
     return false;
   }
+  FreeAndClearBufferPtrs();
+  return true;
+}
+
+bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  InnerSetContext();
+  if (stream_ == nullptr) {
+    MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr";
+    return false;
+  }
+  std::shared_ptr<char[]> buffer(new char[size]());
+  MS_EXCEPTION_IF_NULL(buffer);
+  std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+  if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) {
+    MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error.";
+    return false;
+  }
   return true;
 }
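Note: rtMemcpyAsync only enqueues the copy; the device may read the host source well after MemcpyAsync returns, and the caller's buffer can be freed by then. The code above therefore snapshots the source into a heap buffer, registers it with AddBufferPtr, and releases it in SyncStream via FreeAndClearBufferPtrs once the stream has drained. A self-contained sketch of that lifetime rule follows; the names are toys, and a std::thread stands in for the device stream.

#include <algorithm>
#include <cstring>
#include <iostream>
#include <memory>
#include <thread>
#include <vector>

class ToyRuntime {
 public:
  void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(std::move(ptr)); }
  void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }  // only safe after a sync
 private:
  std::vector<std::shared_ptr<char[]>> buffer_ptrs_;
};

int main() {
  ToyRuntime runtime;
  char device[16] = {};
  std::thread stream;
  {
    char transient[16] = "host tensor";  // dies at the end of this scope
    std::shared_ptr<char[]> staging(new char[sizeof(transient)]());
    std::copy(transient, transient + sizeof(transient), staging.get());
    runtime.AddBufferPtr(staging);
    // The "async" copy reads the staging snapshot, never `transient`.
    stream = std::thread([&device, staging] { std::memcpy(device, staging.get(), sizeof(device)); });
  }
  stream.join();                     // stands in for SyncStream()
  runtime.FreeAndClearBufferPtrs();  // staging memory released only after the sync
  std::cout << device << std::endl;
  return 0;
}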
@@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
                     const std::vector<CNodePtr> &execution_order) override;
   void ClearGlobalIdleMem() override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
   void SetContext() override;
   void CreateContext() override;
   void *context() const override { return rt_context_; }
@@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime {
  protected:
   bool SyncStream() override { return true; };
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; };
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
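Note: the CPU runtime has no device stream, and its copies complete on the host before returning, so a stub that simply reports success satisfies the new interface here.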
@@ -18,7 +18,9 @@
 #include <vector>
 #include <memory>
 #include "runtime/device/gpu/gpu_device_manager.h"
+#include "runtime/device/kernel_runtime_manager.h"
 #include "utils/log_adapter.h"
+#include "utils/ms_context.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
 #include "ir/tensor.h"
 #ifdef ENABLE_DEBUGGER
@@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
     // nccl kernel input and output device address is aligned, may lead to host size is not equal to device size
     MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_;
   }
-  if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
-    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
-    return false;
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
+  if (execution_mode != kPynativeMode) {
+    if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
+      MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+      return false;
+    }
+    return GPUDeviceManager::GetInstance().SyncStream(stream);
+  } else {
+    auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0);
   }
-  return GPUDeviceManager::GetInstance().SyncStream(stream);
 }

 void GPUDeviceAddress::ClearDeviceMemory() {
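Note: the GPU path mirrors the Ascend one. In graph mode the copy stays effectively synchronous (an async enqueue followed immediately by SyncStream on the same stream); in PyNative mode it is handed to the runtime's MemcpyAsync. The kind argument is passed as 0 because the GPU implementation below always copies host to device and does not inspect it, and the kernel_runtime_manager.h and ms_context.h includes added above exist solely to support this dispatch. The execution-mode variable was also retyped from bool to auto here, since it is compared against the integer kPynativeMode.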
@@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet;
 using mindspore::device::memswap::MemSwapManager;
 using mindspore::device::memswap::SwapKind;
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
-bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
+bool GPUKernelRuntime::SyncStream() {
+  if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
+    MS_LOG(ERROR) << "Call SyncStream error.";
+    return false;
+  }
+  FreeAndClearBufferPtrs();
+  return true;
+}

 bool GPUKernelRuntime::Init() {
   auto context_ptr = MsContext::GetInstance();
@@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
   }
 }  // namespace

+bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  std::shared_ptr<char[]> buffer(new char[size]());
+  MS_EXCEPTION_IF_NULL(buffer);
+  std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+  auto &stream = GPUDeviceManager::GetInstance().default_stream();
+  MS_EXCEPTION_IF_NULL(stream);
+  auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream);
+  if (!ret) {
+    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+    return false;
+  }
+  return ret;
+}
+
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                        TypeId type_id) {
   return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
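Note: GPUKernelRuntime::MemcpyAsync enqueues on GPUDeviceManager's default_stream(), while SyncStream above waits on the runtime's stream_. Assuming stream_ is initialized to that same default stream (as GPUKernelRuntime::Init appears to do), FreeAndClearBufferPtrs releases each staging buffer only after its copy has completed; if the two streams ever diverged, a buffer could be freed while its copy is still pending.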
@@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime {
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;

  private:
   GPUKernelRuntime(const GPUKernelRuntime &);
@@ -75,6 +75,7 @@ class KernelRuntime {
                            const std::unordered_set<ValueNodePtr> &value_nodes,
                            const std::vector<CNodePtr> &execution_order);
   virtual bool SyncStream() = 0;
+  virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
   virtual void ClearGlobalIdleMem() {}
   virtual void CreateContext() {}
   virtual void SetContext() {}
@@ -101,6 +102,8 @@ class KernelRuntime {
   virtual void PreInit() {}
   virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
+  void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); }
+  void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }

  protected:
   virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
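Note: AddBufferPtr and FreeAndClearBufferPtrs implement a simple deferred-free pool. Each asynchronous copy parks one shared_ptr here, and clearing the vector at the next SyncStream drops the last references and frees the host staging memory. Between syncs, host memory grows with the total bytes copied, and the vector is not synchronized, so callers are presumably expected to stay on the runtime's launch thread.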
@@ -149,6 +152,7 @@ class KernelRuntime {
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
   std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_;
+  std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {};
 };
 using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
 }  // namespace device