| @@ -101,12 +101,22 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind) | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||
| auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| runtime_instance->SetContext(); | |||
| auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind); | |||
| if (ret_rt_memcpy != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed"; | |||
| // Only apply asynchronous copy in Pynative && RT_MEMCPY_HOST_TO_DEVICE mode | |||
| if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) { | |||
| auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind); | |||
| if (ret_rt_memcpy != RT_ERROR_NONE) { | |||
| MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed"; | |||
| } | |||
| } else { | |||
| auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind)); | |||
| if (!ret) { | |||
| MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed"; | |||
| } | |||
| } | |||
| } | |||
| @@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size | |||
| if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) { | |||
| return true; | |||
| } | |||
| SyncStream(); | |||
| bool sync_ok = false; | |||
| std::vector<size_t> host_shape; | |||
| (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize); | |||
| @@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() { | |||
| MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error."; | |||
| return false; | |||
| } | |||
| FreeAndClearBufferPtrs(); | |||
| return true; | |||
| } | |||
| bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) { | |||
| InnerSetContext(); | |||
| if (stream_ == nullptr) { | |||
| MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr"; | |||
| return false; | |||
| } | |||
| std::shared_ptr<char[]> buffer(new char[size]()); | |||
| MS_EXCEPTION_IF_NULL(buffer); | |||
| std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get()); | |||
| AddBufferPtr(buffer); | |||
| if (RT_ERROR_NONE != rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_)) { | |||
| MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error."; | |||
| return false; | |||
| } | |||
| return true; | |||
| } | |||
| @@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime { | |||
| const std::vector<CNodePtr> &execution_order) override; | |||
| void ClearGlobalIdleMem() override; | |||
| bool SyncStream() override; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override; | |||
| void SetContext() override; | |||
| void CreateContext() override; | |||
| void *context() const override { return rt_context_; } | |||
| @@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime { | |||
| protected: | |||
| bool SyncStream() override { return true; }; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; }; | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| @@ -18,7 +18,9 @@ | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "runtime/device/gpu/gpu_device_manager.h" | |||
| #include "runtime/device/kernel_runtime_manager.h" | |||
| #include "utils/log_adapter.h" | |||
| #include "utils/ms_context.h" | |||
| #include "runtime/device/gpu/gpu_memory_allocator.h" | |||
| #include "ir/tensor.h" | |||
| #ifdef ENABLE_DEBUGGER | |||
| @@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId | |||
| // nccl kernel input and output device address is aligned, may lead to host size is not equal to device size | |||
| MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_; | |||
| } | |||
| auto ms_context = MsContext::GetInstance(); | |||
| MS_EXCEPTION_IF_NULL(ms_context); | |||
| // NOTE(review): declared `bool` but assigned an int execution mode; the comparison below | |||
| // only behaves correctly if kPynativeMode happens to equal 1 — should be `auto`/`int`. TODO confirm and fix. | |||
| bool execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE); | |||
| if (execution_mode != kPynativeMode) { | |||
| // Non-PyNative path: copy on the caller-provided stream, then block until it drains. | |||
| if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) { | |||
| MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed"; | |||
| return false; | |||
| } | |||
| return GPUDeviceManager::GetInstance().SyncStream(stream); | |||
| } else { | |||
| // PyNative path: defer to the runtime's MemcpyAsync, which stages the host data in an | |||
| // owned buffer and releases it on the next SyncStream(). | |||
| auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID); | |||
| auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id); | |||
| MS_EXCEPTION_IF_NULL(runtime_instance); | |||
| return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0); | |||
| } | |||
| // NOTE(review): unreachable — both branches above return; this looks like a leftover pre-patch line. | |||
| return GPUDeviceManager::GetInstance().SyncStream(stream); | |||
| } | |||
| void GPUDeviceAddress::ClearDeviceMemory() { | |||
| @@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet; | |||
| using mindspore::device::memswap::MemSwapManager; | |||
| using mindspore::device::memswap::SwapKind; | |||
| static const size_t PARAMETER_OUTPUT_INDEX = 0; | |||
| bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } | |||
| bool GPUKernelRuntime::SyncStream() { | |||
| if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) { | |||
| MS_LOG(ERROR) << "Call SyncStream error."; | |||
| return false; | |||
| } | |||
| FreeAndClearBufferPtrs(); | |||
| return true; | |||
| } | |||
| bool GPUKernelRuntime::Init() { | |||
| auto context_ptr = MsContext::GetInstance(); | |||
| @@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel, | |||
| } | |||
| } // namespace | |||
| bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) { | |||
| std::shared_ptr<char[]> buffer(new char[size]()); | |||
| MS_EXCEPTION_IF_NULL(buffer); | |||
| std::copy(reinterpret_cast<const char *>(src), reinterpret_cast<const char *>(src) + size, buffer.get()); | |||
| AddBufferPtr(buffer); | |||
| auto &stream = GPUDeviceManager::GetInstance().default_stream(); | |||
| MS_EXCEPTION_IF_NULL(stream); | |||
| auto ret = GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream); | |||
| if (!ret) { | |||
| MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed"; | |||
| return false; | |||
| } | |||
| return ret; | |||
| } | |||
| DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) { | |||
| return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id); | |||
| @@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime { | |||
| DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| TypeId type_id) override; | |||
| bool SyncStream() override; | |||
| bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override; | |||
| private: | |||
| GPUKernelRuntime(const GPUKernelRuntime &); | |||
| @@ -75,6 +75,7 @@ class KernelRuntime { | |||
| const std::unordered_set<ValueNodePtr> &value_nodes, | |||
| const std::vector<CNodePtr> &execution_order); | |||
| virtual bool SyncStream() = 0; | |||
| virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0; | |||
| virtual void ClearGlobalIdleMem() {} | |||
| virtual void CreateContext() {} | |||
| virtual void SetContext() {} | |||
| @@ -101,6 +102,8 @@ class KernelRuntime { | |||
| virtual void PreInit() {} | |||
| virtual uint64_t GetAvailableMemMaxSize() const { return 0; } | |||
| void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); } | |||
| // Drops all staged host buffers. Callers invoke this only after the stream has been | |||
| // synchronized (see SyncStream), so no in-flight async copy still references them. | |||
| void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); } | |||
| protected: | |||
| virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format, | |||
| @@ -149,6 +152,7 @@ class KernelRuntime { | |||
| void *stream_ = nullptr; | |||
| std::shared_ptr<MemoryManager> mem_manager_{nullptr}; | |||
| std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_; | |||
| std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {}; | |||
| }; | |||
| using KernelRuntimePtr = std::shared_ptr<KernelRuntime>; | |||
| } // namespace device | |||