diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
index c94f0f71dc..e96cadb4a9 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_device_address.cc
@@ -101,12 +101,22 @@ void SyncMemory(void *dst, const void *src, uint64_t size, rtMemcpyKind_t kind)
   auto ms_context = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(ms_context);
   auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);
   auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id);
   MS_EXCEPTION_IF_NULL(runtime_instance);
   runtime_instance->SetContext();
-  auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
-  if (ret_rt_memcpy != RT_ERROR_NONE) {
-    MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+
+  // Copy asynchronously only for host-to-device transfers in PyNative mode; every other case copies synchronously.
+  if (execution_mode != kPynativeMode || kind != RT_MEMCPY_HOST_TO_DEVICE) {
+    auto ret_rt_memcpy = rtMemcpy(dst, size, src, size, kind);
+    if (ret_rt_memcpy != RT_ERROR_NONE) {
+      MS_EXCEPTION(DeviceProcessError) << "rtMemcpy failed";
+    }
+  } else {
+    auto ret = runtime_instance->MemcpyAsync(dst, src, size, static_cast<int32_t>(kind));
+    if (!ret) {
+      MS_EXCEPTION(DeviceProcessError) << "MemcpyAsync failed";
+    }
   }
 }
 
@@ -527,7 +537,7 @@ bool AscendDeviceAddress::SyncHostToDevice(const ShapeVector &shape, size_t size
   if (type_id_ > kMonadTypeBegin && type_id_ < kMonadTypeEnd) {
     return true;
   }
-  SyncStream();
+
   bool sync_ok = false;
   std::vector<size_t> host_shape;
   (void)std::transform(shape.begin(), shape.end(), std::back_inserter(host_shape), LongToSize);
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
index 13556fedf7..0432b77ba3 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.cc
@@ -718,6 +718,25 @@ bool AscendKernelRuntime::SyncStream() {
     MS_LOG(ERROR) << "Call runtime rtStreamSynchronize error.";
     return false;
   }
+  FreeAndClearBufferPtrs();
+  return true;
+}
+
+bool AscendKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  InnerSetContext();
+  if (stream_ == nullptr) {
+    MS_LOG(ERROR) << "MemcpyAsync failed. stream_ is nullptr";
+    return false;
+  }
+  // Stage src in a runtime-owned buffer; it stays referenced in buffer_ptrs_ until SyncStream() clears it.
+  // No zero-fill or null check needed: std::copy overwrites every byte and a failed `new` throws.
+  std::shared_ptr<char[]> buffer(new char[size]);
+  std::copy(static_cast<const char *>(src), static_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+  if (rtMemcpyAsync(dst, size, buffer.get(), size, static_cast<rtMemcpyKind_t>(kind), stream_) != RT_ERROR_NONE) {
+    MS_LOG(ERROR) << "Call runtime rtMemcpyAsync error.";
+    return false;
+  }
   return true;
 }
 
diff --git a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
index 1288670414..ccbf2de596 100644
--- a/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/ascend/ascend_kernel_runtime.h
@@ -51,6 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
                                  const std::vector<CNodePtr> &execution_order) override;
   void ClearGlobalIdleMem() override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
   void SetContext() override;
   void CreateContext() override;
   void *context() const override { return rt_context_; }
diff --git a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
index 2998841162..6eebb13809 100644
--- a/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/cpu/cpu_kernel_runtime.h
@@ -49,6 +49,7 @@ class CPUKernelRuntime : public KernelRuntime {
 
  protected:
   bool SyncStream() override { return true; };
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override { return true; };  // No-op stub.
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
 
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
index 4893ebdc38..a5e79b20a4 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_device_address.cc
@@ -18,7 +18,9 @@
 #include <vector>
 #include <memory>
 #include "runtime/device/gpu/gpu_device_manager.h"
+#include "runtime/device/kernel_runtime_manager.h"
 #include "utils/log_adapter.h"
+#include "utils/ms_context.h"
 #include "runtime/device/gpu/gpu_memory_allocator.h"
 #include "ir/tensor.h"
 #ifdef ENABLE_DEBUGGER
@@ -62,11 +64,22 @@ bool GPUDeviceAddress::SyncHostToDevice(const ShapeVector &, size_t size, TypeId
     // nccl kernel input and output device address is aligned, may lead to host size is not equal to device size
     MS_LOG(INFO) << "Sync memory size is inconsistent, host size: " << size << ", device size " << size_;
   }
-  if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
-    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
-    return false;
+
+  auto ms_context = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(ms_context);
+  auto execution_mode = ms_context->get_param<int>(MS_CTX_EXECUTION_MODE);  // keep the int; bool would collapse modes
+  if (execution_mode != kPynativeMode) {  // Non-PyNative: enqueue the copy, then wait for the stream.
+    if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(ptr_, host_ptr, size, stream)) {
+      MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+      return false;
+    }
+    return GPUDeviceManager::GetInstance().SyncStream(stream);
+  } else {  // PyNative: hand the copy to the runtime and return without syncing here.
+    auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
+    auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kGPUDevice, device_id);
+    MS_EXCEPTION_IF_NULL(runtime_instance);
+    return runtime_instance->MemcpyAsync(ptr_, host_ptr, size, 0);
+  }
-  return GPUDeviceManager::GetInstance().SyncStream(stream);
 }
 
 void GPUDeviceAddress::ClearDeviceMemory() {
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
index 8579dcbbe0..510161b50e 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc
@@ -48,7 +48,15 @@ using mindspore::device::memswap::MemSwapInfoSet;
 using mindspore::device::memswap::MemSwapManager;
 using mindspore::device::memswap::SwapKind;
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
-bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); }
+
+bool GPUKernelRuntime::SyncStream() {
+  if (!GPUDeviceManager::GetInstance().SyncStream(stream_)) {
+    MS_LOG(ERROR) << "Call SyncStream error.";
+    return false;
+  }
+  FreeAndClearBufferPtrs();  // Drop the staging buffers queued via AddBufferPtr() now that the sync succeeded.
+  return true;
+}
 
 bool GPUKernelRuntime::Init() {
   auto context_ptr = MsContext::GetInstance();
@@ -183,6 +191,22 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
 }
 }  // namespace
 
+bool GPUKernelRuntime::MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) {
+  // Stage src in a runtime-owned buffer; it stays referenced in buffer_ptrs_ until SyncStream() clears it.
+  // No zero-fill or null check needed: std::copy overwrites every byte and a failed `new` throws.
+  std::shared_ptr<char[]> buffer(new char[size]);
+  std::copy(static_cast<const char *>(src), static_cast<const char *>(src) + size, buffer.get());
+  AddBufferPtr(buffer);
+  // `kind` is not consulted here: this path always issues a host-to-device copy on the default stream.
+  auto &stream = GPUDeviceManager::GetInstance().default_stream();
+  MS_EXCEPTION_IF_NULL(stream);
+  if (!GPUDeviceManager::GetInstance().CopyHostMemToDeviceAsync(dst, buffer.get(), size, stream)) {
+    MS_LOG(ERROR) << "CopyHostMemToDeviceAsync failed";
+    return false;
+  }
+  return true;
+}
+
 DeviceAddressPtr GPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                                        TypeId type_id) {
   return std::make_shared<GPUDeviceAddress>(device_ptr, device_size, format, type_id);
diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
index 83b80ce0b0..1813c23b66 100644
--- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h
@@ -52,6 +52,7 @@ class GPUKernelRuntime : public KernelRuntime {
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
   bool SyncStream() override;
+  bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) override;
 
  private:
   GPUKernelRuntime(const GPUKernelRuntime &);
diff --git a/mindspore/ccsrc/runtime/device/kernel_runtime.h b/mindspore/ccsrc/runtime/device/kernel_runtime.h
index 54f7fb2052..df12b002aa 100644
--- a/mindspore/ccsrc/runtime/device/kernel_runtime.h
+++ b/mindspore/ccsrc/runtime/device/kernel_runtime.h
@@ -75,6 +75,7 @@ class KernelRuntime {
                                   const std::unordered_set<ValueNodePtr> &value_nodes,
                                   const std::vector<CNodePtr> &execution_order);
   virtual bool SyncStream() = 0;
+  virtual bool MemcpyAsync(void *dst, const void *src, uint64_t size, int32_t kind) = 0;
   virtual void ClearGlobalIdleMem() {}
   virtual void CreateContext() {}
   virtual void SetContext() {}
@@ -101,6 +102,8 @@ class KernelRuntime {
 
   virtual void PreInit() {}
   virtual uint64_t GetAvailableMemMaxSize() const { return 0; }
+  void AddBufferPtr(std::shared_ptr<char[]> ptr) { buffer_ptrs_.push_back(ptr); }  // Retain ptr until the next clear.
+  void FreeAndClearBufferPtrs() { buffer_ptrs_.clear(); }  // Drop every buffer reference retained so far.
 
  protected:
   virtual DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
@@ -149,6 +152,7 @@ class KernelRuntime {
   void *stream_ = nullptr;
   std::shared_ptr<MemoryManager> mem_manager_{nullptr};
   std::map<uint32_t, std::vector<DynamicKernelPtr>> graph_dynamic_kernel_map_;
+  std::vector<std::shared_ptr<char[]>> buffer_ptrs_ = {};
 };
 using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
 }  // namespace device