diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h index e10bf76fd8..b299f4c4b1 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_copy_manager.h @@ -46,7 +46,7 @@ struct KernelExecutionInfo { size_t swap_in_task_num_{0}; // Key: output index, value: topo orders of node users std::map> node_users_map_; - // Key: output idx, value: (host addr, dirty or not) + // Key: output index, value: pair (host addr, dirty or not) std::map> host_addrs_; KernelExecutionInfo() {} @@ -105,7 +105,12 @@ class MemCopyManager { virtual void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {} - virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) {} + virtual void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr, bool profiling, + float *cost_time) {} + + virtual void AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) {} + + virtual void AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) {} virtual bool SyncMemCopyStream(SwapKind swap_kind) { return true; } @@ -113,11 +118,17 @@ class MemCopyManager { virtual DeviceAddressPtr UpdateSwapInQueue() { return nullptr; } + virtual DeviceAddressPtr UpdateSwapOutQueueMock() { return nullptr; } + + virtual DeviceAddressPtr UpdateSwapInQueueMock() { return nullptr; } + virtual bool AllocHostPinnedMem(size_t size, void **addr) const { return true; } virtual void FreeHostPinnedMem(void *addr) const {} virtual void ClearSwapQueue() {} + + virtual void ClearSwapQueueMock() {} }; using MemCopyManagerPtr = std::shared_ptr; using MemSwapInfoSet = std::set; diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc index 5f0569a41d..5cebc3714b 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.cc @@ -147,6 +147,30 @@ bool MemSwapManager::CheckDistanceBetweenKernels(const TensorInfo &tensor_info) return false; } +std::vector> MemSwapManager::CheckDistanceBetweenKernelsWithIdx( + const TensorInfo &tensor_info) const { + const AnfNodePtr &kernel = tensor_info.kernel_; + auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); + auto &node_users_map = kernel_exec_info.node_users_map_; + std::vector> need_swap_topo_pair_list; + + auto iter = node_users_map.find(tensor_info.output_idx_); + if (iter == node_users_map.end()) { + return need_swap_topo_pair_list; + } + auto &node_users = iter->second; + if (node_users.front() - kernel_exec_info.topo_order_ > distance_threshold_) { + need_swap_topo_pair_list.emplace_back(kernel_exec_info.topo_order_, node_users.front()); + } + + for (size_t i = 1; i < node_users.size(); ++i) { + if (node_users[i] - node_users[i - 1] > distance_threshold_) { + need_swap_topo_pair_list.emplace_back(node_users[i - 1], node_users[i]); + } + } + return need_swap_topo_pair_list; +} + bool MemSwapManager::IsCommunicationRelevantOp(const AnfNodePtr &kernel) const { MS_EXCEPTION_IF_NULL(kernel); if (AnfAlgo::IsCommunicationOp(kernel)) { @@ -201,56 +225,55 @@ void MemSwapManager::AddSwapInfo() { break; } - size_t output_idx = tensor.output_idx_; const AnfNodePtr &kernel = tensor.kernel_; if (IsCommunicationRelevantOp(kernel)) { continue; } - auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); - auto &node_users_map = kernel_exec_info.node_users_map_; - auto iter = node_users_map.find(output_idx); - if (iter == node_users_map.end()) { - continue; - } - auto &node_users = iter->second; - bool need_swap = (node_users.size() == 1 && node_users[0] - kernel_exec_info.topo_order_ >= distance_threshold_) || - (node_users.size() > 1 && node_users[1] - node_users[0] >= distance_threshold_); - if (!need_swap) { + auto need_swap_topo_pair_list = CheckDistanceBetweenKernelsWithIdx(tensor); + if (need_swap_topo_pair_list.empty()) { continue; } HostAddress host_addr; host_addr.size = tensor_size; - auto ret = AllocHostPinnedMem(tensor_size, reinterpret_cast(&host_addr.addr)); - if (!ret) { - MS_LOG(EXCEPTION) << "Alloc host pinned memory[" << tensor_size << "] failed."; - } + host_addr.addr = nullptr; + + size_t output_idx = tensor.output_idx_; + auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); kernel_exec_info.host_addrs_[output_idx] = std::make_pair(host_addr, true); - MemSwapInfo mem_swap_out_info = {SwapKind::kDeviceToHost, kernel_exec_info.topo_order_, output_idx, 0}; - if (node_users.size() > 1) { - AddKernelMemSwapInfo(execution_order_[node_users[0]], mem_swap_out_info); - } else { - AddKernelMemSwapInfo(kernel, mem_swap_out_info); - } - size_t swap_in_order = node_users.size() == 1 ? node_users[0] - 1 : node_users[1] - 1; - if (swap_in_order <= kernel_exec_info.topo_order_) { - MS_LOG(EXCEPTION) << "Select swap in point failed for op[" << AnfAlgo::GetCNodeName(kernel) << "]"; + for (auto &swap_topo_pair : need_swap_topo_pair_list) { + size_t swap_out_order = swap_topo_pair.first; + MemSwapInfo mem_swap_out_info = {SwapKind::kDeviceToHost, kernel_exec_info.topo_order_, output_idx, + swap_out_order}; + AddKernelMemSwapInfo(execution_order_[swap_out_order], mem_swap_out_info); + + size_t swap_in_order = swap_topo_pair.second - 1; + MemSwapInfo mem_swap_in_info = {SwapKind::kHostToDevice, kernel_exec_info.topo_order_, output_idx, + swap_out_order}; + if (swap_in_order <= swap_out_order) { + MS_LOG(EXCEPTION) << "Select swap in point failed for op[" << AnfAlgo::GetCNodeName(kernel) << "]"; + } + AddKernelMemSwapInfo(execution_order_[swap_in_order], mem_swap_in_info); } - auto swap_in_kernel = execution_order_[swap_in_order]; - MemSwapInfo mem_swap_in_info = {SwapKind::kHostToDevice, kernel_exec_info.topo_order_, output_idx, 0}; - AddKernelMemSwapInfo(swap_in_kernel, mem_swap_in_info); - - host_addrs_list_.push_back(host_addr); } } void MemSwapManager::AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, - const HostAddress &host_address) const { + const HostAddress &host_address, bool mock, bool profiling, + float *cost_time) const { + if (!mock) { + if (swap_kind == SwapKind::kDeviceToHost) { + mem_copy_manager_->AddMemSwapOutTask(device_address, host_address); + } else if (swap_kind == SwapKind::kHostToDevice) { + mem_copy_manager_->AddMemSwapInTask(device_address, host_address, profiling, cost_time); + } + } + if (swap_kind == SwapKind::kDeviceToHost) { - mem_copy_manager_->AddMemSwapOutTask(device_address, host_address); + mem_copy_manager_->AddMemSwapOutTaskMock(device_address); } else if (swap_kind == SwapKind::kHostToDevice) { - mem_copy_manager_->AddMemSwapInTask(device_address, host_address); + mem_copy_manager_->AddMemSwapInTaskMock(device_address); } } @@ -258,11 +281,19 @@ bool MemSwapManager::SyncMemCopyStream(SwapKind swap_kind) const { return mem_copy_manager_->SyncMemCopyStream(swap_kind); } -DeviceAddressPtr MemSwapManager::UpdateSwapQueue(SwapKind swap_kind) const { +DeviceAddressPtr MemSwapManager::UpdateSwapQueue(SwapKind swap_kind, bool mock) const { + if (!mock) { + if (swap_kind == SwapKind::kDeviceToHost) { + return mem_copy_manager_->UpdateSwapOutQueue(); + } else { + return mem_copy_manager_->UpdateSwapInQueue(); + } + } + if (swap_kind == SwapKind::kDeviceToHost) { - return mem_copy_manager_->UpdateSwapOutQueue(); + return mem_copy_manager_->UpdateSwapOutQueueMock(); } else { - return mem_copy_manager_->UpdateSwapInQueue(); + return mem_copy_manager_->UpdateSwapInQueueMock(); } } @@ -273,19 +304,7 @@ bool MemSwapManager::RetreatSwapInfo() { } if (swap_info_already_set_) { ResetSwapInfo(); - if (distance_threshold_ >= kDistanceLowerBound) { - auto distance_decay_step = execution_order_.size() / kDistanceInitFactor / tensor_size_num_; - distance_threshold_ -= (distance_decay_step > 1 ? distance_decay_step : 1); - } - - while (tensor_size_threshold_idx_ < ordered_tensors_.size() - 1) { - ++tensor_size_threshold_idx_; - if (tensor_size_threshold_ > ordered_tensors_[tensor_size_threshold_idx_].tensor_size_) { - tensor_size_threshold_ = ordered_tensors_[tensor_size_threshold_idx_].tensor_size_; - break; - } - } - + RetreatSwapThreshold(); if (tensor_size_threshold_idx_ == ordered_tensors_.size() - 1 && distance_threshold_ < kDistanceLowerBound) { MS_LOG(ERROR) << "Retreat swap info failed"; return false; @@ -373,7 +392,7 @@ bool MemSwapManager::QueryFirstTimeMovePos(const AnfNodePtr &kernel, size_t inde } size_t MemSwapManager::BestSwapInPerformPos(const AnfNodePtr &trigger_kernel, const MemSwapInfo &mem_swap_info) const { - auto need_swap_kernel = QueryKerneByTopoOrder(mem_swap_info.topo_order_); + auto need_swap_kernel = QueryKernelByTopoOrder(mem_swap_info.topo_order_); const PerformPair &perform_pair = QueryKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_); float swap_in_cost_time = perform_pair.second; size_t swap_out_pos = mem_swap_info.swap_out_pos_; @@ -383,11 +402,11 @@ size_t MemSwapManager::BestSwapInPerformPos(const AnfNodePtr &trigger_kernel, co size_t pos = trigger_kernel_pos; for (; pos > swap_out_pos + 1; pos--) { - auto kernel = QueryKerneByTopoOrder(pos - 1); + auto kernel = QueryKernelByTopoOrder(pos - 1); if (QueryKernelTriggerSwapIn(kernel)) { return pos; } - kernel_execution_time += QueryKernelExecutionPerform(QueryKerneByTopoOrder(pos)); + kernel_execution_time += QueryKernelExecutionPerform(QueryKernelByTopoOrder(pos)); if (kernel_execution_time >= swap_in_cost_time) { return pos - 1; } @@ -399,8 +418,8 @@ void MemSwapManager::MoveSwapInfoPos(size_t des_pos, size_t src_pos, const MemSw if (des_pos == src_pos) { MS_LOG(EXCEPTION) << "destination pos can not equal source pos"; } - auto des_kernel = QueryKerneByTopoOrder(des_pos); - auto src_kernel = QueryKerneByTopoOrder(src_pos); + auto des_kernel = QueryKernelByTopoOrder(des_pos); + auto src_kernel = QueryKernelByTopoOrder(src_pos); AddKernelMemSwapInfo(des_kernel, mem_swap_info); RemoveKernelMemSwapInfo(src_kernel, mem_swap_info); } @@ -422,7 +441,10 @@ void MemSwapManager::AddKernelExecutionPerform(const AnfNodePtr &kernel, float p void MemSwapManager::AddKernelSwapPerform(const AnfNodePtr &kernel, size_t output_idx, const std::pair &perform) { MS_EXCEPTION_IF_NULL(kernel); - kernel_swap_perform_[kernel.get()][output_idx] = perform; + auto iter = kernel_swap_perform_.find(kernel.get()); + if (iter == kernel_swap_perform_.end()) { + kernel_swap_perform_[kernel.get()][output_idx] = perform; + } } void MemSwapManager::AddKernelMemSwapInfo(const AnfNodePtr &kernel, const MemSwapInfo &mem_swap_info) { @@ -485,13 +507,18 @@ size_t MemSwapManager::QueryKernelTriggerSwapInTaskNum(const AnfNodePtr &kernel) return kernel_exec_info.swap_in_task_num_; } -const AnfNodePtr MemSwapManager::QueryKerneByTopoOrder(size_t index) const { +const AnfNodePtr MemSwapManager::QueryKernelByTopoOrder(size_t index) const { if (index >= execution_order_.size()) { MS_LOG(EXCEPTION) << "Index [" << index << "] out of range"; } return execution_order_[index]; } +size_t MemSwapManager::QueryKernelTopoOrder(const AnfNodePtr &kernel) const { + const auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); + return kernel_exec_info.topo_order_; +} + const PerformPair &MemSwapManager::QueryKernelSwapPerform(const AnfNodePtr &kernel, size_t output_idx) const { MS_EXCEPTION_IF_NULL(kernel); auto iter_kernel = kernel_swap_perform_.find(kernel.get()); @@ -572,13 +599,6 @@ void MemSwapManager::ResetHostAddrIsDirty() { } } -void MemSwapManager::InsertSwapInBlackList(const void *device_ptr) { swap_in_blacklist_.insert(device_ptr); } - -bool MemSwapManager::FindInSwapInBlackList(const void *device_ptr) const { - auto iter = swap_in_blacklist_.find(device_ptr); - return iter != swap_in_blacklist_.end(); -} - bool MemSwapManager::AllocHostPinnedMem(size_t size, void **addr) const { return mem_copy_manager_->AllocHostPinnedMem(size, addr); } @@ -592,10 +612,16 @@ void MemSwapManager::ReleaseHostPinnedMem() { host_addrs_list_.clear(); } -void MemSwapManager::ClearSwapQueue() const { mem_copy_manager_->ClearSwapQueue(); } +void MemSwapManager::ClearSwapQueue(bool mock) const { + if (!mock) { + mem_copy_manager_->ClearSwapQueue(); + } else { + mem_copy_manager_->ClearSwapQueueMock(); + } +} void MemSwapManager::ResetSwapInfo() { - ClearSwapQueue(); + ClearSwapQueue(true); for (auto &kernel_exec_info_pair : kernel_execution_info_) { auto &kernel_exec_info = kernel_exec_info_pair.second; kernel_exec_info.trigger_swap_out_ = false; @@ -603,10 +629,53 @@ void MemSwapManager::ResetSwapInfo() { kernel_exec_info.swap_in_task_num_ = 0; kernel_exec_info.host_addrs_.clear(); } - ReleaseHostPinnedMem(); - swap_in_blacklist_.clear(); mem_swap_info_map_.clear(); } + +void MemSwapManager::DumpSwapInfo() const { + for (auto &kernel : execution_order_) { + if (!QueryKernelTriggerSwap(kernel)) { + continue; + } + auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); + MS_LOG(WARNING) << "Trigger kernel topo order[" << kernel_exec_info.topo_order_ << "] , op name[" + << AnfAlgo::GetCNodeName(kernel) << "]"; + + const MemSwapInfoSet &mem_swap_info_set = QueryKernelMemSwapInfo(kernel); + for (auto &mem_swap_info : mem_swap_info_set) { + if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { + MS_LOG(WARNING) << " Swap Out Task: swapped kernel topo order[" << mem_swap_info.topo_order_ << "], op name[" + << AnfAlgo::GetCNodeName(QueryKernelByTopoOrder(mem_swap_info.topo_order_)) << "], output idx[" + << mem_swap_info.output_idx_ << "]"; + } else { + MS_LOG(WARNING) << " Swap In Task: swapped kernel topo order[" << mem_swap_info.topo_order_ << "], op name[" + << AnfAlgo::GetCNodeName(QueryKernelByTopoOrder(mem_swap_info.topo_order_)) << "], output idx[" + << mem_swap_info.output_idx_ << "]"; + } + } + } +} + +void MemSwapManager::DumpUserNodes() const { + for (auto &kernel : execution_order_) { + const auto &kernel_exec_info = SearchKernelExecutionInfo(kernel); + const auto &node_users_map = kernel_exec_info.node_users_map_; + MS_LOG(WARNING) << "Kernel topo order[" << kernel_exec_info.topo_order_ << "], op name[" + << AnfAlgo::GetCNodeName(kernel) << "]"; + if (node_users_map.empty()) { + MS_LOG(WARNING) << " Kernel does not own any user node"; + } + + for (auto &item : node_users_map) { + size_t output_idx = item.first; + auto &node_users = item.second; + for (auto &order : node_users) { + MS_LOG(WARNING) << " Output index[" << output_idx << "] tensor is used by kernel[" + << AnfAlgo::GetCNodeName(QueryKernelByTopoOrder(order)) << "], topo order[" << order << "]"; + } + } + } +} } // namespace memswap } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h index 812e6e5fea..7799726f15 100644 --- a/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h +++ b/mindspore/ccsrc/backend/optimizer/mem_reuse/mem_swap_manager.h @@ -48,12 +48,12 @@ class MemSwapManager { bool Init(const mindspore::session::KernelGraph *kernel_graph, size_t swap_mem_size = 0); - void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, - const HostAddress &host_address) const; + void AddMemSwapTask(SwapKind swap_kind, const DeviceAddressPtr &device_address, const HostAddress &host_address, + bool mock, bool profiling = false, float *cost_time = nullptr) const; bool SyncMemCopyStream(SwapKind swap_kind) const; - DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind) const; + DeviceAddressPtr UpdateSwapQueue(SwapKind swap_kind, bool mock) const; bool RetreatSwapInfo(); @@ -63,8 +63,6 @@ class MemSwapManager { bool mem_swap_init() const { return mem_swap_initialized_; } - KernelExecutionInfo &SearchKernelExecutionInfo(const AnfNodePtr &kernel) const; - void AddKernelExecutionPerform(const AnfNodePtr &kernel, float perform); float QueryKernelExecutionPerform(const AnfNodePtr &kernel) const; @@ -79,7 +77,9 @@ class MemSwapManager { size_t QueryKernelTriggerSwapInTaskNum(const AnfNodePtr &kernel) const; - const AnfNodePtr QueryKerneByTopoOrder(size_t index) const; + const AnfNodePtr QueryKernelByTopoOrder(size_t index) const; + + size_t QueryKernelTopoOrder(const AnfNodePtr &kernel) const; const MemSwapInfoSet &QueryKernelMemSwapInfo(const AnfNodePtr &kernel) const; @@ -93,17 +93,19 @@ class MemSwapManager { void ResetHostAddrIsDirty(); - void InsertSwapInBlackList(const void *device_ptr); - - bool FindInSwapInBlackList(const void *device_ptr) const; - bool AllocHostPinnedMem(size_t size, void **addr) const; void ReleaseHostPinnedMem(); - void ClearSwapQueue() const; + void ClearSwapQueue(bool mock) const; + + void DumpSwapInfo() const; + + void DumpUserNodes() const; private: + KernelExecutionInfo &SearchKernelExecutionInfo(const AnfNodePtr &kernel) const; + void AddSwapInfo(); void ResetSwapInfo(); @@ -130,6 +132,8 @@ class MemSwapManager { bool CheckDistanceBetweenKernels(const TensorInfo &tensor_info) const; + std::vector> CheckDistanceBetweenKernelsWithIdx(const TensorInfo &tensor_info) const; + bool IsCommunicationRelevantOp(const AnfNodePtr &kernel) const; std::vector execution_order_; @@ -139,7 +143,6 @@ class MemSwapManager { // Key: trigger swap kernel, value: MemSwapInfoSet of kernel need to be swapped std::unordered_map mem_swap_info_map_; std::vector host_addrs_list_; - std::unordered_set swap_in_blacklist_; // Key: cache kernel address, value: lists of first time move pos or not std::map> kernel_first_move_cache_map_; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc index 2b9f437977..3a5d9ca34a 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.cc @@ -112,7 +112,7 @@ void GPUKernelRuntime::ReleaseDeviceRes() { auto &mem_swap_manager = item.second; MS_EXCEPTION_IF_NULL(mem_swap_manager); if (mem_swap_manager->trigger_swap()) { - mem_swap_manager->ClearSwapQueue(); + mem_swap_manager->ClearSwapQueue(false); mem_swap_manager->ReleaseHostPinnedMem(); } } @@ -141,6 +141,7 @@ void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) { InitMemorySwapInfo(graph); InitKernelOutputAddress(graph); InitKernelWorkspaceAddress(graph); + SaveGraphOutputNode(graph); } else { AssignDynamicMemory(graph); } @@ -168,12 +169,8 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { } mem_reuse_util_ = mem_reuse_iter->second; MS_EXCEPTION_IF_NULL(mem_reuse_util_); - while (!LaunchKernelDynamic(graph)) { - MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; - if (!UpdateMemorySwapInfo(graph)) { - return false; - } - } + + ret = RunOneStep(graph); } else { ret = LaunchKernel(graph); } @@ -185,7 +182,29 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) { return ret; } +bool GPUKernelRuntime::RunOneStep(const session::KernelGraph *graph) { + bool ret = true; + auto graph_id = graph->graph_id(); + if (!is_first_step_map_[graph_id]) { + // Normally run graph + ret = LaunchKernelDynamic(graph); + } else { + // Mock run first step + ret = LaunchKernelDynamic(graph, true, false); + if (ret) { + // Normally run graph + ret = LaunchKernelDynamic(graph); + } else { + // Trigger memory swap + ret = SearchMemSwapScheme(graph); + } + is_first_step_map_[graph_id] = false; + } + return ret; +} + bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) { + MS_LOG(WARNING) << "Run out of memory and try memory swapping, it may take some time, please wait a moment."; bool ret = false; ClearKernelOldOutputAndWorkspace(graph); if (!mem_swap_manager_->mem_swap_init()) { @@ -214,6 +233,7 @@ bool GPUKernelRuntime::SearchMemSwapScheme(const session::KernelGraph *graph) { } bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) { + MS_LOG(WARNING) << "Refine memory swap scheme, it may take some time, please wait a moment."; auto &kernels = graph->execution_order(); for (const auto &kernel : kernels) { if (!mem_swap_manager_->QueryKernelTriggerSwapIn(kernel)) { @@ -228,6 +248,7 @@ bool GPUKernelRuntime::RefineMemSwapScheme(const session::KernelGraph *graph) { ret = LaunchKernelDynamic(graph, true, false); if (!ret) { ClearKernelOldOutputAndWorkspace(graph); + ClearSwapInfo(true); } } } @@ -297,6 +318,26 @@ void GPUKernelRuntime::InitKernelWorkspaceAddress(const session::KernelGraph *gr } } +void GPUKernelRuntime::SaveGraphOutputNode(const session::KernelGraph *graph) { + MS_EXCEPTION_IF_NULL(graph); + auto graph_id = graph->graph_id(); + const auto &output_nodes = AnfAlgo::GetAllOutput(graph->output(), {prim::kPrimTupleGetItem}); + for (const auto &node : output_nodes) { + graph_output_map_[graph_id].insert(node); + } +} + +bool GPUKernelRuntime::IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const { + MS_EXCEPTION_IF_NULL(graph); + auto graph_id = graph->graph_id(); + auto iter = graph_output_map_.find(graph_id); + if (iter == graph_output_map_.end()) { + MS_LOG(EXCEPTION) << "Find graph output info failed."; + } + auto &graph_output_set = iter->second; + return (graph_output_set.find(kernel) != graph_output_set.end()); +} + void GPUKernelRuntime::ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph) { ClearKernelOutputAddress(graph); ClearKernelWorkspaceAddress(graph); @@ -306,6 +347,9 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap MS_EXCEPTION_IF_NULL(graph); auto &kernels = graph->execution_order(); for (const auto &kernel : kernels) { + if (IsGraphOutput(graph, kernel)) { + continue; + } auto kernel_mod = AnfAlgo::GetKernelMod(kernel); MS_EXCEPTION_IF_NULL(kernel_mod); auto output_sizes = kernel_mod->GetOutputSizeList(); @@ -354,18 +398,27 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo AddressPtrList kernel_inputs; AddressPtrList kernel_workspaces; AddressPtrList kernel_outputs; - auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs); + auto ret = AllocKernelDynamicRes(*kernel_mod, kernel, &kernel_inputs, &kernel_workspaces, &kernel_outputs, mock); if (!ret) { return false; } - if (!kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_)) { - MS_LOG(EXCEPTION) << "Launch kernel failed."; + if (!mock) { + if (!profiling) { + CHECK_OP_RET_WITH_EXCEPT(kernel_mod->Launch(kernel_inputs, kernel_workspaces, kernel_outputs, stream_), + "Launch kernel failed."); + } else { + LaunchKernelWithTimeProfiling(kernel, kernel_inputs, kernel_workspaces, kernel_outputs); + } } FreeKernelDynamicRes(kernel); - UpdateMemorySwapTask(kernel); + if (!UpdateMemorySwapTask(kernel, mock, profiling)) { + return false; + } + } + if (!mock) { + CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); } - CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); - ClearSwapQueue(); + ClearSwapInfo(mock); return true; } @@ -393,29 +446,38 @@ void GPUKernelRuntime::LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, c CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(end), "Failed to destroy event."); } -bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel) { +bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { MS_EXCEPTION_IF_NULL(mem_swap_manager_); const MemSwapInfoSet &mem_swap_info_set = mem_swap_manager_->QueryKernelMemSwapInfo(kernel); for (auto &mem_swap_info : mem_swap_info_set) { - auto need_swap_kernel = mem_swap_manager_->QueryKerneByTopoOrder(mem_swap_info.topo_order_); + auto need_swap_kernel = mem_swap_manager_->QueryKernelByTopoOrder(mem_swap_info.topo_order_); MS_EXCEPTION_IF_NULL(need_swap_kernel); const HostAddress &host_address = mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_); auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false); if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { - mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address); + if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) { + mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address, mock); + mem_swap_manager_->AddKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_, false); + } else { + mem_manager_->FreeMemFromMemPool(device_address); + device_address->set_status(DeviceAddressStatus::kInHost); + } } else if (mem_swap_info.swap_kind_ == SwapKind::kHostToDevice) { auto status = device_address->status(); if (status == DeviceAddressStatus::kInDeviceToHost) { - mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_); device_address->set_status(DeviceAddressStatus::kInDevice); } else if (status == DeviceAddressStatus::kInHost) { - if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_)) { + if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) { return false; } - if (!mem_swap_manager_->FindInSwapInBlackList(device_address->ptr_)) { - mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address); + float cost_time = 0; + mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling, + &cost_time); + if (profiling) { + mem_swap_manager_->AddKernelSwapPerform(need_swap_kernel, mem_swap_info.output_idx_, + std::make_pair(0, cost_time)); } } } @@ -423,87 +485,81 @@ bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel) { return true; } -bool GPUKernelRuntime::UpdateMemorySwapInfo(const session::KernelGraph *graph) { - MS_EXCEPTION_IF_NULL(mem_swap_manager_); - ClearKernelOldOutputAndWorkspace(graph); - if (!mem_swap_manager_->mem_swap_init()) { - if (!mem_swap_manager_->Init(graph)) { - return false; - } - } - return mem_swap_manager_->RetreatSwapInfo(); -} - -bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel) { +bool GPUKernelRuntime::UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling) { MS_EXCEPTION_IF_NULL(mem_swap_manager_); if (!mem_swap_manager_->trigger_swap()) { return true; } if (mem_swap_manager_->QueryKernelTriggerSwap(kernel)) { - CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); - if (!AddMemorySwapTask(kernel)) { + if (!mock) { + CHECK_OP_RET_WITH_EXCEPT(SyncStream(), "SyncStream failed."); + } + if (!AddMemorySwapTask(kernel, mock, profiling)) { return false; } + if (!mock) { + CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed."); + } } - CHECK_OP_RET_WITH_EXCEPT(mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost), "SyncCopyStream failed."); return true; } -void GPUKernelRuntime::UpdateHostSwapQueue(const DeviceAddressPtr device_address) { +void GPUKernelRuntime::UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock) { MS_EXCEPTION_IF_NULL(mem_swap_manager_); if (!mem_swap_manager_->trigger_swap()) { return; } - while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) { + while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) { device_address_swap_in->set_status(DeviceAddressStatus::kInDevice); } + auto status = device_address->status(); switch (status) { case DeviceAddressStatus::kInDevice: break; case DeviceAddressStatus::kInDeviceToHost: { - mem_swap_manager_->InsertSwapInBlackList(device_address->ptr_); device_address->set_status(DeviceAddressStatus::kInDevice); break; } case DeviceAddressStatus::kInHostToDevice: { while (device_address->status() != DeviceAddressStatus::kInDevice) { - while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) { + while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice, mock)) { device_address_swap_in->set_status(DeviceAddressStatus::kInDevice); } } break; } case DeviceAddressStatus::kInHost: - MS_LOG(ERROR) << "Invaild device address status:" << status; + MS_LOG(WARNING) << "Unexpected device address status: " << status; break; default: - MS_LOG(EXCEPTION) << "Invaild device address status:" << status; + MS_LOG(EXCEPTION) << "Invaild device address status: " << status; } } -void GPUKernelRuntime::UpdateDeviceSwapQueue() { +void GPUKernelRuntime::UpdateHostSwapOutQueue(bool mock) { MS_EXCEPTION_IF_NULL(mem_swap_manager_); if (!mem_swap_manager_->trigger_swap()) { return; } - while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) { - if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) { + while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost, mock)) { + if (device_address_swap_out->status() == DeviceAddressStatus::kInDeviceToHost && device_address_swap_out->ptr_) { device_address_swap_out->set_status(DeviceAddressStatus::kInHost); mem_manager_->FreeMemFromMemPool(device_address_swap_out); } } } -void GPUKernelRuntime::ClearSwapQueue() { +void GPUKernelRuntime::ClearSwapInfo(bool mock) { MS_EXCEPTION_IF_NULL(mem_swap_manager_); if (!mem_swap_manager_->trigger_swap()) { return; } - mem_swap_manager_->ClearSwapQueue(); + mem_swap_manager_->ClearSwapQueue(mock); + mem_swap_manager_->ResetHostAddrIsDirty(); } -bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size) { +bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock) { MS_EXCEPTION_IF_NULL(mem_manager_); MS_EXCEPTION_IF_NULL(mem_swap_manager_); auto ret = mem_manager_->MallocMemFromMemPool(device_address, size); @@ -511,13 +567,11 @@ bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, if (!mem_swap_manager_->trigger_swap()) { return false; } - mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost); - while (auto device_address_swap_out = mem_swap_manager_->UpdateSwapQueue(SwapKind::kDeviceToHost)) { - if (!mem_swap_manager_->FindInSwapInBlackList(device_address_swap_out->ptr_) && device_address_swap_out->ptr_) { - device_address_swap_out->set_status(DeviceAddressStatus::kInHost); - mem_manager_->FreeMemFromMemPool(device_address_swap_out); - } + if (!mock) { + mem_swap_manager_->SyncMemCopyStream(SwapKind::kDeviceToHost); } + UpdateHostSwapOutQueue(mock); + ret = mem_manager_->MallocMemFromMemPool(device_address, size); if (!ret) { return false; @@ -528,20 +582,22 @@ bool GPUKernelRuntime::AttemptMallocMem(const DeviceAddressPtr &device_address, bool GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, - AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs) { - if (!AllocKernelInputDynamicRes(kernel, kernel_inputs)) { + AddressPtrList *kernel_workspaces, AddressPtrList *kernel_outputs, + bool mock) { + if (!AllocKernelInputDynamicRes(kernel, kernel_inputs, mock)) { return false; } - if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs)) { + if (!AllocKernelOutputDynamicRes(kernel_mod, kernel, kernel_outputs, mock)) { return false; } - if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces)) { + if (!AllocKernelWorkspaceDynamicRes(kernel_mod, kernel, kernel_workspaces, mock)) { return false; } return true; } -bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs) { +bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, + bool mock) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_inputs); MS_EXCEPTION_IF_NULL(mem_reuse_util_); @@ -555,7 +611,7 @@ bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &k device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); } MS_EXCEPTION_IF_NULL(device_address); - UpdateHostSwapQueue(device_address); + UpdateHostSwapInQueue(device_address, mock); MS_EXCEPTION_IF_NULL(device_address->ptr_); kernel::AddressPtr input = std::make_shared(); MS_EXCEPTION_IF_NULL(input); @@ -567,16 +623,16 @@ bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &k } bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_outputs) { + const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_outputs, + bool mock) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_outputs); - UpdateDeviceSwapQueue(); + UpdateHostSwapOutQueue(mock); auto output_sizes = kernel_mod.GetOutputSizeList(); for (size_t i = 0; i < output_sizes.size(); ++i) { auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); MS_EXCEPTION_IF_NULL(device_address); - if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i])) { + if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) { return false; } kernel::AddressPtr output = std::make_shared(); @@ -590,7 +646,7 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_workspaces) { + AddressPtrList *kernel_workspaces, bool mock) { MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(kernel_workspaces); auto workspace_sizes = kernel_mod.GetWorkspaceSizeList(); @@ -600,7 +656,7 @@ bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::K continue; } auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i); - if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i])) { + if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) { return false; } kernel::AddressPtr workspace = std::make_shared(); diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h index 12141c0c03..9a210c8e77 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_kernel_runtime.h @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include "runtime/device/kernel_runtime.h" @@ -55,23 +56,27 @@ class GPUKernelRuntime : public KernelRuntime { void InitKernelOutputAddress(const session::KernelGraph *graph); void InitKernelWorkspaceAddress(const session::KernelGraph *graph); void InitMemorySwapInfo(const session::KernelGraph *graph); + void SaveGraphOutputNode(const session::KernelGraph *graph); + bool IsGraphOutput(const session::KernelGraph *graph, const mindspore::AnfNodePtr &kernel) const; void ClearKernelOutputAddress(const session::KernelGraph *graph); void ClearKernelWorkspaceAddress(const session::KernelGraph *graph); void ClearKernelOldOutputAndWorkspace(const session::KernelGraph *graph); + bool RunOneStep(const session::KernelGraph *graph); bool SearchMemSwapScheme(const session::KernelGraph *graph); bool RefineMemSwapScheme(const session::KernelGraph *graph); bool LaunchKernelDynamic(const session::KernelGraph *graph, bool mock = false, bool profiling = false); void LaunchKernelWithTimeProfiling(const AnfNodePtr &kernel, const AddressPtrList &inputs, const AddressPtrList &workspace, const AddressPtrList &outputs); - bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size); + bool AttemptMallocMem(const DeviceAddressPtr &device_address, size_t size, bool mock); bool AllocKernelDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, AddressPtrList *kernel_workspaces, - AddressPtrList *kernel_outputs); - bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs); + AddressPtrList *kernel_outputs, bool mock); + bool AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_inputs, bool mock); bool AllocKernelOutputDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel, - AddressPtrList *kernel_outputs); + AddressPtrList *kernel_outputs, bool mock); bool AllocKernelWorkspaceDynamicRes(const mindspore::kernel::KernelMod &kernel_mod, - const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces); + const mindspore::AnfNodePtr &kernel, AddressPtrList *kernel_workspaces, + bool mock); void AllocCommunicationOpDynamicRes(const session::KernelGraph *graph); void AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel); void AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel); @@ -79,15 +84,16 @@ class GPUKernelRuntime : public KernelRuntime { const DeviceAddressPtrList addr_list, size_t total_size, std::vector size_list); void FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel); - bool AddMemorySwapTask(const AnfNodePtr &kernel); - bool UpdateMemorySwapInfo(const session::KernelGraph *graph); - bool UpdateMemorySwapTask(const AnfNodePtr &kernel); - void UpdateHostSwapQueue(const DeviceAddressPtr device_address); - void UpdateDeviceSwapQueue(); - void ClearSwapQueue(); + bool UpdateMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); + bool AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bool profiling); + void UpdateHostSwapInQueue(const DeviceAddressPtr device_address, bool mock); + void UpdateHostSwapOutQueue(bool mock); + void ClearSwapInfo(bool mock); std::unordered_map mem_reuse_util_map_; std::unordered_map mem_swap_map_; std::unordered_map is_first_step_map_; + std::unordered_map> graph_output_map_; + MemReuseUtilPtr mem_reuse_util_{nullptr}; MemSwapManagerPtr mem_swap_manager_{nullptr}; }; diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.cc b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.cc index 0406c0f151..74cae92dea 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.cc +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.cc @@ -47,11 +47,20 @@ void GPUMemCopyManager::AddMemSwapOutTask(const DeviceAddressPtr &device_address swap_out_queue_.emplace(device_address, event); } -void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) { +void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr, + bool profiling, float *cost_time) { MS_EXCEPTION_IF_NULL(device_address); MS_EXCEPTION_IF_NULL(host_addr.addr); - DeviceEvent event = nullptr; - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&event, cudaEventDisableTiming), "Failed to create CUDA event."); + DeviceEvent start = nullptr; + DeviceEvent end = nullptr; + if (profiling) { + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&start), "Failed to create CUDA event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end), "Failed to create CUDA event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(start, swap_in_stream_), + "Failed to record CUDA event to swap in stream."); + } else { + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::CreateEvent(&end, cudaEventDisableTiming), "Failed to create CUDA event."); + } DeviceMemPtr device_ptr = const_cast(device_address->GetPtr()); MS_EXCEPTION_IF_NULL(device_ptr); device_address->set_status(DeviceAddressStatus::kInHostToDevice); @@ -59,9 +68,27 @@ void GPUMemCopyManager::AddMemSwapInTask(const DeviceAddressPtr &device_address, CHECK_OP_RET_WITH_EXCEPT( CudaDriver::CopyHostMemToDeviceAsync(device_ptr, host_addr.addr, host_addr.size, swap_in_stream_), "Failed to copy host memory to device."); - CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(event, swap_in_stream_), + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::RecordEvent(end, swap_in_stream_), "Failed to record CUDA event to swap in stream."); - swap_in_queue_.emplace(device_address, event); + if (profiling) { + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(start), "Failed to sync event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::SyncEvent(end), "Failed to sync event."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::ElapsedTime(cost_time, start, end), "Failed to record elapsed time."); + CHECK_OP_RET_WITH_EXCEPT(CudaDriver::DestroyEvent(start), "Failed to destroy event."); + } + swap_in_queue_.emplace(device_address, end); +} + +void GPUMemCopyManager::AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) { + MS_EXCEPTION_IF_NULL(device_address); + device_address->set_status(DeviceAddressStatus::kInDeviceToHost); + swap_out_queue_mock_.emplace(device_address); +} + +void GPUMemCopyManager::AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) { + MS_EXCEPTION_IF_NULL(device_address); + device_address->set_status(DeviceAddressStatus::kInHostToDevice); + swap_in_queue_mock_.emplace(device_address); } bool GPUMemCopyManager::SyncMemCopyStream(SwapKind swap_kind) { @@ -104,6 +131,24 @@ DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueue() { return device_address; } +DeviceAddressPtr GPUMemCopyManager::UpdateSwapOutQueueMock() { + if (swap_out_queue_mock_.empty()) { + return nullptr; + } + auto device_address = swap_out_queue_mock_.front(); + swap_out_queue_mock_.pop(); + return device_address; +} + +DeviceAddressPtr GPUMemCopyManager::UpdateSwapInQueueMock() { + if (swap_in_queue_mock_.empty()) { + return nullptr; + } + auto device_address = swap_in_queue_mock_.front(); + swap_in_queue_mock_.pop(); + return device_address; +} + bool GPUMemCopyManager::AllocHostPinnedMem(size_t size, void **addr) const { auto alloc_size = CudaDriver::AllocHostPinnedMem(size, addr); return alloc_size == size; @@ -126,6 +171,15 @@ void GPUMemCopyManager::ClearSwapQueue() { swap_in_queue_.pop(); } } + +void GPUMemCopyManager::ClearSwapQueueMock() { + while (!swap_out_queue_mock_.empty()) { + swap_out_queue_mock_.pop(); + } + while (!swap_in_queue_mock_.empty()) { + swap_in_queue_mock_.pop(); + } +} } // namespace gpu } // namespace device } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.h b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.h index 7a9cc551f8..067972a38f 100644 --- a/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.h +++ b/mindspore/ccsrc/runtime/device/gpu/gpu_memory_copy_manager.h @@ -40,7 +40,12 @@ class GPUMemCopyManager : public MemCopyManager { void AddMemSwapOutTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override; - void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr) override; + void AddMemSwapInTask(const DeviceAddressPtr &device_address, const HostAddress &host_addr, bool profiling, + float *cost_time) override; + + void AddMemSwapOutTaskMock(const DeviceAddressPtr &device_address) override; + + void AddMemSwapInTaskMock(const DeviceAddressPtr &device_address) override; bool SyncMemCopyStream(SwapKind swap_kind) override; @@ -48,17 +53,25 @@ class GPUMemCopyManager : public MemCopyManager { DeviceAddressPtr UpdateSwapInQueue() override; + DeviceAddressPtr UpdateSwapOutQueueMock() override; + + DeviceAddressPtr UpdateSwapInQueueMock() override; + bool AllocHostPinnedMem(size_t size, void **addr) const override; void FreeHostPinnedMem(void *addr) const override; void ClearSwapQueue() override; + void ClearSwapQueueMock() override; + private: DeviceStream swap_out_stream_{nullptr}; DeviceStream swap_in_stream_{nullptr}; std::queue> swap_out_queue_; std::queue> swap_in_queue_; + std::queue swap_out_queue_mock_; + std::queue swap_in_queue_mock_; }; using GPUMemCopyManagerPtr = std::shared_ptr; } // namespace gpu diff --git a/tests/st/networks/test_gpu_resnet.py b/tests/st/networks/test_gpu_resnet.py index 6bd947c712..d440c5cacb 100644 --- a/tests/st/networks/test_gpu_resnet.py +++ b/tests/st/networks/test_gpu_resnet.py @@ -355,7 +355,7 @@ def test_trainTensor(num_classes=10, epoch=8, batch_size=1): @pytest.mark.level0 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard -def test_trainTensor_big_batchSize(num_classes=10, epoch=8, batch_size=170): +def test_trainTensor_big_batchSize(num_classes=10, epoch=8, batch_size=338): net = resnet50(num_classes) lr = 0.1 momentum = 0.9