|
|
@@ -51,6 +51,10 @@ static const size_t PARAMETER_OUTPUT_INDEX = 0; |
|
|
bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } |
|
|
bool GPUKernelRuntime::SyncStream() { return GPUDeviceManager::GetInstance().SyncStream(stream_); } |
|
|
|
|
|
|
|
|
bool GPUKernelRuntime::Init() { |
|
|
bool GPUKernelRuntime::Init() { |
|
|
|
|
|
auto context_ptr = MsContext::GetInstance(); |
|
|
|
|
|
MS_EXCEPTION_IF_NULL(context_ptr); |
|
|
|
|
|
enable_relation_cache_ = context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL); |
|
|
|
|
|
|
|
|
if (device_init_ == true) { |
|
|
if (device_init_ == true) { |
|
|
GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); |
|
|
GPUMemoryAllocator::GetInstance().CheckMaxDeviceMemory(); |
|
|
return true; |
|
|
return true; |
|
|
@@ -282,7 +286,7 @@ void GPUKernelRuntime::AllocInplaceNodeMemory(const session::KernelGraph *graph) |
|
|
|
|
|
|
|
|
auto primitive = AnfAlgo::GetCNodePrimitive(item[0]); |
|
|
auto primitive = AnfAlgo::GetCNodePrimitive(item[0]); |
|
|
auto output_index = GetValue<uint32_t>(primitive->GetAttr("inplace_output_index")); |
|
|
auto output_index = GetValue<uint32_t>(primitive->GetAttr("inplace_output_index")); |
|
|
auto device_address = AnfAlgo::GetMutableOutputAddr(item[0], output_index, false); |
|
|
|
|
|
|
|
|
auto device_address = GetMutableOutputAddr(item[0], output_index, false); |
|
|
if (device_address->GetPtr() != nullptr) { |
|
|
if (device_address->GetPtr() != nullptr) { |
|
|
continue; |
|
|
continue; |
|
|
} |
|
|
} |
|
|
@@ -670,7 +674,7 @@ bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bo |
|
|
MS_EXCEPTION_IF_NULL(need_swap_kernel); |
|
|
MS_EXCEPTION_IF_NULL(need_swap_kernel); |
|
|
const HostAddress &host_address = |
|
|
const HostAddress &host_address = |
|
|
mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_); |
|
|
mem_swap_manager_->QueryKernelHostAddr(need_swap_kernel, mem_swap_info.output_idx_); |
|
|
auto device_address = AnfAlgo::GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false); |
|
|
|
|
|
|
|
|
auto device_address = GetMutableOutputAddr(need_swap_kernel, mem_swap_info.output_idx_, false); |
|
|
|
|
|
|
|
|
if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { |
|
|
if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) { |
|
|
if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) { |
|
|
if (mem_swap_manager_->QueryKernelHostAddrIsDirty(need_swap_kernel, mem_swap_info.output_idx_)) { |
|
|
@@ -821,10 +825,10 @@ bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &k |
|
|
DeviceAddressPtr device_address; |
|
|
DeviceAddressPtr device_address; |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
} else { |
|
|
} else { |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
// Get in-place output_address |
|
|
// Get in-place output_address |
|
|
@@ -833,7 +837,7 @@ bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &k |
|
|
auto input_index = GetValue<uint32_t>(primitive->GetAttr("aggregate_input_index")); |
|
|
auto input_index = GetValue<uint32_t>(primitive->GetAttr("aggregate_input_index")); |
|
|
if (i == input_index) { |
|
|
if (i == input_index) { |
|
|
auto skip_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(kernel), input_index); |
|
|
auto skip_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(kernel), input_index); |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(skip_node, 0, false); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(skip_node, 0, false); |
|
|
MS_LOG(INFO) << "[inplace optimizer] aggregate: " << kernel->DebugString() |
|
|
MS_LOG(INFO) << "[inplace optimizer] aggregate: " << kernel->DebugString() |
|
|
<< ", skip: " << skip_node->DebugString() << ", address: " << device_address->GetMutablePtr(); |
|
|
<< ", skip: " << skip_node->DebugString() << ", address: " << device_address->GetMutablePtr(); |
|
|
} |
|
|
} |
|
|
@@ -859,7 +863,7 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern |
|
|
UpdateHostSwapOutQueue(mock); |
|
|
UpdateHostSwapOutQueue(mock); |
|
|
auto output_sizes = kernel_mod.GetOutputSizeList(); |
|
|
auto output_sizes = kernel_mod.GetOutputSizeList(); |
|
|
for (size_t i = 0; i < output_sizes.size(); ++i) { |
|
|
for (size_t i = 0; i < output_sizes.size(); ++i) { |
|
|
auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
auto device_address = GetMutableOutputAddr(kernel, i, false); |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) { |
|
|
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) { |
|
|
return false; |
|
|
return false; |
|
|
@@ -929,10 +933,10 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN |
|
|
DeviceAddressPtr device_address; |
|
|
DeviceAddressPtr device_address; |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
} else { |
|
|
} else { |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
} |
|
|
} |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
if (device_address->ptr_ == nullptr) { |
|
|
if (device_address->ptr_ == nullptr) { |
|
|
@@ -958,7 +962,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf |
|
|
MS_EXCEPTION_IF_NULL(kernel_mod); |
|
|
MS_EXCEPTION_IF_NULL(kernel_mod); |
|
|
auto output_sizes = kernel_mod->GetOutputSizeList(); |
|
|
auto output_sizes = kernel_mod->GetOutputSizeList(); |
|
|
for (size_t i = 0; i < output_sizes.size(); ++i) { |
|
|
for (size_t i = 0; i < output_sizes.size(); ++i) { |
|
|
auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
auto device_address = GetMutableOutputAddr(kernel, i, false); |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
MS_EXCEPTION_IF_NULL(device_address); |
|
|
if (device_address->ptr_ == nullptr) { |
|
|
if (device_address->ptr_ == nullptr) { |
|
|
is_need_alloc_memory = true; |
|
|
is_need_alloc_memory = true; |
|
|
@@ -1013,7 +1017,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
auto kernel_with_index = AnfAlgo::GetPrevNodeOutput(kernel, i); |
|
|
|
|
|
|
|
|
auto kernel_with_index = GetPrevNodeOutput(kernel, i); |
|
|
if (AnfAlgo::IsCommunicationOp(kernel_with_index.first)) { |
|
|
if (AnfAlgo::IsCommunicationOp(kernel_with_index.first)) { |
|
|
continue; |
|
|
continue; |
|
|
} |
|
|
} |
|
|
@@ -1030,10 +1034,10 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) |
|
|
DeviceAddressPtr device_address; |
|
|
DeviceAddressPtr device_address; |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
if (mem_reuse_util_->is_all_nop_node()) { |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
// Graph may be all nop nodes and not remove nop node, so this can not skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, false); |
|
|
} else { |
|
|
} else { |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
// Graph may be "nop node + depend + node", the input of node is the depend, so this case need skip nop node. |
|
|
device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
|
|
|
|
|
|
device_address = GetPrevNodeMutableOutputAddr(kernel, i, true); |
|
|
} |
|
|
} |
|
|
mem_manager_->FreeMemFromMemPool(device_address); |
|
|
mem_manager_->FreeMemFromMemPool(device_address); |
|
|
device_address->set_status(DeviceAddressStatus::kInDevice); |
|
|
device_address->set_status(DeviceAddressStatus::kInDevice); |
|
|
@@ -1046,7 +1050,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) |
|
|
continue; |
|
|
continue; |
|
|
} |
|
|
} |
|
|
if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { |
|
|
if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) { |
|
|
auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false); |
|
|
|
|
|
|
|
|
auto device_address = GetMutableOutputAddr(kernel, i, false); |
|
|
mem_manager_->FreeMemFromMemPool(device_address); |
|
|
mem_manager_->FreeMemFromMemPool(device_address); |
|
|
device_address->set_status(DeviceAddressStatus::kInDevice); |
|
|
device_address->set_status(DeviceAddressStatus::kInDevice); |
|
|
} |
|
|
} |
|
|
@@ -1062,6 +1066,71 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel) |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
// Cached lookup of the mutable output address of the node that feeds input `i`.
// When the relation cache is disabled the query is forwarded straight to
// AnfAlgo; otherwise the first answer per (node, input, visit_nop_node) is
// memoized and replayed on later calls.
DeviceAddressPtr GPUKernelRuntime::GetPrevNodeMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node) {
  if (!enable_relation_cache_) {
    return AnfAlgo::GetPrevNodeMutableOutputAddr(node, i, visit_nop_node);
  }

  // Two separate caches are kept, chosen by whether nop nodes are visited.
  auto &cache = visit_nop_node ? prev_node_mut_output_addr_cache_ : prev_node_mut_output_addr_skip_nop_node_cache_;
  auto entry = cache.find(node);
  if (entry == cache.end()) {
    // First query for this node: one empty slot per input tensor.
    std::vector<DeviceAddressPtr> slots(AnfAlgo::GetInputTensorNum(node), nullptr);
    entry = cache.emplace(node, std::move(slots)).first;
  }

  auto &slot = entry->second[i];
  if (slot == nullptr) {
    // Cache miss for this input: resolve once via AnfAlgo and remember it.
    slot = AnfAlgo::GetPrevNodeMutableOutputAddr(node, i, visit_nop_node);
  }
  return slot;
}
|
|
|
|
|
|
|
|
|
|
|
// Cached lookup of output address `i` of `node`.
// Forwards directly to AnfAlgo when the relation cache is disabled; otherwise
// memoizes the first answer per (node, output, visit_nop_node) and replays it.
DeviceAddressPtr GPUKernelRuntime::GetMutableOutputAddr(const AnfNodePtr &node, size_t i, bool visit_nop_node) {
  if (!enable_relation_cache_) {
    return AnfAlgo::GetMutableOutputAddr(node, i, visit_nop_node);
  }

  // Two separate caches are kept, chosen by whether nop nodes are visited.
  auto &cache = visit_nop_node ? mut_output_addr_cache_ : mut_output_addr_skip_nop_node_cache_;
  auto entry = cache.find(node);
  if (entry == cache.end()) {
    // First query for this node: size the slot vector by the kernel's output count.
    auto kernel_mod = AnfAlgo::GetKernelMod(node);
    MS_EXCEPTION_IF_NULL(kernel_mod);
    std::vector<DeviceAddressPtr> slots(kernel_mod->GetOutputSizeList().size(), nullptr);
    entry = cache.emplace(node, std::move(slots)).first;
  }

  auto &slot = entry->second[i];
  if (slot == nullptr) {
    // Cache miss for this output: resolve once via AnfAlgo and remember it.
    slot = AnfAlgo::GetMutableOutputAddr(node, i, visit_nop_node);
  }
  return slot;
}
|
|
|
|
|
|
|
|
|
|
|
// Cached lookup of the (producer node, output index) pair feeding input `i`.
// Forwards directly to AnfAlgo when the relation cache is disabled; otherwise
// the first answer per (node, input) is memoized. A null producer pointer
// marks an unfilled slot.
session::KernelWithIndex GPUKernelRuntime::GetPrevNodeOutput(const AnfNodePtr &node, size_t i) {
  if (!enable_relation_cache_) {
    return AnfAlgo::GetPrevNodeOutput(node, i);
  }

  auto entry = prev_node_output_cache_.find(node);
  if (entry == prev_node_output_cache_.end()) {
    // First query for this node: one empty slot per input tensor.
    std::vector<session::KernelWithIndex> slots(AnfAlgo::GetInputTensorNum(node), {nullptr, 0});
    entry = prev_node_output_cache_.emplace(node, std::move(slots)).first;
  }

  auto &slot = entry->second[i];
  if (slot.first == nullptr) {
    // Cache miss for this input: resolve once via AnfAlgo and remember it.
    slot = AnfAlgo::GetPrevNodeOutput(node, i);
  }
  return slot;
}
|
|
} // namespace gpu |
|
|
} // namespace gpu |
|
|
} // namespace device |
|
|
} // namespace device |
|
|
} // namespace mindspore |
|
|
} // namespace mindspore |