/**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "runtime/device/kernel_adjust.h"
// NOTE(review): the header names of the next five includes, and the template argument lists throughout this
// file (e.g. `std::shared_ptr &`, `std::vector`, `std::make_shared(`), are missing in this view of the file —
// presumably text inside <...> was stripped. Tokens are kept as found; restore them from version control.
#include
#include
#include
#include
#include
#include "backend/common/session/anf_runtime_algorithm.h"
#include "include/common/utils/anfalgo.h"
#include "utils/ms_context.h"
#include "runtime/device/ms_device_shape_transfer.h"
#include "include/common/utils/config_manager.h"
#include "utils/ms_utils.h"
#include "kernel/kernel_build_info.h"
#include "include/common/utils/utils.h"
#include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
#include "runtime/base.h"
#include "plugin/device/ascend/hal/device/ascend_stream_manager.h"
#include "utils/shape_utils.h"
#ifndef ENABLE_SECURITY
#include "debug/data_dump/dump_json_parser.h"
#endif

namespace {
constexpr auto kGradients = "Gradients";
constexpr auto kSpecifyParameter = "accu_status";
// Single dimension used as the inferred output shape of the NPUAllocFloatStatus node.
constexpr size_t kNPUShape = 8;
constexpr size_t kLastHandleDiff = 2;
}  // namespace

namespace mindspore {
namespace device {
#ifndef ENABLE_SECURITY
using device::ascend::ProfilingUtils;
#endif

// Rewrites the graph's execution order so that all GetNext kernels come first. The relative order inside
// the GetNext group and inside the remaining kernels is preserved.
void KernelAdjust::ReorderGetNext(const std::shared_ptr &kernel_graph_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  const std::vector &origin_cnode_list = kernel_graph_ptr->execution_order();
  std::vector getnext_list;
  std::vector other_list;
  for (const auto &cnode : origin_cnode_list) {
    if (common::AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
      getnext_list.emplace_back(cnode);
    } else {
      other_list.emplace_back(cnode);
    }
  }
  std::vector new_order_list;
  new_order_list.insert(new_order_list.end(), getnext_list.begin(), getnext_list.end());
  new_order_list.insert(new_order_list.end(), other_list.begin(), other_list.end());
  kernel_graph_ptr->set_execution_order(new_order_list);
}

// Returns true only when task sink and loop sink are both enabled in the context and the configured
// iteration number is greater than one.
bool KernelAdjust::NeedLoopSink() {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  return (context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param(MS_CTX_ENABLE_LOOP_SINK) &&
          ConfigManager::GetInstance().iter_num() > 1);
}

// Creates a Send node (RT_KERNEL build info, "none" abstract) in the graph, tagged with the given event id.
CNodePtr KernelAdjust::CreateSendApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  auto send_op = std::make_shared(kSendOpName);
  MS_EXCEPTION_IF_NULL(send_op);
  auto send_apply = std::make_shared(send_op);
  MS_EXCEPTION_IF_NULL(send_apply);
  std::vector send_input_list = {send_apply};
  CNodePtr send_node_ptr = graph_ptr->NewCNode(send_input_list);
  MS_EXCEPTION_IF_NULL(send_node_ptr);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), send_node_ptr.get());
  common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), send_node_ptr);
  auto abstract_none = std::make_shared();
  MS_EXCEPTION_IF_NULL(abstract_none);
  send_node_ptr->set_abstract(abstract_none);
  return send_node_ptr;
}

// Creates a Recv node (RT_KERNEL build info, "none" abstract) in the graph, tagged with the given event id.
CNodePtr KernelAdjust::CreateRecvApplyKernel(const std::shared_ptr &graph_ptr, uint32_t event_id) {
  MS_EXCEPTION_IF_NULL(graph_ptr);
  auto recv_op = std::make_shared(kRecvOpName);
  MS_EXCEPTION_IF_NULL(recv_op);
  auto recv_apply = std::make_shared(recv_op);
  MS_EXCEPTION_IF_NULL(recv_apply);
  std::vector recv_input_list = {recv_apply};
  CNodePtr recv_node_ptr = graph_ptr->NewCNode(recv_input_list);
  MS_EXCEPTION_IF_NULL(recv_node_ptr);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), recv_node_ptr.get());
  common::AnfAlgo::SetNodeAttr(kAttrEventId, MakeValue(event_id), recv_node_ptr);
  auto abstract_none = std::make_shared();
  MS_EXCEPTION_IF_NULL(abstract_none);
  recv_node_ptr->set_abstract(abstract_none);
  return recv_node_ptr;
}

// Returns true if any kernel in the graph's execution order is a GetNext op.
bool KernelAdjust::ExistGetNext(const std::shared_ptr &kernel_graph_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  const std::vector &cnode_list = kernel_graph_ptr->execution_order();
  for (const auto &cnode : cnode_list) {
    if (common::AnfAlgo::GetCNodeName(cnode) == kGetNextOpName) {
      return true;
    }
  }
  return false;
}

// Returns true if the execution order contains an independent node that belongs to this graph
// (its graph id equals kernel_graph_ptr->graph_id()).
bool KernelAdjust::ExistIndependent(const std::shared_ptr &kernel_graph_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  const auto &exe_orders = kernel_graph_ptr->execution_order();
  for (const auto &node : exe_orders) {
    if (AnfAlgo::IsIndependentNode(node) && AnfAlgo::GetGraphId(node.get()) == kernel_graph_ptr->graph_id()) {
      MS_LOG(INFO) << "graph exit independent node";
      return true;
    }
  }
  return false;
}

// Appends a StreamSwitch of kind kIndependentStreamSwitch, placed on a newly applied stream, to exec_order.
void KernelAdjust::InsertIndepentParallel(const std::shared_ptr &kernel_graph_ptr,
                                          const std::map &switch_loop_input, std::vector *exec_order) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  CNodePtr independent_switch_app =
    CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kIndependentStreamSwitch);
  MS_EXCEPTION_IF_NULL(independent_switch_app);
  uint32_t independent_switch_stream_id = resource_manager.ApplyNewStream();
  AnfAlgo::SetStreamId(independent_switch_stream_id, independent_switch_app.get());
  common::AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), independent_switch_app);
  common::AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue(kIndependentStreamSwitch), independent_switch_app);
  (*exec_order).push_back(independent_switch_app);
  MS_LOG(INFO) << "Independent op loop insert Stream Switch " << independent_switch_app->fullname_with_scope();
}

// Applies two new streams (switch stream first, then the FpBp stream), reports them through the out
// parameters and appends the FpBp StreamSwitch whose true branch is the FpBp stream.
void KernelAdjust::InsertFpBpLoopStreamSwitch(const std::shared_ptr &kernel_graph_ptr,
                                              const std::map &switch_loop_input, std::vector *exec_order,
                                              uint32_t *fpbp_stream_id, uint32_t *fpbp_switch_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(fpbp_stream_id);
  MS_EXCEPTION_IF_NULL(fpbp_switch_stream_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *fpbp_switch_stream_id = resource_manager.ApplyNewStream();
  *fpbp_stream_id = resource_manager.ApplyNewStream();
  CNodePtr fpbp_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kFpBpStreamSwitch);
  MS_EXCEPTION_IF_NULL(fpbp_switch_app);
  AnfAlgo::SetStreamId(*fpbp_switch_stream_id, fpbp_switch_app.get());
  common::AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), fpbp_switch_app);
  // update fpbp loop stream switch true_branch_stream attr
  common::AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(*fpbp_stream_id), fpbp_switch_app);
  common::AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue(kFpBpStreamSwitch), fpbp_switch_app);
  (*exec_order).push_back(fpbp_switch_app);
  MS_LOG(INFO) << "FpBp loop insert Stream Switch " << fpbp_switch_app->fullname_with_scope();
}

// Splits the kernels after orders[order_index] into two lists: kernels carrying
// kAttrLabelForInsertStreamActive go to memcpy_list, everything else goes to other_list. An AtomicAddrClean
// kernel immediately preceding a labelled kernel is moved along with it into memcpy_list.
void KernelAdjust::CopyMemcpyList(const std::shared_ptr &kernel_graph_ptr, const std::vector &orders,
                                  size_t order_index, std::vector *memcpy_list, std::vector *other_list) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(memcpy_list);
  MS_EXCEPTION_IF_NULL(other_list);
  for (size_t idx = order_index + 1; idx < orders.size(); idx++) {
    const CNodePtr &cur_cnode = orders[idx];
    if (!common::AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, cur_cnode)) {
      (*other_list).emplace_back(cur_cnode);
      continue;
    }
    auto pre_node = orders[idx - 1];
    auto pre_kernel_name = common::AnfAlgo::GetCNodeName(pre_node);
    // Only move the preceding AtomicAddrClean when it really is the last entry of other_list. Popping
    // unconditionally is undefined on an empty list and would otherwise drop an unrelated kernel when the
    // predecessor was never pushed there (it is orders[order_index] or was itself a labelled kernel).
    if (pre_kernel_name == kAtomicAddrCleanOpName && !(*other_list).empty() && (*other_list).back() == pre_node) {
      (*other_list).pop_back();
      (*memcpy_list).push_back(pre_node);
    }
    (*memcpy_list).emplace_back(cur_cnode);
  }
}

// Appends a Recv for eos_done_event_id, placed on the FpBp stream.
void KernelAdjust::InsertEosDoneRecv(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                     uint32_t eos_done_event_id, uint32_t fpbp_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr eos_done_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_done_event_id);
  AnfAlgo::SetStreamId(fpbp_stream_id, eos_done_recv.get());
  (*exec_order).push_back(eos_done_recv);
  MS_LOG(INFO) << "FpBp loop insert EoS done Recv " << eos_done_recv->fullname_with_scope();
}

// Appends a StreamActive node whose active-stream list is getnext_active_streams.
void KernelAdjust::InsertGetNextLoopStreamActive(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                                 const std::vector &getnext_active_streams) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr getnext_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(getnext_active_app);
  common::AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(getnext_active_streams), getnext_active_app);
  (*exec_order).push_back(getnext_active_app);
  MS_LOG(INFO) << "FpBp loop insert GetNext loop Stream Active " << getnext_active_app->fullname_with_scope();
}

// Appends a Recv for fpbp_start_event_id, placed on the FpBp stream.
void KernelAdjust::InsertFpBpStartRecv(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                       uint32_t fpbp_start_event_id, uint32_t fpbp_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr fpbp_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, fpbp_start_event_id);
  AnfAlgo::SetStreamId(fpbp_stream_id, fpbp_start_recv.get());
  (*exec_order).push_back(fpbp_start_recv);
  MS_LOG(INFO) << "FpBp loop insert FpBp start Recv " << fpbp_start_recv->fullname_with_scope();
}

// Appends the AssignAdd on the "next loop" counter (cur_loop == false), placed on the FpBp stream.
void KernelAdjust::InsertNextLoopAssignAdd(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                           const std::map &switch_loop_input, uint32_t fpbp_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr assign_add_one = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, false);
  MS_EXCEPTION_IF_NULL(assign_add_one);
  AnfAlgo::SetStreamId(fpbp_stream_id, assign_add_one.get());
  (*exec_order).push_back(assign_add_one);
  MS_LOG(INFO) << "FpBp loop insert next loop AssignAdd " << assign_add_one->fullname_with_scope();
}

// Appends the AssignAdd on the "current loop" counter (cur_loop == true) and marks it with kAttrFpBpEnd.
// No stream id is assigned here.
void KernelAdjust::InsertCurrentLoopAssignAdd(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                              const std::map &switch_loop_input) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr cur_assign_add = CreateStreamAssignAddnOP(kernel_graph_ptr, switch_loop_input, true);
  MS_EXCEPTION_IF_NULL(cur_assign_add);
  common::AnfAlgo::SetNodeAttr(kAttrFpBpEnd, MakeValue(true), cur_assign_add);
  (*exec_order).push_back(cur_assign_add);
  MS_LOG(INFO) << "FpBp loop insert current loop AssignAdd " << cur_assign_add->fullname_with_scope();
}

// Appends a StreamActive node whose active-stream list is fpbp_active_streams.
void KernelAdjust::InsertFpBpAndEosLoopStreamActive(const std::shared_ptr &kernel_graph_ptr,
                                                    std::vector *exec_order,
                                                    const std::vector &fpbp_active_streams) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr fpbp_active_app = CreateStreamActiveOp(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(fpbp_active_app);
  common::AnfAlgo::SetNodeAttr(kAttrActiveStreamList, MakeValue>(fpbp_active_streams), fpbp_active_app);
  (*exec_order).push_back(fpbp_active_app);
  MS_LOG(INFO) << "FpBp loop insert FpBp loop and Eos loop Stream Active " << fpbp_active_app->fullname_with_scope();
}

// Applies two new streams (switch stream first, then the GetNext stream), reports them through the out
// parameters and appends the GetNext StreamSwitch whose true branch is the GetNext stream.
void KernelAdjust::InsertGetNextLoopStreamSwitch(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                                 uint32_t *getnext_switch_stream_id, uint32_t *getnext_stream_id,
                                                 const std::map &switch_loop_input) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(getnext_switch_stream_id);
  MS_EXCEPTION_IF_NULL(getnext_stream_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *getnext_switch_stream_id = resource_manager.ApplyNewStream();
  *getnext_stream_id = resource_manager.ApplyNewStream();
  CNodePtr getnext_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kGetNextStreamSwitch);
  MS_EXCEPTION_IF_NULL(getnext_switch_app);
  AnfAlgo::SetStreamId(*getnext_switch_stream_id, getnext_switch_app.get());
  // update getnext loop stream switch true_branch_stream attr
  common::AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), getnext_switch_app);
  common::AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(*getnext_stream_id), getnext_switch_app);
  common::AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue(kGetNextStreamSwitch), getnext_switch_app);
  (*exec_order).push_back(getnext_switch_app);
  MS_LOG(INFO) << "GetNext loop insert Stream Switch " << getnext_switch_app->fullname_with_scope();
}

// Copies kernels from orders[*order_index] onward into exec_order, assigning each the GetNext stream id,
// and stops right after copying the first GetNext kernel. On return *order_index is the index of that
// GetNext kernel, or orders.size() if none was found.
void KernelAdjust::SetBeforeGetNextStreamID(std::vector *exec_order, const std::vector &orders,
                                            size_t *order_index, CNodePtr getnext_cnode,
                                            uint32_t getnext_stream_id) {
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(order_index);
  for (; *order_index < orders.size(); (*order_index)++) {
    auto node = orders[*order_index];
    (*exec_order).push_back(node);
    AnfAlgo::SetStreamId(getnext_stream_id, (*exec_order).back().get());
    if (common::AnfAlgo::GetCNodeName(node) == kGetNextOpName) {
      // getnext_cnode is a by-value parameter, so this assignment is NOT visible to the caller; callers
      // must locate the GetNext kernel through *order_index instead.
      getnext_cnode = node;
      break;
    }
  }
}

// Applies a new event, reports it through fpbp_start_event_id and appends the matching Send on the
// GetNext stream.
void KernelAdjust::InsertGetNextLoopFpBpStartSend(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                                  uint32_t *fpbp_start_event_id, uint32_t getnext_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(fpbp_start_event_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *fpbp_start_event_id = resource_manager.ApplyNewEvent();
  CNodePtr fpbp_start_send = CreateSendApplyKernel(kernel_graph_ptr, *fpbp_start_event_id);
  AnfAlgo::SetStreamId(getnext_stream_id, fpbp_start_send.get());
  (*exec_order).push_back(fpbp_start_send);
  MS_LOG(INFO) << "GetNext loop insert FpBp start Send " << fpbp_start_send->fullname_with_scope();
}

// Applies a new event, reports it through eos_start_event_id and appends the matching Send on the
// GetNext stream.
void KernelAdjust::InsertGetNextLoopEosStartSend(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                                 uint32_t *eos_start_event_id, uint32_t getnext_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(eos_start_event_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *eos_start_event_id = resource_manager.ApplyNewEvent();
  CNodePtr eos_start_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_start_event_id);
  AnfAlgo::SetStreamId(getnext_stream_id, eos_start_send.get());
  (*exec_order).push_back(eos_start_send);
  MS_LOG(INFO) << "GetNext loop insert EoS start Send " << eos_start_send->fullname_with_scope();
}

// Applies two new streams (switch stream first, then the EoS stream), reports them through the out
// parameters and appends the EoS StreamSwitch whose true branch is the EoS stream.
void KernelAdjust::InsertEosStreamSwitch(const std::shared_ptr &kernel_graph_ptr, const std::map &switch_loop_input,
                                         std::vector *exec_order, uint32_t *eos_switch_stream_id,
                                         uint32_t *eos_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(eos_switch_stream_id);
  MS_EXCEPTION_IF_NULL(eos_stream_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *eos_switch_stream_id = resource_manager.ApplyNewStream();
  *eos_stream_id = resource_manager.ApplyNewStream();
  CNodePtr eos_switch_app = CreateStreamSwitchOp(kernel_graph_ptr, switch_loop_input, kEosStreamSwitch);
  MS_EXCEPTION_IF_NULL(eos_switch_app);
  AnfAlgo::SetStreamId(*eos_switch_stream_id, eos_switch_app.get());
  common::AnfAlgo::SetNodeAttr(kStreamNeedActivedFirst, MakeValue(true), eos_switch_app);
  // update eos loop stream switch true_branch_stream attr
  common::AnfAlgo::SetNodeAttr(kAttrTrueBranchStream, MakeValue(*eos_stream_id), eos_switch_app);
  common::AnfAlgo::SetNodeAttr(kAttrStreamSwitchKind, MakeValue(kEosStreamSwitch), eos_switch_app);
  (*exec_order).push_back(eos_switch_app);
  MS_LOG(INFO) << "EoS loop insert Stream Switch " << eos_switch_app->fullname_with_scope();
}

// Appends a Recv for eos_start_event_id, placed on the EoS stream.
void KernelAdjust::InsertGetNextLoopEosStartRecv(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                                 uint32_t eos_start_event_id, uint32_t eos_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  CNodePtr eos_start_recv = CreateRecvApplyKernel(kernel_graph_ptr, eos_start_event_id);
  AnfAlgo::SetStreamId(eos_stream_id, eos_start_recv.get());
  (*exec_order).push_back(eos_start_recv);
  MS_LOG(INFO) << "EoS loop insert EoS Recv " << eos_start_recv->fullname_with_scope();
}

// Appends an EndOfSequence node fed by the given GetNext kernel, placed on the EoS stream.
// Raises if getnext_cnode is null.
void KernelAdjust::InsertEosOp(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                               const CNodePtr &getnext_cnode, uint32_t eos_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(getnext_cnode);
  CNodePtr end_of_sequence_op = CreateEndOfSequenceOP(kernel_graph_ptr, getnext_cnode);
  MS_EXCEPTION_IF_NULL(end_of_sequence_op);
  AnfAlgo::SetStreamId(eos_stream_id, end_of_sequence_op.get());
  (*exec_order).push_back(end_of_sequence_op);
  MS_LOG(INFO) << "EoS loop insert Eos Op " << end_of_sequence_op->fullname_with_scope();
}

// Applies a new event, reports it through eos_done_event_id and appends the matching Send on the EoS stream.
void KernelAdjust::InsertEosDoneSend(const std::shared_ptr &kernel_graph_ptr, std::vector *exec_order,
                                     uint32_t *eos_done_event_id, uint32_t eos_stream_id) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(exec_order);
  MS_EXCEPTION_IF_NULL(eos_done_event_id);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  *eos_done_event_id = resource_manager.ApplyNewEvent();
  CNodePtr eos_done_send = CreateSendApplyKernel(kernel_graph_ptr, *eos_done_event_id);
  AnfAlgo::SetStreamId(eos_stream_id, eos_done_send.get());
  (*exec_order).push_back(eos_done_send);
  MS_LOG(INFO) << "EoS loop insert EoS done Send " << eos_done_send->fullname_with_scope();
}

// Rebuilds the graph's execution order for loop sink.
//
// Always resets the stream manager's resources first. Returns without touching the graph when loop sink is
// not needed or when the graph is dynamic shape. Otherwise the new order is assembled as:
//   1. (GetNext present) GetNext StreamSwitch, kernels up to and including GetNext, FpBp-start Send,
//      and in EoS mode an EoS-start Send;
//   2. (EoS mode) EoS StreamSwitch, EoS-start Recv, EndOfSequence op, EoS-done Send;
//   3. (independent nodes present) independent StreamSwitch;
//   4. FpBp StreamSwitch, optional FpBp-start Recv, next-loop AssignAdd;
//   5. (GetNext present) labelled memcpy kernels, optional EoS-done Recv, GetNext StreamActive;
//   6. remaining kernels, current-loop AssignAdd, FpBp/EoS StreamActive.
// EoS mode means iter_num() == INT32_MAX and a GetNext kernel exists.
void KernelAdjust::ProcessLoopSink(const std::shared_ptr &kernel_graph_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  device::ascend::AscendStreamMng &resource_manager = device::ascend::AscendStreamMng::GetInstance();
  resource_manager.ResetResource();
  if (!NeedLoopSink()) {
    return;
  }
  if (kernel_graph_ptr->is_dynamic_shape()) {
    MS_LOG(INFO) << "KernelGraph:" << kernel_graph_ptr->graph_id() << " is dynamic shape, skip ProcessLoopSink";
    return;
  }
  bool exist_getnext = ExistGetNext(kernel_graph_ptr);
  bool eos_mode = ConfigManager::GetInstance().iter_num() == INT32_MAX && exist_getnext;
  MS_LOG(INFO) << "GetNext exist:" << exist_getnext << " End of Sequence mode:" << eos_mode
               << " iter num:" << ConfigManager::GetInstance().iter_num();
  if (exist_getnext) {
    ReorderGetNext(kernel_graph_ptr);
  }
  auto switch_loop_input = kernel_graph_ptr->device_loop_control_params();

  const std::vector &orders = kernel_graph_ptr->execution_order();
  if (orders.empty()) {
    MS_LOG(EXCEPTION) << "graph " << kernel_graph_ptr->graph_id() << " execution order is empty";
  }
  std::vector exec_order;
  CNodePtr getnext_cnode;
  uint32_t getnext_switch_stream_id = UINT32_MAX;
  uint32_t fpbp_start_event_id = UINT32_MAX;
  uint32_t eos_start_event_id = UINT32_MAX;
  uint32_t getnext_stream_id = UINT32_MAX;
  size_t order_index = 0;
  if (exist_getnext) {
    InsertGetNextLoopStreamSwitch(kernel_graph_ptr, &exec_order, &getnext_switch_stream_id, &getnext_stream_id,
                                  switch_loop_input);
    SetBeforeGetNextStreamID(&exec_order, orders, &order_index, getnext_cnode, getnext_stream_id);
    // SetBeforeGetNextStreamID receives getnext_cnode by value and therefore cannot hand the node back.
    // It stops with order_index on the GetNext kernel, so fetch it here; otherwise getnext_cnode stays
    // null and InsertEosOp raises in EoS mode.
    if (order_index < orders.size()) {
      getnext_cnode = orders[order_index];
    }
    InsertGetNextLoopFpBpStartSend(kernel_graph_ptr, &exec_order, &fpbp_start_event_id, getnext_stream_id);
    if (eos_mode) {
      InsertGetNextLoopEosStartSend(kernel_graph_ptr, &exec_order, &eos_start_event_id, getnext_stream_id);
    }
  }

  uint32_t eos_switch_stream_id = UINT32_MAX;
  uint32_t eos_stream_id = UINT32_MAX;
  uint32_t eos_done_event_id = UINT32_MAX;
  std::vector fpbp_active_streams;
  if (eos_mode) {
    InsertEosStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &eos_switch_stream_id, &eos_stream_id);
    InsertGetNextLoopEosStartRecv(kernel_graph_ptr, &exec_order, eos_start_event_id, eos_stream_id);
    InsertEosOp(kernel_graph_ptr, &exec_order, getnext_cnode, eos_stream_id);
    InsertEosDoneSend(kernel_graph_ptr, &exec_order, &eos_done_event_id, eos_stream_id);
    fpbp_active_streams.push_back(eos_switch_stream_id);
  }

  bool exist_independent = ExistIndependent(kernel_graph_ptr);
  if (exist_independent) {
    InsertIndepentParallel(kernel_graph_ptr, switch_loop_input, &exec_order);
  }

  uint32_t fpbp_stream_id = UINT32_MAX;
  uint32_t fpbp_switch_stream_id = UINT32_MAX;
  InsertFpBpLoopStreamSwitch(kernel_graph_ptr, switch_loop_input, &exec_order, &fpbp_stream_id,
                             &fpbp_switch_stream_id);
  if (exist_getnext) {
    InsertFpBpStartRecv(kernel_graph_ptr, &exec_order, fpbp_start_event_id, fpbp_stream_id);
  }
  InsertNextLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input, fpbp_stream_id);

  std::vector memcpy_list;
  std::vector other_list;
  if (exist_getnext) {
    CopyMemcpyList(kernel_graph_ptr, orders, order_index, &memcpy_list, &other_list);
    (void)std::copy(memcpy_list.begin(), memcpy_list.end(), std::back_inserter(exec_order));
  } else {
    other_list = orders;
  }

  if (eos_mode) {
    InsertEosDoneRecv(kernel_graph_ptr, &exec_order, eos_done_event_id, fpbp_stream_id);
  }
  std::vector getnext_active_streams;
  if (exist_getnext) {
    // small loop active
    getnext_active_streams.push_back(getnext_switch_stream_id);
    InsertGetNextLoopStreamActive(kernel_graph_ptr, &exec_order, getnext_active_streams);
  }

  (void)std::copy(other_list.begin(), other_list.end(), std::back_inserter(exec_order));
  InsertCurrentLoopAssignAdd(kernel_graph_ptr, &exec_order, switch_loop_input);
  // big loop active
  fpbp_active_streams.push_back(fpbp_switch_stream_id);
  InsertFpBpAndEosLoopStreamActive(kernel_graph_ptr, &exec_order, fpbp_active_streams);
  kernel_graph_ptr->set_execution_order(exec_order);
}

// Returns a build-info builder preset with the given input formats and device types, OPAQUE fusion,
// AICORE processor and RT_KERNEL kernel type.
kernel::KernelBuildInfo::KernelBuildInfoBuilder KernelAdjust::CreateMngKernelBuilder(
  const std::vector &formats, const std::vector &type_ids) {
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetInputsFormat(formats);
  selected_kernel_builder.SetInputsDeviceType(type_ids);
  selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  selected_kernel_builder.SetKernelType(KernelType::RT_KERNEL);
  return selected_kernel_builder;
}

// Creates a StreamSwitch node. For every known kind the inputs are the next-loop counter followed by the
// const loop-num-in-epoch parameter; for an unknown kind an error is logged and only the latter is added.
// switch_loop_input.at() throws if a required entry is missing.
CNodePtr KernelAdjust::CreateStreamSwitchOp(const std::shared_ptr &kernel_graph_ptr,
                                            const std::map &switch_loop_input, StreamSwitchKind kind) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
    {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  auto typeNone_abstract = std::make_shared();
  auto stream_switch = std::make_shared(kStreamSwitchOpName);
  std::vector inputs;
  inputs.push_back(NewValueNode(stream_switch));
  // All four known kinds compare the same counter, so they share one branch.
  if (kind == kFpBpStreamSwitch || kind == kEosStreamSwitch || kind == kGetNextStreamSwitch ||
      kind == kIndependentStreamSwitch) {
    inputs.push_back(switch_loop_input.at(kNextLoopCountName));
  } else {
    MS_LOG(ERROR) << "unknown stream switch kind: " << kind;
  }

  inputs.push_back(switch_loop_input.at(kConstLoopNumInEpochName));
  CNodePtr stream_switch_app = kernel_graph_ptr->NewCNode(inputs);
  MS_EXCEPTION_IF_NULL(stream_switch_app);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_switch_app.get());
  stream_switch_app->set_abstract(typeNone_abstract);
  // set attr: cond_ RT_LESS_OR_EQUAL
  int condition = static_cast(RT_LESS_OR_EQUAL);
  ValuePtr cond = MakeValue(condition);
  common::AnfAlgo::SetNodeAttr(kAttrSwitchCondition, cond, stream_switch_app);
  // set attr:data_type
  int data_type = static_cast(RT_SWITCH_INT64);
  ValuePtr dt = MakeValue(data_type);
  common::AnfAlgo::SetNodeAttr(kAttrDataType, dt, stream_switch_app);
  return stream_switch_app;
}

// Creates a StreamActive node with no data inputs. The caller is expected to set the active-stream list.
CNodePtr KernelAdjust::CreateStreamActiveOp(const std::shared_ptr &kernel_graph_ptr) {
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
    {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  abstract::AbstractBasePtr typeNone_abstract = std::make_shared();
  auto stream_active_others = std::make_shared(kStreamActiveOpName);
  std::vector inputs;
  inputs.push_back(NewValueNode(stream_active_others));
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  CNodePtr stream_active_others_app = kernel_graph_ptr->NewCNode(inputs);
  MS_EXCEPTION_IF_NULL(stream_active_others_app);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), stream_active_others_app.get());
  stream_active_others_app->set_abstract(typeNone_abstract);
  return stream_active_others_app;
}

// Creates TupleGetItem(node, output_idx) in the graph. The new node inherits node's scope and the inferred
// type and shape of node's output output_idx.
CNodePtr KernelAdjust::CreatTupleGetItemNode(const std::shared_ptr &kernel_graph_ptr, const CNodePtr &node,
                                             size_t output_idx) {
  // Both pointers are dereferenced below; check them like the sibling factory functions do.
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(node);
  auto idx = NewValueNode(SizeToLong(output_idx));
  MS_EXCEPTION_IF_NULL(idx);
  auto imm = std::make_shared(SizeToInt(output_idx));
  auto abstract_scalar = std::make_shared(imm);
  idx->set_abstract(abstract_scalar);
  CNodePtr tuple_getitem = kernel_graph_ptr->NewCNode({NewValueNode(prim::kPrimTupleGetItem), node, idx});
  MS_EXCEPTION_IF_NULL(tuple_getitem);
  tuple_getitem->set_scope(node->scope());
  std::vector origin_shape = common::AnfAlgo::GetOutputInferShape(node, output_idx);
  TypeId origin_type = common::AnfAlgo::GetOutputInferDataType(node, output_idx);
  common::AnfAlgo::SetOutputInferTypeAndShape({origin_type}, {origin_shape}, tuple_getitem.get());
  return tuple_getitem;
}

// Creates an EndOfSequence node (AICPU kernel, uint8 in/out) whose single input is output 0 of the given
// GetNext kernel. The node's abstract is copied from that TupleGetItem.
CNodePtr KernelAdjust::CreateEndOfSequenceOP(const std::shared_ptr &kernel_graph_ptr,
                                             const CNodePtr &getnext_cnode) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetInputsDeviceType({kNumberTypeUInt8});

  selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  selected_kernel_builder.SetProcessor(kernel::Processor::AICPU);
  selected_kernel_builder.SetKernelType(KernelType::AICPU_KERNEL);

  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeUInt8});
  // EndOfSequence
  auto end_of_sequence = std::make_shared(kEndOfSequence);
  std::vector inputs;
  inputs.push_back(NewValueNode(end_of_sequence));
  // GetNext output 0 is EndOfSequence's input
  auto tuple_get_item = CreatTupleGetItemNode(kernel_graph_ptr, getnext_cnode, 0);
  inputs.push_back(tuple_get_item);
  CNodePtr end_of_sequence_node = kernel_graph_ptr->NewCNode(inputs);
  MS_EXCEPTION_IF_NULL(end_of_sequence_node);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), end_of_sequence_node.get());
  std::vector input_names = {"x"};
  ValuePtr input_names_v = MakeValue(input_names);
  common::AnfAlgo::SetNodeAttr("input_names", input_names_v, end_of_sequence_node);
  std::vector output_names = {"y"};
  ValuePtr output_names_v = MakeValue(output_names);
  common::AnfAlgo::SetNodeAttr("output_names", output_names_v, end_of_sequence_node);
  end_of_sequence_node->set_abstract(tuple_get_item->abstract());
  return end_of_sequence_node;
}

// Creates AssignAdd(counter, const-one), where counter is the current-loop counter when cur_loop is true
// and the next-loop counter otherwise. The node's abstract is always taken from the current-loop counter,
// and the node is registered in the graph's ref-node map against its first input.
CNodePtr KernelAdjust::CreateStreamAssignAddnOP(const std::shared_ptr &kernel_graph_ptr,
                                                const std::map &switch_loop_input, bool cur_loop) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
    {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeInt32, TypeId::kNumberTypeInt32});
  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeInt32});
  // AssignAdd
  auto assign_add = std::make_shared(kAssignAddOpName);
  std::vector inputs;
  inputs.push_back(NewValueNode(assign_add));
  if (cur_loop) {
    inputs.push_back(switch_loop_input.at(kCurLoopCountName));
  } else {
    inputs.push_back(switch_loop_input.at(kNextLoopCountName));
  }

  inputs.push_back(switch_loop_input.at(kConstOneName));
  CNodePtr assign_add_one = kernel_graph_ptr->NewCNode(inputs);
  MS_EXCEPTION_IF_NULL(assign_add_one);
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_one.get());
  std::vector input_names = {"ref", "value"};
  std::vector output_names = {"output"};
  ValuePtr input_names_v = MakeValue(input_names);
  ValuePtr output_names_v = MakeValue(output_names);
  common::AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_add_one);
  common::AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_one);
  // NOTE(review): this runs after Build() was already attached to the node; it only takes effect if Build()
  // hands out the builder's own shared build-info object — confirm before reordering.
  selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  MS_EXCEPTION_IF_NULL(switch_loop_input.at(kCurLoopCountName));
  assign_add_one->set_abstract(switch_loop_input.at(kCurLoopCountName)->abstract());
  // add AssignAdd op to kernel ref node map
  session::AnfWithOutIndex final_pair = std::make_pair(assign_add_one, 0);
  session::KernelWithIndex kernel_with_index =
    common::AnfAlgo::VisitKernel(common::AnfAlgo::GetInputNode(assign_add_one, 0), 0);
  kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index);
  return assign_add_one;
}

#ifndef ENABLE_SECURITY
// Inserts profiling kernels into the graph when profiling is initialized and a valid trace can be
// generated; otherwise logs and returns.
void KernelAdjust::Profiling(NotNull kernel_graph_ptr) {
  if (!ascend::ProfilingManager::GetInstance().IsProfilingInitialized()) {
    MS_LOG(INFO) << "No need to profiling";
    return;
  }
  ProfilingTraceInfo profiling_trace_info = ProfilingUtils::GenerateProfilingTrace(*kernel_graph_ptr);
  if (!profiling_trace_info.IsValid()) {
    MS_LOG(INFO) << "[profiling] no profiling node found!";
    return;
  }
  InsertProfilingKernel(profiling_trace_info, kernel_graph_ptr);
}

// Rebuilds the execution order, giving ProfilingUtils the chance to insert a trace-Fp kernel before each
// kernel and custom-op / Bp-end / iter-end kernels after it. Does nothing when the trace info is invalid
// or the graph has no kernels.
void KernelAdjust::InsertProfilingKernel(const ProfilingTraceInfo &profiling_trace_info,
                                         NotNull kernel_graph_ptr) {
  MS_LOG(INFO) << "[profiling] Insert profiling kernel start";
  if (!profiling_trace_info.IsValid()) {
    MS_LOG(WARNING) << "Profiling trace point not found";
    return;
  }
  std::vector new_cnode_list;
  std::vector cnode_ptr_list = kernel_graph_ptr->execution_order();
  if (cnode_ptr_list.empty()) {
    MS_LOG(ERROR) << "No CNode in graph " << kernel_graph_ptr->graph_id();
    return;
  }
  for (const auto &cnode_ptr : cnode_ptr_list) {
    ProfilingUtils::InsertProfilingTraceFp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
                                           NOT_NULL(&new_cnode_list));
    new_cnode_list.emplace_back(cnode_ptr);
    ProfilingUtils::InsertProfilingCustomOp(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
                                            NOT_NULL(&new_cnode_list));
    ProfilingUtils::InsertProfilingTraceBpEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
                                              NOT_NULL(&new_cnode_list));
    ProfilingUtils::InsertProfilingTraceIterEnd(cnode_ptr, profiling_trace_info, kernel_graph_ptr,
                                                NOT_NULL(&new_cnode_list));
  }
  kernel_graph_ptr->set_execution_order(new_cnode_list);
}
#endif

// Creates an NPUGetFloatStatus node (TBE kernel, float32 in/out) that takes the alloc node as input and
// shares its abstract.
CNodePtr KernelAdjust::CreateNPUGetFloatStatus(const std::shared_ptr &kernel_graph_ptr,
                                               const CNodePtr &npu_alloc_cnode) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  auto npu_get_primitive = std::make_shared(kNPUGetFloatStatusOpName);
  std::vector npu_get_inputs = {NewValueNode(npu_get_primitive), npu_alloc_cnode};
  auto npu_get_cnode = kernel_graph_ptr->NewCNode(npu_get_inputs);
  MS_EXCEPTION_IF_NULL(npu_get_cnode);
  // NOTE(review): the scope is set on the alloc node, not on the newly created node — possibly a
  // copy-paste slip (other factories in this file scope their own new node); confirm intent.
  npu_alloc_cnode->set_scope(kDefaultScope);
  npu_get_cnode->set_abstract(npu_alloc_cnode->abstract());

  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
  selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_get_cnode.get());
  return npu_get_cnode;
}

// Creates an NPUClearFloatStatus node (TBE kernel, float32 in/out) that takes the alloc node as input and
// shares its abstract.
CNodePtr KernelAdjust::CreateNPUClearStatus(const std::shared_ptr &kernel_graph_ptr,
                                            const CNodePtr &npu_alloc_cnode) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  auto npu_clear_primitive = std::make_shared(kNPUClearFloatStatusOpName);
  std::vector npu_clear_inputs = {NewValueNode(npu_clear_primitive), npu_alloc_cnode};
  auto npu_clear_cnode = kernel_graph_ptr->NewCNode(npu_clear_inputs);
  MS_EXCEPTION_IF_NULL(npu_clear_cnode);
  // NOTE(review): the scope is set on the alloc node, not on the newly created node — possibly a
  // copy-paste slip (other factories in this file scope their own new node); confirm intent.
  npu_alloc_cnode->set_scope(kDefaultScope);
  npu_clear_cnode->set_abstract(npu_alloc_cnode->abstract());

  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetInputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetInputsDeviceType({kNumberTypeFloat32});
  selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_clear_cnode.get());
  return npu_clear_cnode;
}

// Creates an NPUAllocFloatStatus node (TBE kernel, no inputs) in the default scope with an inferred
// float32 output of shape {kNPUShape}.
CNodePtr KernelAdjust::CreateNPUAllocStatus(const std::shared_ptr &kernel_graph_ptr) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  // create npu_alloc_cnode
  auto npu_alloc_primitive = std::make_shared(kNPUAllocFloatStatusOpName);
  std::vector npu_alloc_inputs = {NewValueNode(npu_alloc_primitive)};
  auto npu_alloc_cnode = kernel_graph_ptr->NewCNode(npu_alloc_inputs);
  MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  npu_alloc_cnode->set_scope(kDefaultScope);
  std::vector npu_output_shape = {kNPUShape};
  common::AnfAlgo::SetOutputInferTypeAndShape({kNumberTypeFloat32}, {npu_output_shape}, npu_alloc_cnode.get());

  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder;
  selected_kernel_builder.SetFusionType(kernel::FusionType::OPAQUE);
  selected_kernel_builder.SetProcessor(kernel::Processor::AICORE);
  selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL);
  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), npu_alloc_cnode.get());
  return npu_alloc_cnode;
}

CNodePtr KernelAdjust::CreateAssignAdd(const std::shared_ptr &kernel_graph_ptr, const CNodePtr &npu_alloc_cnode,
                                       const AnfNodePtr &specify_para) {
  MS_EXCEPTION_IF_NULL(kernel_graph_ptr);
  MS_EXCEPTION_IF_NULL(npu_alloc_cnode);
  MS_EXCEPTION_IF_NULL(specify_para);
  auto assign_add_primitive = std::make_shared(kAssignAddOpName);
  std::vector assign_add_inputs = {NewValueNode(assign_add_primitive), specify_para, npu_alloc_cnode};
  auto assign_add_cnode = kernel_graph_ptr->NewCNode(assign_add_inputs);
  MS_EXCEPTION_IF_NULL(assign_add_cnode);
  assign_add_cnode->set_scope(kDefaultScope);
  assign_add_cnode->set_abstract(specify_para->abstract());
  kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder(
    {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, TypeId::kNumberTypeFloat32});
  selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT});
  selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32});
  AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_add_cnode.get());
  std::vector input_names = {"ref", "value"};
  std::vector output_names = {"output"};
  ValuePtr input_names_v = MakeValue(input_names);
  ValuePtr output_names_v = MakeValue(output_names);
  common::AnfAlgo::SetNodeAttr("input_names",
input_names_v, assign_add_cnode); common::AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_add_cnode); selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); session::AnfWithOutIndex final_pair = std::make_pair(assign_add_cnode, 0); session::KernelWithIndex kernel_with_index = common::AnfAlgo::VisitKernel(common::AnfAlgo::GetInputNode(assign_add_cnode, 0), 0); kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index); return assign_add_cnode; } CNodePtr KernelAdjust::CreateAssign(const std::shared_ptr &kernel_graph_ptr, const AnfNodePtr &specify_para) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); MS_EXCEPTION_IF_NULL(specify_para); std::vector reset(kNPUShape, 0.0); ShapeVector reset_shape({static_cast(kNPUShape)}); auto shp_buf_size = sizeof(float) * reset.size(); auto reset_tensor = std::make_shared(kNumberTypeFloat32, reset_shape, reset.data(), shp_buf_size); auto reset_value_node = std::make_shared(reset_tensor); MS_EXCEPTION_IF_NULL(reset_value_node); reset_value_node->set_abstract(specify_para->abstract()); kernel_graph_ptr->AddValueNodeToGraph(reset_value_node); auto kernel_info = std::make_shared(); MS_EXCEPTION_IF_NULL(kernel_info); reset_value_node->set_kernel_info(kernel_info); kernel::KernelBuildInfo::KernelBuildInfoBuilder builder1; builder1.SetOutputsFormat({kOpFormat_DEFAULT}); builder1.SetOutputsDeviceType({kNumberTypeFloat32}); AnfAlgo::SetSelectKernelBuildInfo(builder1.Build(), reset_value_node.get()); auto assign_primitive = std::make_shared(kAssignOpName); std::vector assign_inputs = {NewValueNode(assign_primitive), specify_para, reset_value_node}; auto assign_cnode = kernel_graph_ptr->NewCNode(assign_inputs); MS_EXCEPTION_IF_NULL(assign_cnode); assign_cnode->set_scope(kDefaultScope); assign_cnode->set_abstract(specify_para->abstract()); kernel::KernelBuildInfo::KernelBuildInfoBuilder selected_kernel_builder = CreateMngKernelBuilder( {kOpFormat_DEFAULT, kOpFormat_DEFAULT}, {TypeId::kNumberTypeFloat32, 
TypeId::kNumberTypeFloat32}); selected_kernel_builder.SetOutputsFormat({kOpFormat_DEFAULT}); selected_kernel_builder.SetOutputsDeviceType({kNumberTypeFloat32}); AnfAlgo::SetSelectKernelBuildInfo(selected_kernel_builder.Build(), assign_cnode.get()); std::vector input_names = {"ref", "value"}; std::vector output_names = {"output"}; ValuePtr input_names_v = MakeValue(input_names); ValuePtr output_names_v = MakeValue(output_names); common::AnfAlgo::SetNodeAttr("input_names", input_names_v, assign_cnode); common::AnfAlgo::SetNodeAttr("output_names", output_names_v, assign_cnode); selected_kernel_builder.SetKernelType(KernelType::TBE_KERNEL); session::AnfWithOutIndex final_pair = std::make_pair(assign_cnode, 0); session::KernelWithIndex kernel_with_index = common::AnfAlgo::VisitKernel(common::AnfAlgo::GetInputNode(assign_cnode, 0), 0); kernel_graph_ptr->AddRefCorrespondPairs(final_pair, kernel_with_index); return assign_cnode; } void KernelAdjust::InsertOverflowCheckOperations(const std::shared_ptr &kernel_graph_ptr) { MS_LOG(INFO) << "Start Insert Overflow Check Operations."; MS_EXCEPTION_IF_NULL(kernel_graph_ptr); auto parameters = kernel_graph_ptr->parameters(); AnfNodePtr specify_para; bool not_find = true; for (size_t i = 0; i < parameters.size(); i++) { auto para_fullname = parameters[i]->fullname_with_scope(); if (para_fullname.find(kSpecifyParameter) != std::string::npos) { not_find = false; specify_para = parameters[i]; break; } } if (not_find) { MS_LOG(INFO) << "Not find parameter named " << kSpecifyParameter; return; } bool first_grad_op = true; CNodePtr npu_alloc_cnode; std::vector new_execution_order; auto execution_order = kernel_graph_ptr->execution_order(); for (size_t i = 0; i < execution_order.size() - 1; i++) { new_execution_order.push_back(execution_order[i]); auto cur_full_name = execution_order[i]->fullname_with_scope(); auto next_full_name = execution_order[i + 1]->fullname_with_scope(); auto cur_stream_id = 
AnfAlgo::GetStreamId(execution_order[i]); auto next_stream_id = AnfAlgo::GetStreamId(execution_order[i + 1]); if (cur_full_name.find(kGradients) == std::string::npos && next_full_name.find(kGradients) != std::string::npos) { if (first_grad_op) { npu_alloc_cnode = CreateNPUAllocStatus(kernel_graph_ptr); auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode); auto assign_cnode = CreateAssign(kernel_graph_ptr, specify_para); AnfAlgo::SetStreamId(next_stream_id, npu_alloc_cnode.get()); AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get()); AnfAlgo::SetStreamId(next_stream_id, assign_cnode.get()); new_execution_order.push_back(npu_alloc_cnode); new_execution_order.push_back(npu_clear_cnode); new_execution_order.push_back(assign_cnode); first_grad_op = false; } else { auto npu_clear_cnode = CreateNPUClearStatus(kernel_graph_ptr, npu_alloc_cnode); AnfAlgo::SetStreamId(next_stream_id, npu_clear_cnode.get()); new_execution_order.push_back(npu_clear_cnode); } } if (cur_full_name.find(kGradients) != std::string::npos && next_full_name.find(kGradients) == std::string::npos) { auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode); auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para); AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get()); AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get()); new_execution_order.push_back(npu_get_cnode); new_execution_order.push_back(assign_add_cnode); } if (i == execution_order.size() - kLastHandleDiff) { new_execution_order.push_back(execution_order[i + 1]); if (next_full_name.find(kGradients) != std::string::npos) { auto npu_get_cnode = CreateNPUGetFloatStatus(kernel_graph_ptr, npu_alloc_cnode); auto assign_add_cnode = CreateAssignAdd(kernel_graph_ptr, npu_alloc_cnode, specify_para); AnfAlgo::SetStreamId(cur_stream_id, npu_get_cnode.get()); AnfAlgo::SetStreamId(cur_stream_id, assign_add_cnode.get()); 
new_execution_order.push_back(npu_get_cnode); new_execution_order.push_back(assign_add_cnode); } } } kernel_graph_ptr->set_execution_order(new_execution_order); } // device loop control std::shared_ptr KernelAdjust::CreateTensor(int32_t initial_value) { ShapeVector shp = {1}; tensor::TensorPtr tensor = std::make_shared(kInt32->type_id(), shp); MS_EXCEPTION_IF_NULL(tensor); auto val = static_cast(tensor->data_c()); MS_EXCEPTION_IF_NULL(val); *val = initial_value; return tensor; } std::shared_ptr KernelAdjust::CreateParameter(const std::shared_ptr &kernel_graph_ptr, const string parameter_name) { ShapeVector shp = {1}; tensor::TensorPtr tensor_ptr = std::make_shared(kInt32->type_id(), shp); MS_EXCEPTION_IF_NULL(tensor_ptr); mindspore::abstract::AbstractBasePtr parameter_abstract_ptr = tensor_ptr->ToAbstract(); if (parameter_abstract_ptr == nullptr) { MS_LOG(EXCEPTION) << "Create abstract for device loop control failed!"; } ParameterPtr param = std::make_shared(kernel_graph_ptr); MS_EXCEPTION_IF_NULL(param); param->set_name(parameter_name); param->set_abstract(parameter_abstract_ptr); ParameterPtr graph_parameter = kernel_graph_ptr->NewParameter(param); return graph_parameter; } void KernelAdjust::InsertDeviceLoopCtrl(const std::shared_ptr &kernel_graph_ptr) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); std::map device_loop_ctrl_tensors; std::map device_loop_ctrl_params; // current loop count device_loop_ctrl_tensors[kCurLoopCountName] = CreateTensor(0); device_loop_ctrl_params[kCurLoopCountName] = CreateParameter(kernel_graph_ptr, kCurLoopCountName); // next loop count tensor device_loop_ctrl_tensors[kNextLoopCountName] = CreateTensor(0); device_loop_ctrl_params[kNextLoopCountName] = CreateParameter(kernel_graph_ptr, kNextLoopCountName); // current epoch count tensor device_loop_ctrl_tensors[kCurEpochCountName] = CreateTensor(0); device_loop_ctrl_params[kCurEpochCountName] = CreateParameter(kernel_graph_ptr, kCurEpochCountName); // constant one tensor 
device_loop_ctrl_tensors[kConstOneName] = CreateTensor(1); device_loop_ctrl_params[kConstOneName] = CreateParameter(kernel_graph_ptr, kConstOneName); // constant loop num in epoch tensor int32_t initial_value = 0; if (NeedLoopSink()) { // iter_num minus one because the device side counts from 0 initial_value = SizeToInt(LongToSize(ConfigManager::GetInstance().iter_num() - 1)); } else { MS_LOG(INFO) << "Tensor const_loop_num_in_epoch only used in loop sink mode."; initial_value = 0; } MS_LOG(INFO) << "Loop num in epoch is " << initial_value; device_loop_ctrl_tensors[kConstLoopNumInEpochName] = CreateTensor(initial_value); device_loop_ctrl_params[kConstLoopNumInEpochName] = CreateParameter(kernel_graph_ptr, kConstLoopNumInEpochName); kernel_graph_ptr->set_device_loop_ctrl_tensors(device_loop_ctrl_tensors); kernel_graph_ptr->set_device_loop_ctrl_params(device_loop_ctrl_params); } void KernelAdjust::AssignLoopCtrlTensorMem(const session::KernelGraph &kernel_graph, KernelRuntime *runtime_instance, const string name) { MS_EXCEPTION_IF_NULL(runtime_instance); auto device_loop_control_params = kernel_graph.device_loop_control_params(); if (!device_loop_control_params.count(name)) { MS_LOG(WARNING) << "Can't find Device Loop Control Parameter " << name; return; } auto param = device_loop_control_params.at(name); MS_EXCEPTION_IF_NULL(param); DeviceAddressPtr device_address = nullptr; if (AnfAlgo::OutputAddrExist(param, 0)) { device_address = AnfAlgo::GetMutableOutputAddr(param, 0); MS_EXCEPTION_IF_NULL(device_address); } else { MS_LOG(INFO) << "Device Loop Control Parameter " << name << " have no address, allocating..."; auto size = AnfAlgo::GetOutputTensorMemSize(param, 0); auto format = AnfAlgo::GetOutputFormat(param, 0); auto type_id = AnfAlgo::GetOutputDeviceDataType(param, 0); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); device_address = std::make_shared(nullptr, size, format, 
type_id, kAscendDevice, device_id); device_address->set_is_ptr_persisted(true); if (runtime_instance->MallocMem(kStaticMem, size, device_address) == nullptr) { MS_LOG(EXCEPTION) << "Cannot alloc static memory for device loop control parameter " << name << " , tensor size is : " << size; } MS_EXCEPTION_IF_NULL(device_address); AnfAlgo::SetOutputAddr(device_address, 0, param.get()); } auto device_loop_control_tensors = kernel_graph.device_loop_control_tensors(); auto tensor = device_loop_control_tensors.at(name); MS_EXCEPTION_IF_NULL(tensor); tensor->set_device_address(device_address); if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(param, 0), LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c(), tensor->device_info().host_format_)) { MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << name; } } void KernelAdjust::AssignLoopCtrlMemory(const session::KernelGraph &kernel_graph_ptr) { auto device_loop_control_tensors = kernel_graph_ptr.device_loop_control_tensors(); if (device_loop_control_tensors.empty()) { return; } MS_LOG(INFO) << "Assign device loop control memory"; auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); auto device_id = ms_context->get_param(MS_CTX_DEVICE_ID); auto runtime_instance = KernelRuntimeManager::Instance().GetSingleKernelRuntime(kAscendDevice, device_id); MS_EXCEPTION_IF_NULL(runtime_instance); AssignLoopCtrlTensorMem(kernel_graph_ptr, runtime_instance, kCurLoopCountName); AssignLoopCtrlTensorMem(kernel_graph_ptr, runtime_instance, kNextLoopCountName); AssignLoopCtrlTensorMem(kernel_graph_ptr, runtime_instance, kCurEpochCountName); AssignLoopCtrlTensorMem(kernel_graph_ptr, runtime_instance, kConstOneName); AssignLoopCtrlTensorMem(kernel_graph_ptr, runtime_instance, kConstLoopNumInEpochName); } void KernelAdjust::SetDeviceLoopCtrlTensor(const std::shared_ptr &kernel_graph_ptr, const std::string name, int32_t value) { 
MS_EXCEPTION_IF_NULL(kernel_graph_ptr); auto device_loop_control_tensors = kernel_graph_ptr->device_loop_control_tensors(); if (!device_loop_control_tensors.count(name)) { MS_LOG(WARNING) << "Can't find Device Loop Control Tensor " << name; return; } auto tensor = device_loop_control_tensors.at(name); MS_EXCEPTION_IF_NULL(tensor); auto *cur_val = static_cast(tensor->data_c()); MS_EXCEPTION_IF_NULL(cur_val); *cur_val = value; tensor->set_sync_status(kNeedSyncHostToDevice); auto device_address = tensor->device_address(); MS_EXCEPTION_IF_NULL(device_address); if (!device_address->SyncHostToDevice(tensor->shape(), LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c(), tensor->device_info().host_format_)) { MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << name; } } void KernelAdjust::LoadDeviceLoopCtrlParameters(const std::shared_ptr &kernel_graph_ptr) { MS_EXCEPTION_IF_NULL(kernel_graph_ptr); auto device_loop_control_tensors = kernel_graph_ptr->device_loop_control_tensors(); if (device_loop_control_tensors.empty()) { return; } MS_LOG(INFO) << "Load device loop control data"; SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurLoopCountName, 0); SetDeviceLoopCtrlTensor(kernel_graph_ptr, kNextLoopCountName, 0); #ifndef ENABLE_SECURITY SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, SizeToInt(DumpJsonParser::GetInstance().cur_dump_iter())); #else SetDeviceLoopCtrlTensor(kernel_graph_ptr, kCurEpochCountName, 0); #endif kernel_graph_ptr->set_current_epoch(kernel_graph_ptr->current_epoch() + 1); } } // namespace device } // namespace mindspore