/** * Copyright 2019-2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include "backend/session/ascend_session.h" #include #include #include #include #include #include #include "base/core_ops.h" #include "base/base_ref_utils.h" #include "ir/tensor.h" #include "ir/anf.h" #include "common/trans.h" #include "runtime/device/kernel_runtime.h" #include "runtime/device/ascend/kernel_select_ascend.h" #include "runtime/device/ascend/kernel_build_ascend.h" #include "runtime/device/ascend/ascend_kernel_runtime.h" #include "runtime/device/ascend/profiling/profiling_manager.h" #include "backend/optimizer/ascend/ascend_backend_optimization.h" #include "backend/optimizer/common/common_backend_optimization.h" #include "backend/optimizer/ascend/mindir/space_batch_nd_attr_update.h" #include "backend/optimizer/ascend/mindir/dropout_unify_mindir.h" #include "backend/optimizer/ascend/mindir/maxpool_to_maxpool_with_argmax.h" #include "backend/optimizer/ascend/mindir/maxpool_with_argmax_unify_mindir.h" #include "backend/optimizer/ascend/mindir/conv2d_unify_mindir.h" #include "backend/optimizer/ascend/mindir/optimizer_unify_output.h" #include "backend/optimizer/ascend/mindir/sparse_softmax_cross_entropy_with_logits_unify_mindir.h" #include "backend/optimizer/ascend/mindir/slice_grad_unify_mindir.h" #include "backend/optimizer/ascend/mindir/avg_pool_grad_unify_mindir.h" #include "backend/optimizer/ascend/mindir/bn_grad_unify_mindir.h" #include "runtime/device/kernel_adjust.h" #include "runtime/device/ascend/ascend_stream_assign.h" #include "backend/session/anf_runtime_algorithm.h" #include "utils/ms_utils.h" #include "utils/context/graph_kernel_flags.h" #include "backend/optimizer/common/helper.h" #include "runtime/device/kernel_runtime_manager.h" #include "utils/config_manager.h" #include "debug/data_dump/dump_json_parser.h" #include "debug/tensor_load.h" #include "debug/anf_ir_utils.h" #include "backend/optimizer/graph_kernel/shape_ops_splitter.h" #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/session/ascend_auto_monad.h" #include "debug/data_dump/e2e_dump_util.h" #include "debug/anf_ir_dump.h" #include "debug/dump_proto.h" #ifdef ENABLE_DEBUGGER #include "debug/debugger/proto_exporter.h" #else #include "debug/debugger/proto_exporter_stub.h" #endif #include "toolchain/adx_datadump_server.h" #ifdef ENABLE_DUMP_IR #include "debug/rdr/running_data_recorder.h" #include "debug/rdr/recorder_manager.h" #include "debug/rdr/graph_recorder.h" #endif #if ENABLE_CPU && ENABLE_D #include "ps/util.h" #include "ps/ps_cache/ps_cache_manager.h" #endif #include "runtime/device/ascend/ascend_bucket.h" #include "profiler/device/common/memory_profiling.h" using mindspore::device::ascend::ProfilingManager; using mindspore::profiler::MemoryProfiling; static constexpr uint32_t kLabelSwitchLabelId = 2; namespace mindspore { namespace session { const size_t kInvalidIndex = SIZE_MAX; constexpr size_t kReturnDataIndex = 1; constexpr char SR_TAG[] = "sr_tag"; constexpr char BACKWARD[] = "backward"; namespace { void DumpGraphExeOrder(const std::vector &execution_order, const std::string &tag = "") { MS_LOG(INFO) << "Dump execution_order size " << execution_order.size(); MS_LOG(INFO) << "[index][stream_label][graph_id][node string]"; int i = 0; for (auto &cnode : execution_order) { MS_EXCEPTION_IF_NULL(cnode); MS_LOG(INFO) << "[ " << i << "]" << "[" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "]" << "[" << AnfAlgo::GetGraphId(cnode.get()) << "]" << "[" << cnode->DebugString() << "]"; i++; } std::stringstream buf; buf << "================== execution order ==================\n"; if (!tag.empty()) { buf << tag << "\n"; } buf << "execution_order size: " << execution_order.size() << "\n"; i = 0; for (auto &cnode : execution_order) { MS_EXCEPTION_IF_NULL(cnode); buf << i << ":\n"; buf << "\t" << cnode->DebugString() << "\n"; buf << "\t" << AnfAlgo::GetStreamDistinctionLabel(cnode.get()) << "\n"; buf << "\t" << AnfAlgo::GetGraphId(cnode.get()) << "\n"; i++; } buf << "================== execution order ==================\n"; } // Handle control flow by auto-monad. void HandleControlFlow(NotNull graph) { AscendAutoMonad auto_monad(graph); auto_monad.Run(); } void SetStreamDistinctionLabel(const KernelGraphPtr &graph, uint32_t label, bool is_override) { MS_EXCEPTION_IF_NULL(graph); if (is_override || graph->stream_distinction_label() == kInvalidDistincLabel) { graph->set_stream_distinction_label(label); } } std::vector GetCNodes(const std::vector &anf_nodes) { std::vector cnodes = {}; for (const auto &anf : anf_nodes) { MS_EXCEPTION_IF_NULL(anf); if (anf->isa()) { cnodes.push_back(anf->cast()); } } return cnodes; } void InsertMakeTupleForOutput(NotNull root_graph) { auto return_node = root_graph->get_return(); MS_EXCEPTION_IF_NULL(return_node); if (return_node->size() <= kReturnDataIndex) { return; } auto make_tuple = root_graph->NewCNode( {NewValueNode(std::make_shared(prim::kPrimMakeTuple->name())), root_graph->output()}); root_graph->set_output(make_tuple); } TensorPtr GetCNodeOutputStubTensor(const KernelWithIndex &kernel_with_index, const std::map &node_output_info, bool *output_is_weight) { MS_EXCEPTION_IF_NULL(output_is_weight); const auto &iter = node_output_info.find(kernel_with_index); if (iter == node_output_info.end()) { MS_LOG(EXCEPTION) << "Can not find output stub tensor of cnode " << kernel_with_index.first->DebugString(); } *output_is_weight = iter->second.is_weight; return iter->second.output_stub_tensor; } void GenOpOutputStubTensor(const KernelGraphPtr &single_op_graph, const CNodePtr &kernel, const std::map &cnode_refcount, std::map *op_output_info) { MS_EXCEPTION_IF_NULL(single_op_graph); MS_EXCEPTION_IF_NULL(kernel); MS_EXCEPTION_IF_NULL(op_output_info); OutputTensorInfo output_tensor_info; size_t out_idx = 0; for (const auto &output : single_op_graph->outputs()) { KernelWithIndex kernel_with_index = std::make_pair(kernel, out_idx++); if (cnode_refcount.find(kernel_with_index) == cnode_refcount.end()) { continue; } const auto &output_kernel_with_index = AnfAlgo::VisitKernel(output, 0); const auto &output_node = output_kernel_with_index.first; const auto &output_index = output_kernel_with_index.second; auto out_abstract = output_node->abstract(); MS_EXCEPTION_IF_NULL(out_abstract); if (out_abstract->isa()) { out_abstract = out_abstract->cast()->elements()[output_index]; MS_EXCEPTION_IF_NULL(out_abstract); } abstract::AbstractTensorPtr tensor_abstract = out_abstract->cast(); MS_EXCEPTION_IF_NULL(tensor_abstract); const auto &infer_type = AnfAlgo::GetOutputInferDataType(output_node, output_index); tensor::TensorPtr stub_output_tensor = std::make_shared(infer_type, tensor_abstract->shape()->shape(), nullptr); const auto &output_type = AnfAlgo::GetOutputDeviceDataType(output_node, output_index); const auto &output_shape = AnfAlgo::GetOutputDeviceShape(output_node, output_index); const auto &output_format = AnfAlgo::GetOutputFormat(output_node, output_index); tensor::DeviceInfo device_info; device_info.format_ = output_format; device_info.data_type_ = TypeIdToType(output_type); stub_output_tensor->set_device_info(device_info); device::DeviceAddressPtr device_address = std::make_shared(nullptr, 0, output_format, output_type); stub_output_tensor->set_device_address(device_address); output_tensor_info.output_stub_tensor = stub_output_tensor; output_tensor_info.is_weight = !dynamic_cast(output_node->kernel_info())->is_feature_map(); (*op_output_info)[kernel_with_index] = output_tensor_info; } } } // namespace void AscendSession::Init(uint32_t device_id) { InitExecutor(kAscendDevice, device_id); } void AscendSession::UnifyMindIR(const KernelGraphPtr &graph) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { std::string file_name = "hwopt_d_before_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir"; DumpIR(file_name, graph); DumpIRProto(graph, "before_unify_mindir_hwopt_" + std::to_string(graph->graph_id())); } auto optimizer = std::make_shared(); auto unify_mindir_pm = std::make_shared("unify_mindir_pm"); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kGraphMode) { unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); } else { unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); } unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); unify_mindir_pm->AddPass(std::make_shared()); optimizer->AddPassManager(unify_mindir_pm); (void)optimizer->Optimize(graph); graph->SetExecOrderByDefault(); if (save_graphs) { std::string file_name = "hwopt_d_after_unify_mindir_graph_" + std::to_string(graph->graph_id()) + ".ir"; DumpIR(file_name, graph); } } GraphId AscendSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) { MS_LOG(INFO) << "Start"; // construct graph, if successfully, graph_sum_ + 1 auto graph = ConstructKernelGraph(lst, outputs); auto graph_id = graph->graph_id(); InitAllBucket(graph); MS_LOG(INFO) << "Compile graph " << graph_id << " success"; return graph_id; } bool IsBackward(const CNodePtr &cnode) { auto prim = GetValueNode(cnode->input(0)); return prim->HasAttr(BACKWARD); } // compare the value of send/recv sr_tag bool comp(const CNodePtr &node1, const CNodePtr &node2) { auto prim1 = GetValueNode(node1->input(0)); MS_EXCEPTION_IF_NULL(prim1); auto prim2 = GetValueNode(node1->input(0)); MS_EXCEPTION_IF_NULL(prim2); auto sr_tag_value1 = prim1->GetAttr(SR_TAG); MS_EXCEPTION_IF_NULL(sr_tag_value1); auto sr_tag_value2 = prim2->GetAttr(SR_TAG); MS_EXCEPTION_IF_NULL(sr_tag_value2); auto sr_tag1 = GetValue(sr_tag_value1); auto sr_tag2 = GetValue(sr_tag_value2); return sr_tag1 < sr_tag2; } // Reorder the execution order of send void ReorderSend(std::vector *execution_order, std::vector op_v) { auto last_node = op_v.back(); for (auto &node : op_v) { if (node == last_node) { continue; } auto node_iter = std::find(execution_order->begin(), execution_order->end(), node); (void)execution_order->erase(node_iter); } std::sort(op_v.begin(), op_v.end(), comp); auto last_node_iter = std::find(execution_order->begin(), execution_order->end(), last_node); auto node_iter = execution_order->erase(last_node_iter); // all send will insert the end of the last node execution_order->insert(node_iter, op_v.begin(), op_v.end()); } // Reorder the execution order of receive void ReorderRecv(std::vector *execution_order, std::vector op_v) { auto begin_node = op_v.front(); for (auto &node : op_v) { if (node == begin_node) { continue; } auto node_iter = std::find(execution_order->begin(), execution_order->end(), node); (void)execution_order->erase(node_iter); } std::sort(op_v.begin(), op_v.end(), comp); auto begin_node_iter = std::find(execution_order->begin(), execution_order->end(), begin_node); auto node_iter = execution_order->erase(begin_node_iter); // all receive will insert before the begin node execution_order->insert(node_iter, op_v.begin(), op_v.end()); } void ReorderSendRecv(std::vector *execution_order) { std::vector forward_send, forward_recv, backward_send, backward_recv; for (auto &cnode : *execution_order) { if (IsPrimitiveCNode(cnode, prim::kPrimSend) && IsBackward(cnode)) { backward_send.push_back(cnode); continue; } else if (IsPrimitiveCNode(cnode, prim::kPrimSend)) { forward_send.push_back(cnode); continue; } if (IsPrimitiveCNode(cnode, prim::kPrimReceive) && IsBackward(cnode)) { backward_recv.push_back(cnode); } else if (IsPrimitiveCNode(cnode, prim::kPrimReceive)) { forward_recv.push_back(cnode); } } if (!forward_send.empty()) { ReorderSend(execution_order, forward_send); } if (!backward_send.empty()) { ReorderSend(execution_order, backward_send); } if (!forward_recv.empty()) { ReorderRecv(execution_order, forward_recv); } if (!backward_recv.empty()) { ReorderRecv(execution_order, backward_recv); } } GraphId AscendSession::CompileGraphImpl(NotNull func_graph) { MS_LOG(INFO) << "Start"; std::vector all_graphs; auto root_graph = ConstructKernelGraph(func_graph, &all_graphs); // Update Graph Dynamic Shape Attr UpdateAllGraphDynamicShapeAttr(all_graphs); for (const auto &graph : all_graphs) { UnifyMindIR(graph); } BackendOptimization(all_graphs); // empty graph dont entry to backend if (root_graph->execution_order().empty()) { MS_LOG(INFO) << root_graph->ToString() << " is empty graph."; InsertMakeTupleForOutput(NOT_NULL(root_graph)); root_graph->set_executable(false); InitRuntimeResource(); return root_graph->graph_id(); } // Handle control flow by auto-monad. HandleControlFlow(NOT_NULL(root_graph)); // resource initialize InitRuntimeResource(); std::set memo; IrFusionPass(NOT_NULL(root_graph), NOT_NULL(&memo)); memo.clear(); SelectKernel(NOT_NULL(root_graph)); memo.clear(); HardwareOptimize(NOT_NULL(root_graph), NOT_NULL(&memo)); memo.clear(); // load graphs to debugger. if (debugger_ && debugger_->DebuggerBackendEnabled()) { LoadGraphsToDbg(NOT_NULL(root_graph), NOT_NULL(&memo)); } memo.clear(); UpdateRefOutputMap(NOT_NULL(root_graph), NOT_NULL(&memo)); memo.clear(); // add make_tuple to the output graph InsertMakeTupleForOutput(NOT_NULL(root_graph)); // root root_graph valiate,include genearte execute order and so on RootGraphExecutorValidate(NOT_NULL(root_graph)); // dump graph before remove nop nodes auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { DumpIRProto(root_graph, "before_removeNop_" + std::to_string(graph_sum_)); } // adjust kernel AdjustKernel(root_graph); // reorder send/recv auto execution_order = root_graph->execution_order(); ReorderSendRecv(&execution_order); root_graph->set_execution_order(execution_order); #if ENABLE_CPU && ENABLE_D InitPsWorker(root_graph); #endif // assign stream AssignStream(NOT_NULL(root_graph)); // insert profiling point device::KernelAdjust::GetInstance().Profiling(NOT_NULL(root_graph.get())); // build kernel BuildKernel(root_graph); if (debugger_ && debugger_->partial_memory()) { debugger_->PreExecute(root_graph, graph_sum_); } SetSummaryNodes(root_graph.get()); // Alloc memory for child graph's inputs AssignStaticMemory(NOT_NULL(root_graph), NOT_NULL(&memo)); memo.clear(); // Alloc memory for root graph's inputs and node's outputs, workspace MemoryAlloc(root_graph.get()); // generate and load task into device Load(root_graph); root_graph->SetInputNodes(); root_graph->SetOptimizerFlag(); DumpAllGraphs(all_graphs); // Save memory profiling data to proto file if (ProfilingManager::GetInstance().IsProfiling()) { auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); uint64_t mem_size = runtime_instance->GetAvailableMemMaxSize(); auto instance = MemoryProfiling::GetInstance(); instance.SetDeviceMemSize(mem_size); instance.SaveMemoryProfiling(); } // return the root_graph id to backend auto graph_id = root_graph->graph_id(); return graph_id; } void AscendSession::SetFinalGraphSummaryFlag(const std::shared_ptr &kernel_graph) { MS_EXCEPTION_IF_NULL(kernel_graph); auto graph_order = GetGraphOrder(kernel_graph->graph_id()); for (auto graph_id : graph_order) { auto child_graph = GetGraph(graph_id); if (child_graph == nullptr) { continue; } if (child_graph->summary_node_exist()) { kernel_graph->set_summary_node_exist(true); return; } } kernel_graph->set_summary_node_exist(false); } void AscendSession::BuildGraphImpl(GraphId graph_id) { MS_LOG(INFO) << "Start"; auto graph = GetGraph(graph_id); MS_EXCEPTION_IF_NULL(graph); // resource initialize InitRuntimeResource(); // multiple graph handle if (graph_id == final_graph_id_) { if (!graph->executable()) { return; } SetFinalGraphSummaryFlag(graph); // OptChildGraphs auto graph_order = GetGraphOrder(final_graph_id_); auto &graph_type = GetGraphOrderType(final_graph_id_); for (size_t i = 0; i < graph_order.size(); i++) { if (!(graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START)) { auto child_graph = GetGraph(graph_order[i]); CompileChildGraph(child_graph); } } SetSummaryNodes(graph.get()); // merge child graph MergeGraphExecOrder(); } else { auto single_graph = GetGraph(graph_id); MS_EXCEPTION_IF_NULL(single_graph); CompileChildGraph(single_graph); // set the distinction label of single graph single_graph->set_stream_distinction_label(graph_id); single_graph->UpdateExecuteKernelStreamLabel(); } // adjust execution order because merge child graph and other special operations AdjustKernel(graph); #if ENABLE_CPU && ENABLE_D InitPsWorker(graph); #endif // Assign streams for control sink and hccl and so on AssignStream(NOT_NULL(graph)); device::KernelAdjust::GetInstance().Profiling(NOT_NULL(graph.get())); // build kernel if node is cnode BuildKernel(graph); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (debugger_ && debugger_->partial_memory()) { debugger_->PreExecute(graph, graph_sum_); } if (ms_context->get_param(MS_CTX_PRECOMPILE_ONLY)) { MS_LOG(INFO) << "Precompile only, stop in build kernel step"; } else { // alloc memory, including static memory and dynamic memory MemoryAlloc(graph.get()); // generate and load task info to device if it is sink mode Load(graph); } // sync the initial const tensor to device SyncInitialTenosrToDevice(); DumpAllGraphs({graph}); MS_LOG(INFO) << "End"; } void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) { MS_EXCEPTION_IF_NULL(child_graph); MS_LOG(INFO) << "CompileChildGraph " << child_graph->ToString(); opt::AscendBackendIRFusionOptimization(child_graph); child_graph->SetExecOrderByDefault(); auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { std::string file_name = "select_kernel_before_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; DumpIR(file_name, child_graph); } // select kernel build info SelectKernel(*child_graph); if (save_graphs) { std::string file_name = "select_kernel_after_graph_" + std::to_string(child_graph->graph_id()) + ".ir"; DumpIR(file_name, child_graph); } // optimize graph HardwareOptimize(child_graph); // assign static memory of parameters auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); runtime_instance->AssignStaticMemoryInput(child_graph.get()); runtime_instance->AssignStaticMemoryValueNode(child_graph.get()); } bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedInsertSwitch(); } void AscendSession::RunGraphImpl(const GraphId &graph_id, const std::vector &inputs, VectorRef *const outputs) { MS_LOG(INFO) << "Start"; auto kernel_graph = GetGraph(graph_id); MS_EXCEPTION_IF_NULL(kernel_graph); // if none of child graph and no anf output exists if (!kernel_graph->executable()) { MS_LOG(INFO) << "No child graph has anf output"; return; } // load data to extra params std::set memo; SyncDataToExtraParams(NOT_NULL(kernel_graph), NOT_NULL(&memo)); memo.clear(); if (debugger_) { debugger_->PreExecute(kernel_graph, graph_sum_); } #if ENABLE_CPU && ENABLE_D // Initialize parameter server InitPSParamAndOptim(kernel_graph, inputs); std::string channel_name; if (ps::PsDataPrefetch::GetInstance().cache_enable() && IsGetNextGraph(graph_id, &channel_name)) { ps::ps_cache_instance.IncreaseGraphStep(channel_name); } #endif { // run task on device Execute(kernel_graph, true); } // summary Summary(kernel_graph.get()); // load tensor from device for debugger if (debugger_ && debugger_->debugger_enabled()) { LoadTensor(kernel_graph); } // debugger post-execution processing if (debugger_) { debugger_->PostExecute(); } MS_LOG(INFO) << "Finish!"; } void AscendSession::RunOpHardwareOptimize(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start"; // data layout optimization opt::AscendDataLayout(kernel_graph); // mixed precision optimization opt::AscendMixPrecision(kernel_graph); MS_LOG(INFO) << "Finish"; } bool AscendSession::GraphCacheExist(const GraphInfo &graph_info) const { return run_op_graphs_.find(graph_info) != run_op_graphs_.end(); } void AscendSession::BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info, const std::vector &input_tensors, const std::vector &tensors_mask) { MS_LOG(INFO) << "Build op " << op_run_info.op_name << " start !"; if (GraphCacheExist(graph_info)) { MS_LOG(INFO) << "Build op " << op_run_info.op_name << " graph cache has existed !"; return; } const auto &graph = PreBuildOp(op_run_info, graph_info, input_tensors, tensors_mask); MS_EXCEPTION_IF_NULL(graph); // init runtime resource InitRuntimeResource(); // build kernel RunOpAdjustKernel(graph); BuildKernel(graph); run_op_graphs_[graph_info] = graph; MS_LOG(INFO) << "Build op " << op_run_info.op_name << " finish !"; } void AscendSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info, std::vector *input_tensors, VectorRef *outputs, const std::vector &tensors_mask) { MS_EXCEPTION_IF_NULL(input_tensors); MS_EXCEPTION_IF_NULL(op_run_info); BuildOpImpl(*op_run_info, graph_info, *input_tensors, tensors_mask); EraseValueNodeTensor(tensors_mask, input_tensors); // wait for allreduce for (auto &tensor : *input_tensors) { if (tensor->NeedWaitDevice()) { tensor->WaitDevice(); } } // Run op auto graph = run_op_graphs_[graph_info]; MS_EXCEPTION_IF_NULL(graph); MS_LOG(INFO) << "Run op " << op_run_info->op_name << " start!"; // malloc mem RunOpRemoveNopNode(graph); RunOpMemoryAlloc(*input_tensors, graph.get()); // Build dynamic kernel if (op_run_info->is_dynamic_shape) { BuildDynamicKernel(graph); } // load input data to device LoadInputData(graph, *input_tensors); // run op Execute(graph, false); // get output UpdateOutputs(graph, outputs, *input_tensors); // update output abstract of dynamic op to op_run_info if (op_run_info->is_dynamic_shape) { UpdateOutputAbstract(graph, op_run_info); } RunOpMemoryClear(graph.get()); MS_LOG(INFO) << "Run op " << op_run_info->op_name << " finish!"; } KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info, const GraphInfo &graph_info, const std::vector &input_tensors, const std::vector &tensors_mask) { // Construct graph include one op auto graph = ConstructSingleOpGraph(op_run_info, input_tensors, tensors_mask, true); MS_EXCEPTION_IF_NULL(graph); opt::RunOpAscendBackendIRFusionOptimization(graph); SelectKernel(*graph); RunOpHardwareOptimize(graph); return graph; } void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map ¶meter_index, const std::vector &graph_inputs, const std::map &node_output_info, InputTensorInfo *input_tensor_info) { MS_EXCEPTION_IF_NULL(cnode); MS_EXCEPTION_IF_NULL(input_tensor_info); for (size_t i = 1; i < cnode->inputs().size(); i += 1) { const auto &input = cnode->input(i); auto kernel_with_index = AnfAlgo::VisitKernel(input, 0); auto real_input = kernel_with_index.first; MS_EXCEPTION_IF_NULL(real_input); tensor::TensorPtr tensor = nullptr; if (real_input->isa()) { tensor = GetValueNodeOutputTensor(real_input, kernel_with_index.second); input_tensor_info->input_tensors_mask.emplace_back(kParameterDataTensorMask); } else if (real_input->isa()) { tensor = GetParameterOutputTensor(real_input, parameter_index, graph_inputs); auto parameter = real_input->cast(); MS_EXCEPTION_IF_NULL(parameter); input_tensor_info->input_tensors_mask.emplace_back(parameter->has_default() ? kParameterWeightTensorMask : kParameterDataTensorMask); } else if (real_input->isa()) { bool output_is_weight = false; tensor = GetCNodeOutputStubTensor(kernel_with_index, node_output_info, &output_is_weight); input_tensor_info->input_tensors_mask.emplace_back(output_is_weight ? kParameterWeightTensorMask : kParameterDataTensorMask); } else { MS_LOG(EXCEPTION) << "Invalid input node, node = " << real_input->DebugString(); } MS_EXCEPTION_IF_NULL(tensor); MS_LOG(DEBUG) << "Get" << i << "th input tensor of " << cnode->fullname_with_scope() << " from " << real_input->fullname_with_scope() << "-" << kernel_with_index.second; input_tensor_info->input_tensors.emplace_back(tensor); } } void AscendSession::BuildOpsInGraph(const GraphId &graph_id, const std::map ¶meter_index, const std::vector &graph_inputs, const std::map &cnode_refcount) { if (built_graph_id_.find(graph_id) != built_graph_id_.end()) { return; } auto graph = GetGraph(graph_id); MS_EXCEPTION_IF_NULL(graph); std::map op_output_info; std::vector kernels; std::unordered_map> single_op_graphs; // Collect kernels need to be built in single op graphs for (const auto &kernel : graph->execution_order()) { // Generate fake input tensors, tensor masks and input kernel with index InputTensorInfo input_tensor_info; GetOpInputStubTensors(kernel, parameter_index, graph_inputs, op_output_info, &input_tensor_info); // Get OpRunInfo and GraphInfo OpRunInfo op_run_info; GetSingleOpRunInfo(kernel, &op_run_info); if (op_run_info.is_dynamic_shape) { MS_LOG(INFO) << "BuildOpsInGraph stop, op " << op_run_info.op_name << " is dynamic shape."; break; } const GraphInfo &graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); const auto &single_op_graph_iter = run_op_graphs_.find(graph_info); if (single_op_graph_iter != run_op_graphs_.end()) { // if graph of same single op exists, the output tensor of current op should be generated const auto &single_op_graph = single_op_graph_iter->second; GenOpOutputStubTensor(single_op_graph, kernel, cnode_refcount, &op_output_info); continue; } const auto &single_op_graph = PreBuildOp(op_run_info, graph_info, input_tensor_info.input_tensors, input_tensor_info.input_tensors_mask); MS_EXCEPTION_IF_NULL(single_op_graph); GenOpOutputStubTensor(single_op_graph, kernel, cnode_refcount, &op_output_info); opt::HideNopNode(single_op_graph.get()); // The graph info could have been changed in PreBuildOp const GraphInfo &new_graph_info = GetSingleOpGraphInfo(kernel, input_tensor_info.input_tensors); single_op_graphs.insert({single_op_graph, {graph_info, new_graph_info}}); const auto &execution_order = single_op_graph->execution_order(); std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels)); } InitRuntimeResource(); // Compile all kernels parallel BuildKernel(kernels); // Some new kernel may be added after KernelBuildPreprocess, so collect and build kernels again kernels.clear(); for (const auto &single_op_graph : single_op_graphs) { device::ascend::KernelBuildPreprocess(single_op_graph.first.get()); const auto &execution_order = single_op_graph.first->execution_order(); std::copy(execution_order.begin(), execution_order.end(), std::back_inserter(kernels)); } BuildKernel(kernels); // Record single op graphs in run_op_graphs_ so that these graphs can be reused in BuildOpImpl for (const auto &single_op_graph : single_op_graphs) { RunOpMemoryClear(single_op_graph.first.get()); for (const auto &graph_info : single_op_graph.second) { run_op_graphs_[graph_info] = single_op_graph.first; MS_LOG(DEBUG) << "Pre build op finished, graph info: " << graph_info; } } built_graph_id_.insert(graph_id); } // compile graph steps void AscendSession::SelectKernel(const KernelGraph &kernel_graph) const { MS_LOG(INFO) << "Start!"; size_t raise_precision_count = 0; size_t reduce_precision_count = 0; for (const auto &cnode : kernel_graph.execution_order()) { auto status = device::ascend::SelectKernelInfo(cnode); AnfAlgo::EraseNodeAttr(kAttrPynativeNextOpName, cnode); AnfAlgo::EraseNodeAttr(kAttrPynativeNextIndex, cnode); if (status == device::ascend::kStatusRaisePrecision) { raise_precision_count++; } else if (status == device::ascend::kStatusReducePrecision) { reduce_precision_count++; } MS_LOG(INFO) << "Select ApplyKernel: " << cnode->DebugString(); } auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kGraphMode) { if (raise_precision_count > 0) { MS_LOG(WARNING) << "There has " << raise_precision_count << " node/nodes used raise precision to selected the kernel!"; } if (reduce_precision_count > 0) { MS_LOG(WARNING) << "There has " << reduce_precision_count << " node/nodes used reduce precision to selected the kernel!"; } } MS_LOG(INFO) << "Finish!"; } void DumpInit() { auto &json_parser = DumpJsonParser::GetInstance(); json_parser.Parse(); json_parser.CopyJsonToDir(); if (json_parser.async_dump_enabled()) { if (AdxDataDumpServerInit() != 0) { MS_LOG(EXCEPTION) << "Adx data dump server init failed"; } } } void AscendSession::InitRuntimeResource() { MS_LOG(INFO) << "Start!"; auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); if (!runtime_instance->Init()) { MS_LOG(EXCEPTION) << "Kernel runtime init error."; } DumpInit(); MS_LOG(INFO) << "Finish!"; } void AscendSession::HardwareOptimize(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "HardwareOptimize start!"; opt::AscendBackendOptimization(kernel_graph); opt::AscendGraphKernelCommonProcess(kernel_graph); GraphKernelOptimize(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph); kernel_graph->SetExecOrderByDefault(); MS_LOG(INFO) << "HardwareOptimize Finish!"; } void AscendSession::GraphKernelOptimize(const std::shared_ptr &kernel_graph) const { if (!context::GraphKernelFlags::GetInstance().IsEnableGraphKernel()) { return; } opt::GraphKernelOptimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); } void AscendSession::AdjustKernel(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; opt::HideNopNode(kernel_graph.get()); // Insert CLearZero op // prepare for next step from json get atomic info BuildKernel(kernel_graph); device::ascend::KernelBuildPreprocess(kernel_graph.get()); device::KernelAdjust::GetInstance().InsertSwitchLoop(kernel_graph); auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { DumpIR("after_adjust_kernel.ir", kernel_graph); } MS_LOG(INFO) << "Finish!"; } void AscendSession::RunOpAdjustKernel(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; RunOpHideNopNode(kernel_graph); // Insert CLearZero op // prepare for next step from json get atomic info BuildKernel(kernel_graph); device::ascend::KernelBuildPreprocess(kernel_graph.get()); MS_LOG(INFO) << "Finish!"; } void AscendSession::AssignStream(NotNull kernel_graph) const { MS_LOG(INFO) << "Start!"; device::ascend::AscendStreamAssign::GetInstance().AssignStream(kernel_graph); MS_LOG(INFO) << "Finish!"; } void AscendSession::BuildKernel(const std::shared_ptr &kernel_graph) const { BuildKernel(kernel_graph->execution_order()); } void AscendSession::BuildKernel(const std::vector &kernels) const { MS_LOG(INFO) << "Start!"; struct timeval start_time, end_time; (void)gettimeofday(&start_time, nullptr); auto ret = device::ascend::KernelBuild(kernels); if (!ret) { MS_LOG(EXCEPTION) << "Kernel build error."; } (void)gettimeofday(&end_time, nullptr); const uint64_t kUSecondInSecond = 1000000; uint64_t cost = kUSecondInSecond * static_cast(end_time.tv_sec - start_time.tv_sec); cost += static_cast(end_time.tv_usec - start_time.tv_usec); MS_LOG(INFO) << "KernelBuild run in " << PRIu64 << " us " << cost; MS_LOG(INFO) << "Finish!"; } void AscendSession::BuildDynamicKernel(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); const auto &kernels = kernel_graph->execution_order(); auto iter = std::find_if(kernels.begin(), kernels.end(), [](const CNodePtr &kernel) { return AnfAlgo::GetBooleanAttr(kernel, kAttrOutputIsDynamicShape); }); if (iter == kernels.end()) { return; } auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); if (!runtime_instance->GenDynamicKernel(kernel_graph.get())) { MS_LOG(DEBUG) << "Graph:" << kernel_graph->graph_id() << " failed to generate dynamic kernel!"; } MS_LOG(INFO) << "Finish!"; } void AscendSession::MemoryAlloc(KernelGraph *kernel_graph) const { MS_LOG(INFO) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); runtime_instance->AssignMemory(kernel_graph); MS_LOG(INFO) << "Finish!"; } void AscendSession::RunOpMemoryAlloc(const std::vector &input_tensors, KernelGraph *kernel_graph) const { MS_LOG(INFO) << "Start memory alloc!"; MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); runtime_instance->RunOpAssignMemory(input_tensors, kernel_graph); MS_LOG(INFO) << "Finish!"; } void AscendSession::RunOpMemoryClear(const KernelGraph *kernel_graph) const { MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); runtime_instance->RunOpClearMemory(kernel_graph); } void AscendSession::Load(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); (void)device::KernelAdjust::GetInstance().StepLoadCtrlInputs(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); bool ret_ok = runtime_instance->Load(kernel_graph.get(), is_task_sink); if (!ret_ok) { MS_LOG(EXCEPTION) << "Load task error!"; } MS_LOG(INFO) << "Finish!"; } void AscendSession::Execute(const std::shared_ptr &kernel_graph, bool is_task) const { MS_LOG(INFO) << "Start!"; bool is_task_sink = false; if (is_task) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); is_task_sink = context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK); } auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); bool ret_ok = runtime_instance->Run(kernel_graph.get(), is_task_sink); Dump(kernel_graph); if (!ret_ok) { #ifdef ENABLE_DUMP_IR mindspore::RDR::TriggerAll(); #endif MS_LOG(EXCEPTION) << "run task error!"; } MS_LOG(INFO) << "Finish!"; } void AscendSession::Dump(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); E2eDumpUtil::DumpData(kernel_graph.get(), device_id_); MS_LOG(INFO) << "Finish!"; } void AscendSession::DumpAllGraphs(const std::vector &all_graphs) { #ifdef ENABLE_DUMP_IR auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); auto &json_parser = DumpJsonParser::GetInstance(); json_parser.Parse(); if (!save_graphs && !json_parser.e2e_dump_enabled() && !json_parser.async_dump_enabled() && !mindspore::RecorderManager::Instance().RdrEnable()) { return; } auto kernel_runtime = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(kernel_runtime); uint32_t device_id = kernel_runtime->device_id(); for (auto &graph : all_graphs) { MS_EXCEPTION_IF_NULL(graph); std::string name = "graph_build." + std::to_string(graph->graph_id()); DumpGraphParams dump_params = {true, static_cast(kWholeStack)}; mindspore::RDR::RecordAnfGraph(SUBMODULE_ID, name, graph, dump_params, ".ir;.pb"); if (save_graphs) { std::string file_name = "graph_build_" + std::to_string(graph->graph_id()) + ".ir"; DumpIR(file_name, graph, true, kWholeStack); DumpIRProto(graph, "vm_build_" + std::to_string(graph->graph_id())); DumpIR("trace_code_graph", graph, true, kWholeStack); } std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id()); if (json_parser.e2e_dump_enabled()) { std::string root_dir = json_parser.path() + "/" + json_parser.net_name() + "/device_" + std::to_string(device_id); std::string target_dir = root_dir + "/graphs"; std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir"; DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack); DumpIR("trace_code_graph", graph, true, kWholeStack, ir_file_path); DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv", root_dir, graph->execution_order()); } else if (json_parser.async_dump_enabled()) { std::string root_dir = json_parser.path() + "/device_" + std::to_string(device_id); std::string target_dir = root_dir + "/graphs"; std::string ir_file_path = target_dir + "/" + "ms_output_" + final_graph + ".ir"; DumpIRProtoWithSrcInfo(graph, final_graph, target_dir, kDebugWholeStack); DumpIR("trace_code_graph", graph, true, kWholeStack, ir_file_path); DumpGraphExeOrder("ms_execution_order_graph_" + std::to_string(graph->graph_id()) + ".csv", root_dir, graph->execution_order()); } } #endif } void AscendSession::LoadTensor(const std::shared_ptr &kernel_graph) const { MS_LOG(INFO) << "Start!"; MS_EXCEPTION_IF_NULL(kernel_graph); auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); (void)runtime_instance->LoadData(kernel_graph.get()); MS_LOG(INFO) << "Finish!"; } void AscendSession::RecurseSetSummaryNodes(KernelGraph *graph, std::map> *summary) { MS_EXCEPTION_IF_NULL(graph); MS_EXCEPTION_IF_NULL(summary); // if final graph have no child graph auto graph_order_iter = graph_execute_orders_.find(graph->graph_id()); if (graph_order_iter == graph_execute_orders_.end()) { SessionBasic::SetSummaryNodes(graph); auto summary_nodes = graph->summary_nodes(); summary->insert(summary_nodes.begin(), summary_nodes.end()); return; } // for every child graph, find summary nodes auto graph_order = GetGraphOrder(graph->graph_id()); for (size_t i = 0; i < graph_order.size(); i++) { auto child_graph = GetGraph(graph_order[i]); if (child_graph == nullptr) { continue; } SessionBasic::SetSummaryNodes(child_graph.get()); auto child_graph_summary = child_graph->summary_nodes(); summary->insert(child_graph_summary.begin(), child_graph_summary.end()); RecurseSetSummaryNodes(child_graph.get(), summary); } graph->set_summary_nodes(*summary); } void AscendSession::SetSummaryNodes(KernelGraph *graph) { MS_LOG(DEBUG) << "Update summary Start"; MS_EXCEPTION_IF_NULL(graph); auto summary_nodes = graph->summary_nodes(); std::map> summary; summary.insert(summary_nodes.begin(), summary_nodes.end()); RecurseSetSummaryNodes(graph, &summary); graph->set_summary_nodes(summary); MS_LOG(DEBUG) << "Update summary end size: " << summary.size(); } void AscendSession::MergeGraphExecOrder() { MS_LOG(INFO) << "Start!"; // merge graph order auto &graph_order = GetGraphOrder(final_graph_id_); auto &graph_type = GetGraphOrderType(final_graph_id_); auto final_graph = GetGraph(final_graph_id_); MS_EXCEPTION_IF_NULL(final_graph); if (graph_order.empty()) { MS_LOG(WARNING) << "Graph output is a lonely variable not linked to any op!"; return; } if (graph_order.size() > 1) { auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); if (!context_ptr->get_param(MS_CTX_ENABLE_TASK_SINK)) { MS_LOG(EXCEPTION) << "Control sink network should run with task-sink mode!"; } } // if first graph is common,the final graph has no label,then set the stream of final graph same with the first graph SetStreamDistinctionLabel(final_graph, graph_order[0], false); std::vector final_exec_order = final_graph->execution_order(); KernelGraphPtr last_graph = nullptr; for (size_t i = 0; i < graph_order.size(); i++) { auto graph_id = graph_order[i]; if (graph_type[i] == BRANCH_END || graph_type[i] == BRANCH_START) { continue; } auto child_graph = GetGraph(graph_id); last_graph = child_graph; MS_EXCEPTION_IF_NULL(child_graph); auto exec_order = child_graph->execution_order(); MS_LOG(INFO) << "Merge graph,graph_id " << graph_id; (void)std::transform(exec_order.begin(), exec_order.end(), std::back_inserter(final_exec_order), [&](CNodePtr node) -> CNodePtr { AnfAlgo::SetStreamDistinctionLabel(child_graph->stream_distinction_label(), node.get()); return node; }); // add all value nodes of child graphs to final graph for (auto &value_node : child_graph->graph_value_nodes()) { final_graph->AddValueNodeToGraph(value_node); } // copy ref map to final graph auto child_ref_map = child_graph->GetRefMap(); for (auto &item : child_ref_map) { if (final_graph->IsInRefOutputMap(item.first)) { MS_LOG(EXCEPTION) << "The ref pair is already in final graph!"; } final_graph->AddRefCorrespondPairs(item.first, item.second); } } // set final_exec_order into final graph MS_EXCEPTION_IF_NULL(final_graph); DumpGraphExeOrder(final_exec_order); final_graph->set_execution_order(final_exec_order); } const std::vector &AscendSession::GetGraphOrder(GraphId final_graph_id) const { auto graph_order_iter = graph_execute_orders_.find(final_graph_id); if (graph_order_iter == graph_execute_orders_.end()) { MS_LOG(EXCEPTION) << "Final graph" << final_graph_id << "has no child graph"; } return graph_order_iter->second; } const std::vector &AscendSession::GetGraphOrderType(GraphId final_graph_id) const { auto graph_type_iter = graph_order_types_.find(final_graph_id); if (graph_type_iter == graph_order_types_.end()) { MS_LOG(EXCEPTION) << "Final graph" << final_graph_id << "has no graph_order_types_"; } return graph_type_iter->second; } void AscendSession::SyncInitialTenosrToDevice() { for (auto &item : initial_tenosrs_) { auto to_graph_id = item.first.first; auto input_idx = item.first.second; auto front_tensor = item.second; auto to_graph = GetGraph(to_graph_id); MS_EXCEPTION_IF_NULL(to_graph); std::vector graph_inputs = to_graph->inputs(); if (input_idx >= graph_inputs.size()) { MS_LOG(EXCEPTION) << "Input_index " << input_idx << " out of range size " << graph_inputs.size(); } auto backend_parameter = graph_inputs[input_idx]; // sync data from host to device MS_EXCEPTION_IF_NULL(front_tensor); size_t tensor_size = front_tensor->data().nbytes(); auto addr = AnfAlgo::GetOutputAddr(backend_parameter, 0); MS_EXCEPTION_IF_NULL(addr); if (!addr->SyncHostToDevice(trans::GetRuntimePaddingShape(backend_parameter, 0), tensor_size, front_tensor->data_type(), front_tensor->data_c())) { MS_LOG(EXCEPTION) << "Tensor SyncHostToDevice fail!"; } } } void AscendSession::BackendOptimization(const std::vector &all_graphs) { MS_LOG(INFO) << "Start BackendCommonOptimization"; for (auto &graph : all_graphs) { opt::BackendCommonOptimization(graph); } MS_LOG(INFO) << "End."; } void AscendSession::LinkChildGraphs(NotNull graph) { AscendControlParser::LinkGraph(graph); } bool AscendSession::IsMultiCallGraph(NotNull graph, std::vector parent_graphs) { std::stack post_graph; std::set memo; post_graph.push(graph->graph_id()); while (!post_graph.empty()) { auto graph_id = post_graph.top(); post_graph.pop(); memo.insert(graph_id); for (auto child_graph : graphs_[graph_id]->child_graph_order()) { std::shared_ptr child_graph_ptr = child_graph.lock(); MS_EXCEPTION_IF_NULL(child_graph_ptr); if (std::find(parent_graphs.begin(), parent_graphs.end(), child_graph_ptr->graph_id()) != parent_graphs.end()) { MS_LOG(DEBUG) << "graph:" << graph->graph_id() << " will call its parent graph:" << child_graph_ptr->graph_id(); return false; } else if (memo.find(child_graph_ptr->graph_id()) == memo.end()) { MS_LOG(DEBUG) << "child graph:" << child_graph_ptr->graph_id() << " into deque, wait for check."; post_graph.push(child_graph_ptr->graph_id()); } } } return true; } void AscendSession::MultiCallGraphOptimize(NotNull root_graph) { for (auto current : parent_graphs_) { if (current.second.size() < 2) { continue; } auto graph = graphs_[current.first]; auto parent_kernel_graphs = current.second; if (!IsMultiCallGraph(NOT_NULL(graph), parent_kernel_graphs)) { MS_LOG(DEBUG) << "graph:" << graph->graph_id() << " with it's parent graphs make up a cycle"; continue; } MS_LOG(INFO) << "graph: " << graph->graph_id() << " has been called by more than two graphs"; int32_t index = 0; std::vector child_graphs; auto start_label_id = AnfAlgo::GetNodeAttr(graph->get_start_label(), kAttrLabelIndex); auto end_node = graph->get_end_goto(); ParameterPtr post_label_param = graph->AddExtraParamAndTensor("label_param", 0); std::vector new_inputs = {std::make_shared(std::make_shared(kLabelSwitchOpName)), post_label_param}; for (auto graph_id : parent_kernel_graphs) { auto kg = graphs_[graph_id]; auto nodes = kg->execution_order(); for (uint32_t i = 0; i < nodes.size(); i++) { if (AnfAlgo::IsLabelIndexInNode(nodes[i], start_label_id)) { if (i < (nodes.size() - 1)) { new_inputs.push_back(nodes[i + 1]); } else { MS_LOG(EXCEPTION) << "No labelset after labelgoto"; } ParameterPtr pre_label_param = kg->AddExtraParamAndTensor("label_param", index++); AscendControlParser::InsertMultipleAssignToGraph(NOT_NULL(kg), nodes[i], NOT_NULL(pre_label_param), NOT_NULL(post_label_param)); } } kg->SetExecOrderByDefault(); child_graphs.push_back(kg); } end_node->set_inputs(new_inputs); AnfAlgo::SetNodeAttr(kAttrChildGraph, MakeValue>(child_graphs), end_node); std::vector label_list; for (size_t i = kLabelSwitchLabelId; i < end_node->size(); ++i) { auto input = end_node->input(i); MS_EXCEPTION_IF_NULL(input); if (!input->isa() || AnfAlgo::GetCNodeName(input) != kLabelSetOpName) { break; } uint32_t goto_label_id = AnfAlgo::GetNodeAttr(input, kAttrLabelIndex); label_list.push_back(goto_label_id); MS_LOG(INFO) << "Switch " << end_node->DebugString() << " case " << i - kLabelSwitchLabelId << ": id " << goto_label_id; } AnfAlgo::SetNodeAttr(kAttrLabelSwitchList, MakeValue>(label_list), end_node); end_node->set_inputs({end_node->input(kAnfPrimitiveIndex), end_node->input(kFirstDataInputIndex)}); graph->SetExecOrderByDefault(); } } void AscendSession::SyncDataToExtraParams(NotNull graph, NotNull *> memo) { if (memo->find(graph.get()) != memo->end()) { return; } memo->insert(graph.get()); auto extra_param_tensor = graph->GetExtraParamAndTensor(); for (uint32_t i = 0; i < extra_param_tensor.size(); i++) { auto param = extra_param_tensor[i].first; auto tensor = extra_param_tensor[i].second; auto device_address = AnfAlgo::GetMutableOutputAddr(param, 0); MS_EXCEPTION_IF_NULL(device_address); tensor->set_device_address(device_address); if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(param, 0), LongToSize(tensor->data().nbytes()), tensor->data_type(), tensor->data_c())) { MS_LOG(EXCEPTION) << "SyncHostToDevice failed."; } } for (auto &child_graph : graph->child_graph_order()) { SyncDataToExtraParams(NOT_NULL(child_graph.lock()), memo); } } void AscendSession::RootGraphExecutorValidate(NotNull graph) { AscendAutoMonad auto_monad(graph); auto_monad.GenerateExecuteOrder(); } void AscendSession::CreateMultiBranchOutput(NotNull graph, NotNull *> memo) { if (memo->find(graph.get()) != memo->end()) { return; } memo->insert(graph.get()); graph->UpdateChildGraphOrder(); for (auto &child_graph : graph->child_graph_order()) { CreateMultiBranchOutput(NOT_NULL(child_graph.lock()), memo); } std::map need_replace_list; auto node_list = GetCNodes(TopoSort(graph->get_return())); for (auto &node : node_list) { if (AnfAlgo::CheckPrimitiveType(node, prim::kPrimCall) || AnfAlgo::CheckPrimitiveType(node, prim::kPrimSwitch) || AnfAlgo::CheckPrimitiveType(node, prim::kPrimSwitchLayer)) { // create a parameter to store the output of multiple branch and set the parameter as the condition graph's output auto output_param = graph->TransTupleToMakeTuple(graph->NewParameter(node->abstract())); MS_EXCEPTION_IF_NULL(graph->MutableInputs()); graph->AddChildGraphResult(output_param); std::vector depend_inputs = { graph->NewValueNode(NewValueNode(std::make_shared(prim::kPrimDepend->name()))), output_param, node}; auto depend = graph->NewCNode(depend_inputs); depend->set_abstract(output_param->abstract()); need_replace_list.emplace(node, depend); MS_LOG(INFO) << "Create parameter " << output_param->DebugString() << " for call node " << node->DebugString() << ", depend node is " << depend->DebugString(); // insert assign in order to transfer child graph output to parameter auto child_graphs = AnfAlgo::GetCallSwitchKernelGraph(node); for (auto &child_graph : child_graphs) { MS_EXCEPTION_IF_NULL(child_graph); // If graph has no output, the graph is the true graph of while and will call condition graph, no need insert // assign from condition to true graph if (memo->find(child_graph) != memo->end()) { continue; } AscendControlParser::InsertMultipleAssignToGraph(NOT_NULL(child_graph), nullptr, NOT_NULL(child_graph->output()), NOT_NULL(output_param)); } } } // searching for nodes' input to replace call by depend(parameter, call) for (auto &node : node_list) { for (size_t i = 0; i < node->size(); ++i) { auto input = node->input(i); auto iter = need_replace_list.find(input); if (iter != need_replace_list.end()) { node->set_input(i, iter->second); } } } memo->erase(graph.get()); } void AscendSession::IrFusionPass(const NotNull graph, NotNull *> memo) { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); opt::AscendBackendIRFusionOptimization(graph); graph->SetExecOrderByDefault(); auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { std::string file_name = "select_kernel_before_graph_" + std::to_string(graph->graph_id()) + ".ir"; DumpIR(file_name, graph.get()); } for (auto &child_graph : graph->child_graph_order()) { IrFusionPass(NOT_NULL(child_graph.lock()), memo); } } void AscendSession::SelectKernel(NotNull root_graph) { MS_LOG(INFO) << "Start select kernel."; size_t raise_precision_count = 0; size_t reduce_precision_count = 0; std::set memo; (void)RecurseSelectKernelInfo(root_graph, NOT_NULL(&memo), &raise_precision_count, &reduce_precision_count); memo.clear(); auto ms_context = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(ms_context); if (ms_context->get_param(MS_CTX_EXECUTION_MODE) == kGraphMode) { if (raise_precision_count > 0) { MS_LOG(WARNING) << "There are " << raise_precision_count << " node/nodes used raise precision to selected the kernel!"; } if (reduce_precision_count > 0) { MS_LOG(WARNING) << "There are " << reduce_precision_count << " node/nodes used reduce precision to selected the kernel!"; } } MS_LOG(INFO) << "Finish!"; } void AscendSession::RecurseSelectKernelInfo(NotNull graph, NotNull *> const memo, size_t *const raise_precision_count, size_t *const reduce_precision_count) const { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); MS_LOG(INFO) << "Start to select kernel info in graph: " << graph->graph_id(); for (const auto &cnode : graph->execution_order()) { if (AnfAlgo::IsCondControlKernel(cnode)) { std::vector child_graphs; if (AnfAlgo::HasNodeAttr(kAttrChildGraph, cnode)) { child_graphs = AnfAlgo::GetNodeAttr>(cnode, kAttrChildGraph); } for (auto &child_graph : child_graphs) { RecurseSelectKernelInfo(NOT_NULL(child_graph), memo, raise_precision_count, reduce_precision_count); } } auto status = device::ascend::SelectKernelInfo(cnode); if (status == device::ascend::kStatusRaisePrecision) { (*raise_precision_count)++; } else if (status == device::ascend::kStatusReducePrecision) { (*reduce_precision_count)++; } } auto context_ptr = MsContext::GetInstance(); MS_EXCEPTION_IF_NULL(context_ptr); bool save_graphs = context_ptr->get_param(MS_CTX_SAVE_GRAPHS_FLAG); if (save_graphs) { std::string file_name = "select_kernel_after_graph_" + std::to_string(graph->graph_id()) + ".ir"; DumpIR(file_name, graph.get()); } MS_LOG(INFO) << "Finish selecting kernel info in graph: " << graph->graph_id(); } void AscendSession::HardwareOptimize(NotNull graph, NotNull *> const memo) const { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); MS_LOG(INFO) << "Start to do HardwareOptimize in graph: " << graph->graph_id(); HardwareOptimize(graph.get()); for (auto &child_graph : graph->child_graph_order()) { HardwareOptimize(NOT_NULL(child_graph.lock()), memo); } MS_LOG(INFO) << "Finish doing HardwareOptimize in graph: " << graph->graph_id(); } void AscendSession::LoadGraphsToDbg(NotNull graph, NotNull *> const memo) const { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id(); debugger_->LoadGraphs(graph); MS_LOG(INFO) << "graph_sum_: " << graph_sum_; for (auto &child_graph : graph->child_graph_order()) { LoadGraphsToDbg(NOT_NULL(child_graph.lock()), memo); } MS_LOG(INFO) << "Finish doing LoadGraphsToDbg in graph: " << graph->graph_id(); } void AscendSession::AssignStaticMemory(NotNull graph, NotNull *> const memo) const { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); MS_LOG(INFO) << "Start to assign static memory for parameter in graph: " << graph->graph_id(); // assign static memory for parameters auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); runtime_instance->ClearGlobalIdleMem(); runtime_instance->AssignStaticMemoryInput(graph.get().get()); runtime_instance->AssignStaticMemoryValueNode(graph.get().get()); for (auto &child_graph : graph->child_graph_order()) { AssignStaticMemory(NOT_NULL(child_graph.lock()), memo); } MS_LOG(INFO) << "Finish assigning static memory for parameter in graph: " << graph->graph_id(); } void AscendSession::UpdateRefOutputMap(NotNull graph, NotNull *> const memo) const { if (memo->find(graph) != memo->end()) { return; } memo->insert(graph.get()); for (auto &child_graph : graph->child_graph_order()) { std::shared_ptr child_graph_ptr = child_graph.lock(); MS_EXCEPTION_IF_NULL(child_graph_ptr); UpdateRefOutputMap(NOT_NULL(child_graph_ptr), memo); // copy ref map to final graph auto child_ref_map = child_graph_ptr->GetRefMap(); for (auto &item : child_ref_map) { if (graph->IsInRefOutputMap(item.first)) { MS_LOG(WARNING) << "The ref pair <" << item.first.first->DebugString() << ", " << item.first.second << "> is already in " << graph->ToString(); continue; } graph->AddRefCorrespondPairs(item.first, item.second); } } } void AscendSession::SyncStream() { auto runtime_instance = device::KernelRuntimeManager::Instance().GetKernelRuntime(kAscendDevice, device_id_); MS_EXCEPTION_IF_NULL(runtime_instance); auto ret = runtime_instance->SyncStream(); if (!ret) { MS_LOG(EXCEPTION) << "Sync stream error!"; } } std::shared_ptr AscendSession::CreateBucket(uint32_t bucket_id, uint32_t bucket_size) { return std::make_shared(bucket_id, bucket_size); } } // namespace session } // namespace mindspore