From: @wenfangpei Reviewed-by: @coding2020, @gaoxiong1, @ckey_dou Signed-off-by: @ckey_dou pull/14648/MERGE
| @@ -472,7 +472,7 @@ class GraphSplitAscend(GraphSplitByPattern): | |||
def get_default_mode(self, op):
    """Return the default area mode for `op`.

    - "MatMul" is composite only when its first input has dtype "float16",
      otherwise basic.
    - "Tile", "BroadcastTo" and "ExpandDims" are always composite.
    - Every other primitive is basic.
    """
    if op.prim == "MatMul":
        return self.Area.MODE_COMPOSITE if op.inputs[0].dtype == "float16" else self.Area.MODE_BASIC
    # The superseded two-element check ("Tile", "BroadcastTo") is dropped: it had no
    # body of its own and is fully covered by the extended tuple below.
    if op.prim in ("Tile", "BroadcastTo", "ExpandDims"):
        return self.Area.MODE_COMPOSITE
    return self.Area.MODE_BASIC
| @@ -34,7 +34,9 @@ | |||
| #include "pipeline/jit/action.h" | |||
| #include "utils/context/graph_kernel_flags.h" | |||
| #include "vm/segment_runner.h" | |||
| #if ENABLE_GPU | |||
| #if ENABLE_D | |||
| #include "runtime/device/ascend/kernel_select_ascend.h" | |||
| #elif ENABLE_GPU | |||
| #include "runtime/device/gpu/kernel_info_setter.h" | |||
| #endif | |||
| @@ -620,7 +622,11 @@ bool IsBasicFuseOp(const AnfNodePtr &node) { | |||
| std::vector<PrimitivePtr> basic_ops = GetFusibleOpList(); | |||
| #if ENABLE_D | |||
| if (!CheckProcessor(node)) { | |||
| return false; | |||
| std::vector<PrimitivePtr> fused_aicpu_op = {prim::kPrimExpandDims, prim::kPrimReshape}; | |||
| if (!std::any_of(fused_aicpu_op.begin(), fused_aicpu_op.end(), | |||
| [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) { | |||
| return false; | |||
| } | |||
| } | |||
| #endif | |||
| return std::any_of(basic_ops.begin(), basic_ops.end(), | |||
| @@ -644,7 +650,9 @@ bool IsFusibleOp(const AnfNodePtr &node) { | |||
| void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) { | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(cnode); | |||
| #if ENABLE_GPU | |||
| #if ENABLE_D | |||
| device::ascend::SetKernelInfo(cnode, kernel_type); | |||
| #elif ENABLE_GPU | |||
| device::gpu::SetKernelInfo(cnode, kernel_type); | |||
| #endif | |||
| } | |||
| @@ -100,17 +100,17 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() { | |||
| PassManagerPtr GraphKernelOptimizer::Split() { | |||
| auto pm = std::make_shared<PassManager>("graphkernel_stage4_split"); | |||
| // Move the non-scalar tensor (in composite node) to parameter list | |||
| pm->AddPass(std::make_shared<TensorPromotion>()); | |||
| // Make certain nodes redundant so that they are used by only one user, | |||
| // which can avoid unnecessary input-output and get better performance. | |||
| if (is_gpu) { | |||
| // preprocess for ShapeOpsSplitter | |||
| pm->AddPass(std::make_shared<ExtendOutputForUpdateState>()); | |||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; | |||
| pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops)); | |||
| } | |||
| // preprocess for ShapeOpsSplitter | |||
| pm->AddPass(std::make_shared<ExtendOutputForUpdateState>()); | |||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; | |||
| pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops)); | |||
| // Split kernel according to costmodel | |||
| pm->AddPass(std::make_shared<GraphKernelSplitter>()); | |||
| @@ -120,11 +120,9 @@ PassManagerPtr GraphKernelOptimizer::Split() { | |||
| pm->AddPass(std::make_shared<GetitemTuple>()); | |||
| // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter | |||
| if (is_gpu) { | |||
| pm->AddPass(std::make_shared<MergeOutputForUpdateState>()); | |||
| pm->AddPass(std::make_shared<GraphKernelCSE>()); | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>()); | |||
| } | |||
| pm->AddPass(std::make_shared<MergeOutputForUpdateState>()); | |||
| pm->AddPass(std::make_shared<GraphKernelCSE>()); | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>()); | |||
| return pm; | |||
| } | |||
| @@ -359,12 +359,19 @@ class Splitter { | |||
| Splitter(const CNodePtr &main_cnode, SplitSchemerPtr split_schemer) | |||
| : main_func_graph_(main_cnode->func_graph()), old_subgraph_cnode_(main_cnode), split_schemer_(split_schemer) {} | |||
| void ResetInlinedNodesKernelInfo() { | |||
| for (const auto &node : inlined_nodes_) { | |||
| ResetKernelInfo(node); | |||
| } | |||
| } | |||
| // Maintain new subgraphs in main graph. | |||
| void RebuildGraph(const std::vector<size_t> &cnodes_group_id) { | |||
| BindFuncGraph(); | |||
| RecoverParameter(); | |||
| ConnectToMainGraph(cnodes_group_id); | |||
| UpdateSubGraphInfo(); | |||
| ResetInlinedNodesKernelInfo(); | |||
| } | |||
| // Rebind nodes to its new sub_func_graph | |||
| @@ -420,7 +427,7 @@ class Splitter { | |||
| } | |||
| } | |||
| if (AnfAlgo::IsRealKernel(node)) { | |||
| ResetKernelInfo(node); | |||
| inlined_nodes_.push_back(node); | |||
| } | |||
| } | |||
| } | |||
| @@ -533,6 +540,7 @@ class Splitter { | |||
| FuncGraphPtr main_func_graph_; | |||
| CNodePtr old_subgraph_cnode_; // The cnode that holds the original sub_func_graph | |||
| std::vector<CNodePtr> new_subgraph_cnodes_; // The cnode list that hold the new sub_func_graph | |||
| std::vector<AnfNodePtr> inlined_nodes_; | |||
| SplitSchemerPtr split_schemer_; | |||
| std::unordered_map<ParameterPtr, AnfNodePtr> param_to_main_graph_node_map_; | |||
| }; | |||
| @@ -54,6 +54,7 @@ | |||
| #include "debug/data_dump/dump_json_parser.h" | |||
| #include "debug/tensor_load.h" | |||
| #include "debug/anf_ir_utils.h" | |||
| #include "backend/optimizer/graph_kernel/shape_ops_splitter.h" | |||
| #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" | |||
| #include "backend/session/ascend_auto_monad.h" | |||
| #include "debug/data_dump/e2e_dump_util.h" | |||
| @@ -515,6 +515,56 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kern | |||
| } | |||
| return select_status; | |||
| } | |||
| void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) { | |||
| auto kernel_info = static_cast<device::KernelInfo *>(kernel_node->kernel_info()); | |||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||
| auto kernel_build_info = kernel_info->select_kernel_build_info(); | |||
| MS_EXCEPTION_IF_NULL(kernel_build_info); | |||
| if (AnfAlgo::IsGraphKernel(kernel_node)) { | |||
| return; | |||
| } | |||
| auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(); | |||
| builder->SetOriginDataFormat(kernel_build_info->GetOriginDataFormat()); | |||
| builder->SetInputsFormat(kernel_build_info->GetAllInputFormats()); | |||
| builder->SetInputsDeviceType(kernel_build_info->GetAllInputDeviceTypes()); | |||
| builder->SetOutputsFormat(kernel_build_info->GetAllOutputFormats()); | |||
| builder->SetOutputsDeviceType(kernel_build_info->GetAllOutputDeviceTypes()); | |||
| builder->SetOpPattern(kernel_build_info->op_pattern()); | |||
| builder->SetFusionType(kernel_build_info->fusion_type()); | |||
| auto new_kernel_type = kernel_type; | |||
| auto new_processor = kernel_build_info->processor(); | |||
| if (kernel_type == UNKNOWN_KERNEL_TYPE) { | |||
| std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list; | |||
| std::vector<std::shared_ptr<kernel::KernelBuildInfo>> aicpu_kernel_info_list; | |||
| kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type); | |||
| auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list); | |||
| if (select_status != kNoMatched) { | |||
| new_kernel_type = TBE_KERNEL; | |||
| new_processor = kernel::Processor::AICORE; | |||
| MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses TBE_KERNEL"; | |||
| } else { | |||
| kernel::AICPUQuery(kernel_node, &aicpu_kernel_info_list); | |||
| select_status = SetMatchedKernelInfo(kernel_node, aicpu_kernel_info_list); | |||
| if (select_status != kNoMatched) { | |||
| new_kernel_type = AICPU_KERNEL; | |||
| new_processor = kernel::Processor::AICPU; | |||
| MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AICPU_KERNEL"; | |||
| } | |||
| } | |||
| } | |||
| if (new_kernel_type == UNKNOWN_KERNEL_TYPE) { | |||
| new_kernel_type = AKG_KERNEL; | |||
| new_processor = kernel::Processor::AICORE; | |||
| MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AKG_KERNEL"; | |||
| } | |||
| builder->SetKernelType(new_kernel_type); | |||
| builder->SetProcessor(new_processor); | |||
| AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get()); | |||
| } | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||
| @@ -31,6 +31,7 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, | |||
| KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE); | |||
| void SetTensorDeviceInfo(const CNodePtr &kernel_node); | |||
| void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph); | |||
| void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type); | |||
| } // namespace ascend | |||
| } // namespace device | |||
| } // namespace mindspore | |||