From: @wenfangpei
Reviewed-by: @coding2020, @gaoxiong1, @ckey_dou
Signed-off-by: @ckey_dou
@@ -472,7 +472,7 @@ class GraphSplitAscend(GraphSplitByPattern):
     def get_default_mode(self, op):
         if op.prim == "MatMul":
             return self.Area.MODE_COMPOSITE if op.inputs[0].dtype == "float16" else self.Area.MODE_BASIC
-        if op.prim in ("Tile", "BroadcastTo"):
+        if op.prim in ("Tile", "BroadcastTo", "ExpandDims"):
             return self.Area.MODE_COMPOSITE
         return self.Area.MODE_BASIC

@@ -34,7 +34,9 @@
 #include "pipeline/jit/action.h"
 #include "utils/context/graph_kernel_flags.h"
 #include "vm/segment_runner.h"
-#if ENABLE_GPU
+#if ENABLE_D
+#include "runtime/device/ascend/kernel_select_ascend.h"
+#elif ENABLE_GPU
 #include "runtime/device/gpu/kernel_info_setter.h"
 #endif

@@ -620,7 +622,11 @@ bool IsBasicFuseOp(const AnfNodePtr &node) {
   std::vector<PrimitivePtr> basic_ops = GetFusibleOpList();
 #if ENABLE_D
   if (!CheckProcessor(node)) {
-    return false;
+    std::vector<PrimitivePtr> fused_aicpu_op = {prim::kPrimExpandDims, prim::kPrimReshape};
+    if (!std::any_of(fused_aicpu_op.begin(), fused_aicpu_op.end(),
+                     [&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); })) {
+      return false;
+    }
   }
 #endif
   return std::any_of(basic_ops.begin(), basic_ops.end(),

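Note for reviewers: the patched gate above means a node that fails the AiCore processor check can still be considered for basic fusion if it is one of the whitelisted AICPU ops. A minimal, self-contained sketch of that control flow (the string-based node and the IsOp helper are illustrative stand-ins for the real AnfNodePtr/IsPrimitiveCNode types, not MindSpore APIs):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    // Stand-in for IsPrimitiveCNode(node, prim): a node is modeled by its op name.
    static bool IsOp(const std::string &node, const std::string &prim) { return node == prim; }

    // Sketch of the patched IsBasicFuseOp gate: nodes off AiCore are rejected
    // unless they are one of the fusible AICPU ops (ExpandDims, Reshape).
    static bool PassesProcessorGate(const std::string &node, bool on_aicore) {
      if (on_aicore) return true;
      const std::vector<std::string> fused_aicpu_op = {"ExpandDims", "Reshape"};
      return std::any_of(fused_aicpu_op.begin(), fused_aicpu_op.end(),
                         [&node](const std::string &prim) { return IsOp(node, prim); });
    }

    int main() {
      std::cout << PassesProcessorGate("ExpandDims", false) << "\n";  // 1: now fusible
      std::cout << PassesProcessorGate("MatMul", false) << "\n";      // 0: still rejected
      return 0;
    }
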
@@ -644,7 +650,9 @@ bool IsFusibleOp(const AnfNodePtr &node) {
 void ResetKernelInfo(const AnfNodePtr &node, KernelType kernel_type) {
   auto cnode = node->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
-#if ENABLE_GPU
+#if ENABLE_D
+  device::ascend::SetKernelInfo(cnode, kernel_type);
+#elif ENABLE_GPU
   device::gpu::SetKernelInfo(cnode, kernel_type);
 #endif
 }

@@ -100,17 +100,17 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() {
 PassManagerPtr GraphKernelOptimizer::Split() {
   auto pm = std::make_shared<PassManager>("graphkernel_stage4_split");
   // Move the non-scalar tensor (in composite node) to parameter list
   pm->AddPass(std::make_shared<TensorPromotion>());
   // Make certain nodes redundant so that they are used by only one user,
   // which can avoid unnecessary input-output and get better performance.
-  if (is_gpu) {
-    // preprocess for ShapeOpsSplitter
-    pm->AddPass(std::make_shared<ExtendOutputForUpdateState>());
-    std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
-    pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
-  }
+  // preprocess for ShapeOpsSplitter
+  pm->AddPass(std::make_shared<ExtendOutputForUpdateState>());
+  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
+  pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
   // Split kernel according to costmodel
   pm->AddPass(std::make_shared<GraphKernelSplitter>());

@@ -120,11 +120,9 @@ PassManagerPtr GraphKernelOptimizer::Split() {
   pm->AddPass(std::make_shared<GetitemTuple>());
   // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
-  if (is_gpu) {
-    pm->AddPass(std::make_shared<MergeOutputForUpdateState>());
-    pm->AddPass(std::make_shared<GraphKernelCSE>());
-    pm->AddPass(std::make_shared<EliminateRedundantOutput>());
-  }
+  pm->AddPass(std::make_shared<MergeOutputForUpdateState>());
+  pm->AddPass(std::make_shared<GraphKernelCSE>());
+  pm->AddPass(std::make_shared<EliminateRedundantOutput>());
   return pm;
 }

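With the two is_gpu guards removed, the Split stage now runs the same pass sequence on both GPU and Ascend builds. A small sketch that just enumerates the resulting pass order (names only; the real passes are the MindSpore classes added above):

    #include <iostream>
    #include <string>
    #include <vector>

    // Post-patch pass order of GraphKernelOptimizer::Split(), now common to
    // GPU and Ascend builds.
    int main() {
      const std::vector<std::string> split_stage = {
          "TensorPromotion",             // move non-scalar tensors to parameter list
          "ExtendOutputForUpdateState",  // preprocess for ShapeOpsSplitter
          "ShapeOpsSplitter",            // duplicate Reshape/ExpandDims/Cast per user
          "GraphKernelSplitter",         // split kernels according to the cost model
          "GetitemTuple",
          "MergeOutputForUpdateState",   // clean up the duplicated nodes
          "GraphKernelCSE",
          "EliminateRedundantOutput",
      };
      for (const auto &pass : split_stage) std::cout << pass << "\n";
      return 0;
    }
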
@@ -359,12 +359,19 @@ class Splitter {
   Splitter(const CNodePtr &main_cnode, SplitSchemerPtr split_schemer)
       : main_func_graph_(main_cnode->func_graph()), old_subgraph_cnode_(main_cnode), split_schemer_(split_schemer) {}

+  void ResetInlinedNodesKernelInfo() {
+    for (const auto &node : inlined_nodes_) {
+      ResetKernelInfo(node);
+    }
+  }
+
   // Maintain new subgraphs in main graph.
   void RebuildGraph(const std::vector<size_t> &cnodes_group_id) {
     BindFuncGraph();
     RecoverParameter();
     ConnectToMainGraph(cnodes_group_id);
     UpdateSubGraphInfo();
+    ResetInlinedNodesKernelInfo();
   }

   // Rebind nodes to its new sub_func_graph

| @@ -420,7 +427,7 @@ class Splitter { | |||||
| } | } | ||||
| } | } | ||||
| if (AnfAlgo::IsRealKernel(node)) { | if (AnfAlgo::IsRealKernel(node)) { | ||||
| ResetKernelInfo(node); | |||||
| inlined_nodes_.push_back(node); | |||||
| } | } | ||||
| } | } | ||||
| } | } | ||||
@@ -533,6 +540,7 @@ class Splitter {
   FuncGraphPtr main_func_graph_;
   CNodePtr old_subgraph_cnode_;                // The cnode that holds the original sub_func_graph
   std::vector<CNodePtr> new_subgraph_cnodes_;  // The cnode list that hold the new sub_func_graph
+  std::vector<AnfNodePtr> inlined_nodes_;
   SplitSchemerPtr split_schemer_;
   std::unordered_map<ParameterPtr, AnfNodePtr> param_to_main_graph_node_map_;
 };

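The three Splitter hunks above implement a defer-and-flush pattern: inlined real kernels are no longer reset on the spot, but collected in inlined_nodes_ and reset once RebuildGraph has finished rewiring the graph, so kernel selection sees the final graph. A minimal, self-contained sketch of the pattern (DeferredReset and the int node ids are illustrative stand-ins, not MindSpore types):

    #include <functional>
    #include <iostream>
    #include <vector>

    // Sketch of the defer-and-flush pattern: collect nodes during inlining,
    // reset their kernel info only after the graph rebuild is complete.
    class DeferredReset {
     public:
      void Defer(int node_id) { pending_.push_back(node_id); }  // ~ inlined_nodes_.push_back(node)
      void Flush(const std::function<void(int)> &reset) {       // ~ ResetInlinedNodesKernelInfo()
        for (int id : pending_) reset(id);
        pending_.clear();
      }

     private:
      std::vector<int> pending_;
    };

    int main() {
      DeferredReset splitter;
      splitter.Defer(1);  // recorded while inlining
      splitter.Defer(2);
      // ... graph rebuild steps run here ...
      splitter.Flush([](int id) { std::cout << "ResetKernelInfo(node " << id << ")\n"; });
      return 0;
    }
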
@@ -54,6 +54,7 @@
 #include "debug/data_dump/dump_json_parser.h"
 #include "debug/tensor_load.h"
 #include "debug/anf_ir_utils.h"
+#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/session/ascend_auto_monad.h"
 #include "debug/data_dump/e2e_dump_util.h"

@@ -515,6 +515,56 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node, KernelType kern
   }
   return select_status;
 }
+
+void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) {
+  auto kernel_info = static_cast<device::KernelInfo *>(kernel_node->kernel_info());
+  MS_EXCEPTION_IF_NULL(kernel_info);
+  auto kernel_build_info = kernel_info->select_kernel_build_info();
+  MS_EXCEPTION_IF_NULL(kernel_build_info);
+  if (AnfAlgo::IsGraphKernel(kernel_node)) {
+    return;
+  }
+  auto builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>();
+  builder->SetOriginDataFormat(kernel_build_info->GetOriginDataFormat());
+  builder->SetInputsFormat(kernel_build_info->GetAllInputFormats());
+  builder->SetInputsDeviceType(kernel_build_info->GetAllInputDeviceTypes());
+  builder->SetOutputsFormat(kernel_build_info->GetAllOutputFormats());
+  builder->SetOutputsDeviceType(kernel_build_info->GetAllOutputDeviceTypes());
+  builder->SetOpPattern(kernel_build_info->op_pattern());
+  builder->SetFusionType(kernel_build_info->fusion_type());
+  auto new_kernel_type = kernel_type;
+  auto new_processor = kernel_build_info->processor();
+  if (kernel_type == UNKNOWN_KERNEL_TYPE) {
+    std::vector<std::shared_ptr<kernel::KernelBuildInfo>> kernel_info_list;
+    std::vector<std::shared_ptr<kernel::KernelBuildInfo>> aicpu_kernel_info_list;
+    kernel::KernelQuery(kernel_node, &kernel_info_list, kernel_type);
+    auto select_status = SetMatchedKernelInfo(kernel_node, kernel_info_list);
+    if (select_status != kNoMatched) {
+      new_kernel_type = TBE_KERNEL;
+      new_processor = kernel::Processor::AICORE;
+      MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses TBE_KERNEL";
+    } else {
+      kernel::AICPUQuery(kernel_node, &aicpu_kernel_info_list);
+      select_status = SetMatchedKernelInfo(kernel_node, aicpu_kernel_info_list);
+      if (select_status != kNoMatched) {
+        new_kernel_type = AICPU_KERNEL;
+        new_processor = kernel::Processor::AICPU;
+        MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AICPU_KERNEL";
+      }
+    }
+  }
+  if (new_kernel_type == UNKNOWN_KERNEL_TYPE) {
+    new_kernel_type = AKG_KERNEL;
+    new_processor = kernel::Processor::AICORE;
+    MS_LOG(INFO) << kernel_node->fullname_with_scope() << " uses AKG_KERNEL";
+  }
+  builder->SetKernelType(new_kernel_type);
+  builder->SetProcessor(new_processor);
+  AnfAlgo::SetSelectKernelBuildInfo(builder->Build(), kernel_node.get());
+}
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore

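The new SetKernelInfo re-selects a backend for a node whose kernel type is unknown, falling back in a fixed order: TBE (AiCore) first, then AICPU, then AKG on AiCore. A compact sketch of just that cascade, with the two query steps abstracted to booleans (ChooseKernel and the enum are illustrative stand-ins, not the real MindSpore declarations):

    #include <iostream>

    enum KernelType { UNKNOWN_KERNEL_TYPE, TBE_KERNEL, AICPU_KERNEL, AKG_KERNEL };

    // Fallback order implemented by SetKernelInfo when the incoming type is
    // UNKNOWN_KERNEL_TYPE: prefer TBE, then AICPU, else compile with AKG.
    KernelType ChooseKernel(bool tbe_matched, bool aicpu_matched) {
      if (tbe_matched) return TBE_KERNEL;      // KernelQuery found an AiCore kernel
      if (aicpu_matched) return AICPU_KERNEL;  // AICPUQuery found an AICPU kernel
      return AKG_KERNEL;                       // default: AKG on AiCore
    }

    int main() {
      std::cout << ChooseKernel(true, false) << "\n";   // 1 (TBE_KERNEL)
      std::cout << ChooseKernel(false, true) << "\n";   // 2 (AICPU_KERNEL)
      std::cout << ChooseKernel(false, false) << "\n";  // 3 (AKG_KERNEL)
      return 0;
    }
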
@@ -31,6 +31,7 @@ KernelSelectStatus SelectKernelInfo(const CNodePtr &kernel_node,
                                     KernelType kernel_type = KernelType::UNKNOWN_KERNEL_TYPE);
 void SetTensorDeviceInfo(const CNodePtr &kernel_node);
 void SelectGraphKernelInfo(const CNodePtr &kernel_node, const FuncGraphPtr &func_graph);
+void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type);
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore