Removed one redundant CSE pass from the GPU pass pipeline; several common passes were enabled for Ascend. (tags/v1.2.0-rc1)
| @@ -0,0 +1,171 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" | |||||
| #include <vector> | |||||
| #include <string> | |||||
| #include <memory> | |||||
| #include "ir/func_graph.h" | |||||
| #include "utils/ms_context.h" | |||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean.h" | |||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" | |||||
| #include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h" | |||||
| #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" | |||||
| #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" | |||||
| #include "backend/optimizer/graph_kernel/clean_all_in_once.h" | |||||
| #include "backend/optimizer/graph_kernel/depend_formater.h" | |||||
| #include "backend/optimizer/graph_kernel/eliminate_redundant_output.h" | |||||
| #include "backend/optimizer/graph_kernel/tensor_promotion.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_expander.h" | |||||
| #include "backend/optimizer/graph_kernel/raise_reduction_precision.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_cse.h" | |||||
| #include "backend/optimizer/graph_kernel/shape_ops_splitter.h" | |||||
| #include "backend/optimizer/graph_kernel/value_graph_binder.h" | |||||
| #include "backend/optimizer/graph_kernel/parallel_fusion.h" | |||||
| #include "backend/optimizer/graph_kernel/optimize_assign.h" | |||||
| #include "backend/optimizer/graph_kernel/split_assign.h" | |||||
| #include "backend/optimizer/graph_kernel/reorder_ops.h" | |||||
| #include "backend/optimizer/pass/getitem_tuple.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| PassManagerPtr GraphKernelOptimizer::PreProcess() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage1_preprocess"); | |||||
| // Change Assign(p, a, U) to Assign(Depend(p, U), a) | |||||
| pm->AddPass(std::make_shared<SplitAssign>()); | |||||
| // Move the Depend nodes to the bottom of graph | |||||
| pm->AddPass(std::make_shared<DependFormater>()); | |||||
| // Reorder TransData-Cast to Cast-TransData, | |||||
| if (is_ascend) { | |||||
| pm->AddPass(std::make_shared<ReorderOps>()); | |||||
| } | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::Cluster() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage2_cluster"); | |||||
| // Expand complex basic kernels to composite kernels | |||||
| pm->AddPass(std::make_shared<GraphKernelExpander>()); | |||||
| // Fuse basic kernels and composite kernels | |||||
| pm->AddPass(std::make_shared<BasicOpsFusion>()); | |||||
| // Eliminate the outputs without external user | |||||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>()); | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage3_highlevelopt1"); | |||||
| // Replace Assign with InplaceAssign, and replace original output with overridden parameters | |||||
| pm->AddPass(std::make_shared<OptimizeAssign>()); | |||||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>()); | |||||
| // Cast the input of ReduceSum from float16 to float32 for higher precision*/ | |||||
| pm->AddPass(std::make_shared<RaiseReductionPrecision>()); | |||||
| // Universal arithmetic simplify | |||||
| if (is_gpu) { | |||||
| pm->AddPass(std::make_shared<ArithmeticSimplify>()); | |||||
| } | |||||
| // Common subexpression elimination | |||||
| pm->AddPass(std::make_shared<GraphKernelCSE>()); | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::Split() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage4_split"); | |||||
| // Move the non-scalar tensor (in composite node) to parameter list | |||||
| pm->AddPass(std::make_shared<TensorPromotion>()); | |||||
| // Make certain nodes redundant so that they are used by only one user, | |||||
| // which can avoid unnecessary input-output and get better performance. | |||||
| if (is_gpu) { | |||||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; | |||||
| pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops)); | |||||
| } | |||||
| // Split kernel according to costmodel | |||||
| pm->AddPass(std::make_shared<GraphKernelSplitter>()); | |||||
| // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter | |||||
| if (is_gpu) { | |||||
| pm->AddPass(std::make_shared<GraphKernelCSE>()); | |||||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>()); | |||||
| } | |||||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | |||||
| // will be exposed, use GetitemTuple Pass to delete them. | |||||
| pm->AddPass(std::make_shared<GetitemTuple>()); | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage5_highlevelopt2"); | |||||
| // Enable atomic add | |||||
| if (is_gpu) { | |||||
| pm->AddPass(std::make_shared<AtomicCleanInsertter>()); | |||||
| pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>()); | |||||
| } else /* if (is_ascend) */ { | |||||
| pm->AddPass(std::make_shared<CleanAddAtomic>()); | |||||
| } | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::Combine() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage6_combine"); | |||||
| // Enable parallel fusion | |||||
| if (is_gpu) { | |||||
| // Prevent fake loop in parallel fusion | |||||
| pm->AddPass(std::make_shared<DependFormater>()); | |||||
| // Do parallel fusion for gpu device | |||||
| pm->AddPass(std::make_shared<ParallelOpFusion>(kGPUDevice, ParallelConfig(7))); | |||||
| } | |||||
| return pm; | |||||
| } | |||||
| PassManagerPtr GraphKernelOptimizer::PostProcess() { | |||||
| auto pm = std::make_shared<PassManager>("graphkernel_stage7_postprocess"); | |||||
| // Add the new tensors to the kernel_graph | |||||
| pm->AddPass(std::make_shared<BindValueToGraph>()); | |||||
| return pm; | |||||
| } | |||||
| void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) { | |||||
| auto context_ptr = MsContext::GetInstance(); | |||||
| MS_EXCEPTION_IF_NULL(context_ptr); | |||||
| is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice); | |||||
| is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice); | |||||
| auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer"); | |||||
| optimizer->AddPassManager(PreProcess()); | |||||
| optimizer->AddPassManager(Cluster()); | |||||
| optimizer->AddPassManager(HighLevelOpt1()); | |||||
| optimizer->AddPassManager(Split()); | |||||
| optimizer->AddPassManager(HighLevelOpt2()); | |||||
| optimizer->AddPassManager(Combine()); | |||||
| optimizer->AddPassManager(PostProcess()); | |||||
| (void)optimizer->Optimize(kernel_graph); | |||||
| } | |||||
| void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); } | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,54 @@ | |||||
| /** | |||||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_ | |||||
| #include "ir/anf.h" | |||||
| #include "ir/func_graph.h" | |||||
| #include "backend/session/kernel_graph.h" | |||||
| #include "backend/optimizer/common/optimizer.h" | |||||
| #include "backend/optimizer/common/pass_manager.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| class GraphKernelOptimizer { | |||||
| public: | |||||
| void Run(const KernelGraphPtr &kernel_graph); | |||||
| private: | |||||
| // Pre-process | |||||
| PassManagerPtr PreProcess(); | |||||
| // Cluster kernels | |||||
| PassManagerPtr Cluster(); | |||||
| // High level optimize 1 | |||||
| PassManagerPtr HighLevelOpt1(); | |||||
| // Split kernels | |||||
| PassManagerPtr Split(); | |||||
| // High level optimize 2 | |||||
| PassManagerPtr HighLevelOpt2(); | |||||
| // Combine kernels | |||||
| PassManagerPtr Combine(); | |||||
| // Post-process | |||||
| PassManagerPtr PostProcess(); | |||||
| bool is_gpu{false}; | |||||
| bool is_ascend{false}; | |||||
| }; | |||||
| void GraphKernelOptimize(const KernelGraphPtr &kernel_graph); | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_ | |||||
| @@ -17,6 +17,7 @@ | |||||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_SHAPE_OPS_SPLITTER_H_ | #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_SHAPE_OPS_SPLITTER_H_ | ||||
| #include <memory> | #include <memory> | ||||
| #include <vector> | #include <vector> | ||||
| #include <utility> | |||||
| #include "ir/func_graph.h" | #include "ir/func_graph.h" | ||||
| #include "backend/optimizer/common/pass.h" | #include "backend/optimizer/common/pass.h" | ||||
| @@ -24,15 +25,15 @@ namespace mindspore { | |||||
| namespace opt { | namespace opt { | ||||
| class ShapeOpsSplitter : public Pass { | class ShapeOpsSplitter : public Pass { | ||||
| public: | public: | ||||
| explicit ShapeOpsSplitter(const std::vector<PrimitivePtr> &shape_ops) | |||||
| : Pass("shape_ops_splitter"), shape_ops_(shape_ops) {} | |||||
| explicit ShapeOpsSplitter(std::vector<PrimitivePtr> shape_ops) | |||||
| : Pass("shape_ops_splitter"), shape_ops_(std::move(shape_ops)) {} | |||||
| ~ShapeOpsSplitter() override = default; | ~ShapeOpsSplitter() override = default; | ||||
| bool Run(const FuncGraphPtr &func_graph); | bool Run(const FuncGraphPtr &func_graph); | ||||
| private: | private: | ||||
| bool Process(const FuncGraphPtr &func_graph); | bool Process(const FuncGraphPtr &func_graph); | ||||
| bool IsMultiUserShapeOps(const AnfNodePtr &node, const FuncGraphManagerPtr &mng); | bool IsMultiUserShapeOps(const AnfNodePtr &node, const FuncGraphManagerPtr &mng); | ||||
| const std::vector<PrimitivePtr> &shape_ops_; | |||||
| std::vector<PrimitivePtr> shape_ops_; | |||||
| }; | }; | ||||
| using ShapeOpsSplitterPtr = std::shared_ptr<ShapeOpsSplitter>; | using ShapeOpsSplitterPtr = std::shared_ptr<ShapeOpsSplitter>; | ||||
| } // namespace opt | } // namespace opt | ||||
| @@ -51,16 +51,7 @@ | |||||
| #include "debug/data_dump/dump_json_parser.h" | #include "debug/data_dump/dump_json_parser.h" | ||||
| #include "debug/tensor_load.h" | #include "debug/tensor_load.h" | ||||
| #include "debug/anf_ir_utils.h" | #include "debug/anf_ir_utils.h" | ||||
| #include "backend/optimizer/graph_kernel/reorder_ops.h" | |||||
| #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" | |||||
| #include "backend/optimizer/graph_kernel/eliminate_redundant_output.h" | |||||
| #include "backend/optimizer/graph_kernel/tensor_promotion.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_expander.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_cse.h" | |||||
| #include "backend/optimizer/graph_kernel/value_graph_binder.h" | |||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean.h" | |||||
| #include "backend/optimizer/pass/getitem_tuple.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" | |||||
| #include "backend/session/ascend_auto_monad.h" | #include "backend/session/ascend_auto_monad.h" | ||||
| #include "debug/data_dump/e2e_dump_util.h" | #include "debug/data_dump/e2e_dump_util.h" | ||||
| #include "debug/anf_ir_dump.h" | #include "debug/anf_ir_dump.h" | ||||
| @@ -843,22 +834,8 @@ void AscendSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kern | |||||
| if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) { | if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) { | ||||
| return; | return; | ||||
| } | } | ||||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||||
| auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm"); | |||||
| pm->AddPass(std::make_shared<opt::ReorderOps>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelExpander>()); | |||||
| pm->AddPass(std::make_shared<opt::BasicOpsFusion>()); | |||||
| pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelCSE>()); | |||||
| pm->AddPass(std::make_shared<opt::TensorPromotion>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelSplitter>()); | |||||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | |||||
| // will be exposed, use GetitemTuple Pass to delete them. | |||||
| pm->AddPass(std::make_shared<opt::GetitemTuple>()); | |||||
| pm->AddPass(std::make_shared<opt::BindValueToGraph>()); | |||||
| pm->AddPass(std::make_shared<opt::CleanAddAtomic>()); | |||||
| optimizer->AddPassManager(pm); | |||||
| (void)optimizer->Optimize(kernel_graph); | |||||
| opt::GraphKernelOptimize(kernel_graph); | |||||
| kernel_graph->SetExecOrderByDefault(); | |||||
| } | } | ||||
| void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const { | void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const { | ||||
| @@ -42,23 +42,7 @@ | |||||
| #include "backend/optimizer/gpu/relu_v2_pass.h" | #include "backend/optimizer/gpu/relu_v2_pass.h" | ||||
| #include "backend/optimizer/gpu/add_relu_v2_fusion.h" | #include "backend/optimizer/gpu/add_relu_v2_fusion.h" | ||||
| #include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h" | #include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h" | ||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" | |||||
| #include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h" | |||||
| #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" | |||||
| #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" | |||||
| #include "backend/optimizer/graph_kernel/clean_all_in_once.h" | |||||
| #include "backend/optimizer/graph_kernel/depend_formater.h" | |||||
| #include "backend/optimizer/graph_kernel/eliminate_redundant_output.h" | |||||
| #include "backend/optimizer/graph_kernel/tensor_promotion.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_expander.h" | |||||
| #include "backend/optimizer/graph_kernel/raise_reduction_precision.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_cse.h" | |||||
| #include "backend/optimizer/graph_kernel/shape_ops_splitter.h" | |||||
| #include "backend/optimizer/graph_kernel/value_graph_binder.h" | |||||
| #include "backend/optimizer/graph_kernel/parallel_fusion.h" | |||||
| #include "backend/optimizer/graph_kernel/optimize_assign.h" | |||||
| #include "backend/optimizer/graph_kernel/split_assign.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" | |||||
| #include "backend/optimizer/pass/communication_op_fusion.h" | #include "backend/optimizer/pass/communication_op_fusion.h" | ||||
| #include "backend/optimizer/pass/getitem_tuple.h" | #include "backend/optimizer/pass/getitem_tuple.h" | ||||
| #include "common/trans.h" | #include "common/trans.h" | ||||
| @@ -197,36 +181,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_ | |||||
| if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) { | if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) { | ||||
| return; | return; | ||||
| } | } | ||||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||||
| auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm"); | |||||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; | |||||
| pm->AddPass(std::make_shared<opt::SplitAssign>()); | |||||
| pm->AddPass(std::make_shared<opt::DependFormater>()); // Make more fusion opportunity. | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelExpander>()); | |||||
| pm->AddPass(std::make_shared<opt::BasicOpsFusion>()); | |||||
| pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>()); | |||||
| pm->AddPass(std::make_shared<opt::OptimizeAssign>()); | |||||
| pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>()); | |||||
| pm->AddPass(std::make_shared<opt::RaiseReductionPrecision>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelCSE>()); | |||||
| pm->AddPass(std::make_shared<opt::ArithmeticSimplify>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelCSE>()); | |||||
| pm->AddPass(std::make_shared<opt::TensorPromotion>()); | |||||
| pm->AddPass(std::make_shared<opt::ShapeOpsSplitter>(duplicated_ops)); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelSplitter>()); | |||||
| pm->AddPass(std::make_shared<opt::GraphKernelCSE>()); | |||||
| // The CSE may output a graph with repeated outputs. | |||||
| pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>()); | |||||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | |||||
| // will be exposed, use GetitemTuple Pass to delete them. | |||||
| pm->AddPass(std::make_shared<opt::GetitemTuple>()); | |||||
| pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>()); | |||||
| pm->AddPass(std::make_shared<opt::StitchAtomicCleanInsertter>()); | |||||
| pm->AddPass(std::make_shared<opt::DependFormater>()); // Prevent fake loop in parallel fusion. | |||||
| pm->AddPass(std::make_shared<opt::ParallelOpFusion>(kGPUDevice, opt::ParallelConfig(7))); | |||||
| pm->AddPass(std::make_shared<opt::BindValueToGraph>()); | |||||
| optimizer->AddPassManager(pm); | |||||
| (void)optimizer->Optimize(kernel_graph); | |||||
| opt::GraphKernelOptimize(kernel_graph); | |||||
| kernel_graph->SetExecOrderByDefault(); | kernel_graph->SetExecOrderByDefault(); | ||||
| } | } | ||||