Removed one CSE pass from the GPU passes; some common passes were enabled for Ascend. (tags/v1.2.0-rc1)
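This change consolidates the per-device graph-kernel pass lists into a single GraphKernelOptimizer (the new graph_kernel_optimization.cc/.h below); both AscendSession and GPUSession now simply call opt::GraphKernelOptimize(kernel_graph). The standalone sketch below illustrates the staged, device-gated pipeline pattern the new optimizer uses. The Pass, PassManager, and GraphKernelOptimizer types here are simplified stand-ins for illustration, not MindSpore's real classes, and only two of the seven stages are shown.

// Toy sketch of a staged, device-gated pass pipeline (hypothetical types,
// not MindSpore's API). Each stage is a PassManager; device-specific passes
// are gated on is_gpu / is_ascend, mirroring the structure in the diff below.
#include <functional>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Pass = std::function<void()>;

struct PassManager {
  std::string stage;
  std::vector<std::pair<std::string, Pass>> passes;

  void AddPass(const std::string &name, Pass pass) { passes.emplace_back(name, std::move(pass)); }

  void Run() const {
    std::cout << "stage: " << stage << std::endl;
    for (const auto &entry : passes) {
      std::cout << "  pass: " << entry.first << std::endl;
      entry.second();  // run the (empty) pass body
    }
  }
};

class GraphKernelOptimizer {
 public:
  explicit GraphKernelOptimizer(bool is_gpu) : is_gpu_(is_gpu), is_ascend_(!is_gpu) {}

  // Runs the stages in order; the real optimizer has seven stages.
  void Run() const {
    for (const auto &pm : {Cluster(), HighLevelOpt2()}) {
      pm.Run();
    }
  }

 private:
  // A stage whose passes are common to both devices.
  PassManager Cluster() const {
    PassManager pm{"graphkernel_stage2_cluster", {}};
    pm.AddPass("GraphKernelExpander", [] {});
    pm.AddPass("BasicOpsFusion", [] {});
    pm.AddPass("EliminateRedundantOutput", [] {});
    return pm;
  }

  // A stage whose passes differ per device.
  PassManager HighLevelOpt2() const {
    PassManager pm{"graphkernel_stage5_highlevelopt2", {}};
    if (is_gpu_) {
      pm.AddPass("AtomicCleanInsertter", [] {});
      pm.AddPass("StitchAtomicCleanInsertter", [] {});
    } else if (is_ascend_) {
      pm.AddPass("CleanAddAtomic", [] {});
    }
    return pm;
  }

  bool is_gpu_{false};
  bool is_ascend_{false};
};

int main() {
  GraphKernelOptimizer(/*is_gpu=*/true).Run();   // GPU pipeline
  GraphKernelOptimizer(/*is_gpu=*/false).Run();  // Ascend pipeline
  return 0;
}

Keeping the device checks inside the shared stages is what lets passes that were previously only in the GPU pipeline, such as OptimizeAssign and RaiseReductionPrecision, run on Ascend as well without duplicating the whole pipeline in each session.
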
@@ -0,0 +1,171 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"

#include <vector>
#include <string>
#include <memory>

#include "ir/func_graph.h"
#include "utils/ms_context.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
#include "backend/optimizer/graph_kernel/depend_formater.h"
#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
#include "backend/optimizer/graph_kernel/tensor_promotion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
#include "backend/optimizer/graph_kernel/value_graph_binder.h"
#include "backend/optimizer/graph_kernel/parallel_fusion.h"
#include "backend/optimizer/graph_kernel/optimize_assign.h"
#include "backend/optimizer/graph_kernel/split_assign.h"
#include "backend/optimizer/graph_kernel/reorder_ops.h"
#include "backend/optimizer/pass/getitem_tuple.h"

namespace mindspore {
namespace opt {
PassManagerPtr GraphKernelOptimizer::PreProcess() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage1_preprocess");
  // Change Assign(p, a, U) to Assign(Depend(p, U), a)
  pm->AddPass(std::make_shared<SplitAssign>());
  // Move the Depend nodes to the bottom of the graph
  pm->AddPass(std::make_shared<DependFormater>());
  // Reorder TransData-Cast to Cast-TransData
  if (is_ascend) {
    pm->AddPass(std::make_shared<ReorderOps>());
  }
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Cluster() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage2_cluster");
  // Expand complex basic kernels into composite kernels
  pm->AddPass(std::make_shared<GraphKernelExpander>());
  // Fuse basic kernels and composite kernels
  pm->AddPass(std::make_shared<BasicOpsFusion>());
  // Eliminate outputs that have no external user
  pm->AddPass(std::make_shared<EliminateRedundantOutput>());
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage3_highlevelopt1");
  // Replace Assign with InplaceAssign, and replace the original output with the overridden parameters
  pm->AddPass(std::make_shared<OptimizeAssign>());
  pm->AddPass(std::make_shared<EliminateRedundantOutput>());
  // Cast the input of ReduceSum from float16 to float32 for higher precision
  pm->AddPass(std::make_shared<RaiseReductionPrecision>());
  // Universal arithmetic simplification
  if (is_gpu) {
    pm->AddPass(std::make_shared<ArithmeticSimplify>());
  }
  // Common subexpression elimination
  pm->AddPass(std::make_shared<GraphKernelCSE>());
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Split() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage4_split");
  // Move non-scalar tensors (in composite nodes) to the parameter list
  pm->AddPass(std::make_shared<TensorPromotion>());
  // Duplicate certain nodes so that each copy is used by only one user,
  // which avoids unnecessary input-output pairs and gives better performance.
  if (is_gpu) {
    std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
    pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
  }
  // Split kernels according to the cost model
  pm->AddPass(std::make_shared<GraphKernelSplitter>());
  // Eliminate redundant nodes that were copied above but not handled by GraphKernelSplitter
  if (is_gpu) {
    pm->AddPass(std::make_shared<GraphKernelCSE>());
    pm->AddPass(std::make_shared<EliminateRedundantOutput>());
  }
  // After Simplify and Splitter, a lot of redundant getitem/maketuple nodes
  // are exposed; use the GetitemTuple pass to delete them.
  pm->AddPass(std::make_shared<GetitemTuple>());
  return pm;
}

PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage5_highlevelopt2");
  // Enable atomic add
  if (is_gpu) {
    pm->AddPass(std::make_shared<AtomicCleanInsertter>());
    pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>());
  } else /* if (is_ascend) */ {
    pm->AddPass(std::make_shared<CleanAddAtomic>());
  }
  return pm;
}

PassManagerPtr GraphKernelOptimizer::Combine() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage6_combine");
  // Enable parallel fusion
  if (is_gpu) {
    // Prevent fake loops in parallel fusion
    pm->AddPass(std::make_shared<DependFormater>());
    // Do parallel fusion for the GPU device
    pm->AddPass(std::make_shared<ParallelOpFusion>(kGPUDevice, ParallelConfig(7)));
  }
  return pm;
}

PassManagerPtr GraphKernelOptimizer::PostProcess() {
  auto pm = std::make_shared<PassManager>("graphkernel_stage7_postprocess");
  // Add the new tensors to the kernel_graph
  pm->AddPass(std::make_shared<BindValueToGraph>());
  return pm;
}

void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
  is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);

  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
  optimizer->AddPassManager(PreProcess());
  optimizer->AddPassManager(Cluster());
  optimizer->AddPassManager(HighLevelOpt1());
  optimizer->AddPassManager(Split());
  optimizer->AddPassManager(HighLevelOpt2());
  optimizer->AddPassManager(Combine());
  optimizer->AddPassManager(PostProcess());
  (void)optimizer->Optimize(kernel_graph);
}

void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); }
}  // namespace opt
}  // namespace mindspore

@@ -0,0 +1,54 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_

#include "ir/anf.h"
#include "ir/func_graph.h"
#include "backend/session/kernel_graph.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"

namespace mindspore {
namespace opt {
class GraphKernelOptimizer {
 public:
  void Run(const KernelGraphPtr &kernel_graph);

 private:
  // Pre-process
  PassManagerPtr PreProcess();
  // Cluster kernels
  PassManagerPtr Cluster();
  // High-level optimization 1
  PassManagerPtr HighLevelOpt1();
  // Split kernels
  PassManagerPtr Split();
  // High-level optimization 2
  PassManagerPtr HighLevelOpt2();
  // Combine kernels
  PassManagerPtr Combine();
  // Post-process
  PassManagerPtr PostProcess();

  bool is_gpu{false};
  bool is_ascend{false};
};

void GraphKernelOptimize(const KernelGraphPtr &kernel_graph);
}  // namespace opt
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_

@@ -17,6 +17,7 @@
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_SHAPE_OPS_SPLITTER_H_
 #include <memory>
 #include <vector>
+#include <utility>
 #include "ir/func_graph.h"
 #include "backend/optimizer/common/pass.h"

@@ -24,15 +25,15 @@ namespace mindspore {
 namespace opt {
 class ShapeOpsSplitter : public Pass {
  public:
-  explicit ShapeOpsSplitter(const std::vector<PrimitivePtr> &shape_ops)
-      : Pass("shape_ops_splitter"), shape_ops_(shape_ops) {}
+  explicit ShapeOpsSplitter(std::vector<PrimitivePtr> shape_ops)
+      : Pass("shape_ops_splitter"), shape_ops_(std::move(shape_ops)) {}
   ~ShapeOpsSplitter() override = default;
   bool Run(const FuncGraphPtr &func_graph);

  private:
   bool Process(const FuncGraphPtr &func_graph);
   bool IsMultiUserShapeOps(const AnfNodePtr &node, const FuncGraphManagerPtr &mng);
-  const std::vector<PrimitivePtr> &shape_ops_;
+  std::vector<PrimitivePtr> shape_ops_;
 };
 using ShapeOpsSplitterPtr = std::shared_ptr<ShapeOpsSplitter>;
 }  // namespace opt

@@ -51,16 +51,7 @@
 #include "debug/data_dump/dump_json_parser.h"
 #include "debug/tensor_load.h"
 #include "debug/anf_ir_utils.h"
-#include "backend/optimizer/graph_kernel/reorder_ops.h"
-#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
-#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
-#include "backend/optimizer/graph_kernel/tensor_promotion.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
-#include "backend/optimizer/graph_kernel/value_graph_binder.h"
-#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
-#include "backend/optimizer/pass/getitem_tuple.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/session/ascend_auto_monad.h"
 #include "debug/data_dump/e2e_dump_util.h"
 #include "debug/anf_ir_dump.h"

@@ -843,22 +834,8 @@ void AscendSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kern
   if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) {
     return;
   }
-  auto optimizer = std::make_shared<opt::GraphOptimizer>();
-  auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
-  pm->AddPass(std::make_shared<opt::ReorderOps>());
-  pm->AddPass(std::make_shared<opt::GraphKernelExpander>());
-  pm->AddPass(std::make_shared<opt::BasicOpsFusion>());
-  pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
-  pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
-  pm->AddPass(std::make_shared<opt::TensorPromotion>());
-  pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());
-  // After Simplify and Splitter, a lot of redundant getitem/maketuple
-  // will be exposed, use GetitemTuple Pass to delete them.
-  pm->AddPass(std::make_shared<opt::GetitemTuple>());
-  pm->AddPass(std::make_shared<opt::BindValueToGraph>());
-  pm->AddPass(std::make_shared<opt::CleanAddAtomic>());
-  optimizer->AddPassManager(pm);
-  (void)optimizer->Optimize(kernel_graph);
+  opt::GraphKernelOptimize(kernel_graph);
   kernel_graph->SetExecOrderByDefault();
 }

 void AscendSession::AdjustKernel(const std::shared_ptr<KernelGraph> &kernel_graph) const {

@@ -42,23 +42,7 @@
 #include "backend/optimizer/gpu/relu_v2_pass.h"
 #include "backend/optimizer/gpu/add_relu_v2_fusion.h"
 #include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h"
-#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
-#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
-#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
-#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
-#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
-#include "backend/optimizer/graph_kernel/depend_formater.h"
-#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
-#include "backend/optimizer/graph_kernel/tensor_promotion.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
-#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
-#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
-#include "backend/optimizer/graph_kernel/value_graph_binder.h"
-#include "backend/optimizer/graph_kernel/parallel_fusion.h"
-#include "backend/optimizer/graph_kernel/optimize_assign.h"
-#include "backend/optimizer/graph_kernel/split_assign.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/optimizer/pass/communication_op_fusion.h"
 #include "backend/optimizer/pass/getitem_tuple.h"
 #include "common/trans.h"

@@ -197,36 +181,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_
   if (!(context_ptr->get_param<bool>(MS_CTX_ENABLE_GRAPH_KERNEL))) {
     return;
   }
-  auto optimizer = std::make_shared<opt::GraphOptimizer>();
-  auto pm = std::make_shared<opt::PassManager>("graph_kernel_pm");
-  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
-  pm->AddPass(std::make_shared<opt::SplitAssign>());
-  pm->AddPass(std::make_shared<opt::DependFormater>());  // Make more fusion opportunity.
-  pm->AddPass(std::make_shared<opt::GraphKernelExpander>());
-  pm->AddPass(std::make_shared<opt::BasicOpsFusion>());
-  pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
-  pm->AddPass(std::make_shared<opt::OptimizeAssign>());
-  pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
-  pm->AddPass(std::make_shared<opt::RaiseReductionPrecision>());
-  pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
-  pm->AddPass(std::make_shared<opt::ArithmeticSimplify>());
-  pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
-  pm->AddPass(std::make_shared<opt::TensorPromotion>());
-  pm->AddPass(std::make_shared<opt::ShapeOpsSplitter>(duplicated_ops));
-  pm->AddPass(std::make_shared<opt::GraphKernelSplitter>());
-  pm->AddPass(std::make_shared<opt::GraphKernelCSE>());
-  // The CSE may output a graph with repeated outputs.
-  pm->AddPass(std::make_shared<opt::EliminateRedundantOutput>());
-  // After Simplify and Splitter, a lot of redundant getitem/maketuple
-  // will be exposed, use GetitemTuple Pass to delete them.
-  pm->AddPass(std::make_shared<opt::GetitemTuple>());
-  pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>());
-  pm->AddPass(std::make_shared<opt::StitchAtomicCleanInsertter>());
-  pm->AddPass(std::make_shared<opt::DependFormater>());  // Prevent fake loop in parallel fusion.
-  pm->AddPass(std::make_shared<opt::ParallelOpFusion>(kGPUDevice, opt::ParallelConfig(7)));
-  pm->AddPass(std::make_shared<opt::BindValueToGraph>());
-  optimizer->AddPassManager(pm);
-  (void)optimizer->Optimize(kernel_graph);
+  opt::GraphKernelOptimize(kernel_graph);
   kernel_graph->SetExecOrderByDefault();
 }