diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc
new file mode 100644
index 0000000000..a5f299baea
--- /dev/null
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.cc
@@ -0,0 +1,171 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "ir/func_graph.h"
+#include "utils/ms_context.h"
+#include "backend/optimizer/graph_kernel/add_atomic_clean.h"
+#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
+#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
+#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
+#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
+#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
+#include "backend/optimizer/graph_kernel/depend_formater.h"
+#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
+#include "backend/optimizer/graph_kernel/tensor_promotion.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
+#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
+#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
+#include "backend/optimizer/graph_kernel/value_graph_binder.h"
+#include "backend/optimizer/graph_kernel/parallel_fusion.h"
+#include "backend/optimizer/graph_kernel/optimize_assign.h"
+#include "backend/optimizer/graph_kernel/split_assign.h"
+#include "backend/optimizer/graph_kernel/reorder_ops.h"
+#include "backend/optimizer/pass/getitem_tuple.h"
+
+namespace mindspore {
+namespace opt {
+PassManagerPtr GraphKernelOptimizer::PreProcess() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage1_preprocess");
+  // Change Assign(p, a, U) to Assign(Depend(p, U), a)
+  pm->AddPass(std::make_shared<SplitAssign>());
+
+  // Move the Depend nodes to the bottom of graph
+  pm->AddPass(std::make_shared<DependFormater>());
+
+  // Reorder TransData-Cast to Cast-TransData
+  if (is_ascend) {
+    pm->AddPass(std::make_shared<ReorderOps>());
+  }
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::Cluster() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage2_cluster");
+  // Expand complex basic kernels to composite kernels
+  pm->AddPass(std::make_shared<GraphKernelExpander>());
+
+  // Fuse basic kernels and composite kernels
+  pm->AddPass(std::make_shared<BasicOpsFusion>());
+
+  // Eliminate the outputs without external user
+  pm->AddPass(std::make_shared<EliminateRedundantOutput>());
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage3_highlevelopt1");
+  // Replace Assign with InplaceAssign, and replace original output with overridden parameters
+  pm->AddPass(std::make_shared<OptimizeAssign>());
+  pm->AddPass(std::make_shared<EliminateRedundantOutput>());
+
+  // Cast the input of ReduceSum from float16 to float32 for higher precision
+  pm->AddPass(std::make_shared<RaiseReductionPrecision>());
+
+  // Universal arithmetic simplify
+  if (is_gpu) {
+    pm->AddPass(std::make_shared<ArithmeticSimplify>());
+  }
+
+  // Common subexpression elimination
+  pm->AddPass(std::make_shared<GraphKernelCSE>());
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::Split() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage4_split");
+  // Move the non-scalar tensor (in composite node) to parameter list
+  pm->AddPass(std::make_shared<TensorPromotion>());
+
+  // Make certain nodes redundant so that they are used by only one user,
+  // which can avoid unnecessary input-output and get better performance.
+  if (is_gpu) {
+    std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
+    pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops));
+  }
+
+  // Split kernel according to costmodel
+  pm->AddPass(std::make_shared<GraphKernelSplitter>());
+
+  // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
+  if (is_gpu) {
+    pm->AddPass(std::make_shared<GraphKernelCSE>());
+    pm->AddPass(std::make_shared<EliminateRedundantOutput>());
+  }
+
+  // After Simplify and Splitter, a lot of redundant getitem/maketuple
+  // will be exposed, use GetitemTuple Pass to delete them.
+  pm->AddPass(std::make_shared<GetitemTuple>());
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage5_highlevelopt2");
+  // Enable atomic add
+  if (is_gpu) {
+    pm->AddPass(std::make_shared<AtomicCleanInsertter>());
+    pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>());
+  } else /* if (is_ascend) */ {
+    pm->AddPass(std::make_shared());
+  }
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::Combine() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage6_combine");
+  // Enable parallel fusion
+  if (is_gpu) {
+    // Prevent fake loop in parallel fusion
+    pm->AddPass(std::make_shared<DependFormater>());
+    // Do parallel fusion for gpu device
+    pm->AddPass(std::make_shared<ParallelOpFusion>(kGPUDevice, ParallelConfig(7)));
+  }
+  return pm;
+}
+
+PassManagerPtr GraphKernelOptimizer::PostProcess() {
+  auto pm = std::make_shared<PassManager>("graphkernel_stage7_postprocess");
+  // Add the new tensors to the kernel_graph
+  pm->AddPass(std::make_shared<BindValueToGraph>());
+  return pm;
+}
+
+void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
+  is_ascend = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice);
+
+  auto optimizer = std::make_shared<GraphOptimizer>("graph_kernel_optimizer");
+  optimizer->AddPassManager(PreProcess());
+  optimizer->AddPassManager(Cluster());
+  optimizer->AddPassManager(HighLevelOpt1());
+  optimizer->AddPassManager(Split());
+  optimizer->AddPassManager(HighLevelOpt2());
+  optimizer->AddPassManager(Combine());
+  optimizer->AddPassManager(PostProcess());
+  (void)optimizer->Optimize(kernel_graph);
+}
+
+void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); }
+}  // namespace opt
+}  // namespace mindspore
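Editor's note: the new graph_kernel_optimization.cc above groups the graph-kernel passes into seven named PassManager stages and hands them to a single GraphOptimizer. The sketch below is only an illustration of that staging pattern with hypothetical stand-in types (Graph, Pass, PassManager, GraphOptimizer, and NoopPass are invented here, not MindSpore's real classes); it shows why building each stage as a separate, named pass list keeps the overall ordering explicit and lets device-specific passes be gated inside one stage.

```cpp
#include <iostream>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct Graph {};  // stand-in for a kernel graph

// One rewrite step over the graph; returns true if it changed anything.
class Pass {
 public:
  explicit Pass(std::string name) : name_(std::move(name)) {}
  virtual ~Pass() = default;
  virtual bool Run(Graph *graph) = 0;
  const std::string &name() const { return name_; }

 private:
  std::string name_;
};

// A named stage that runs its passes in insertion order.
class PassManager {
 public:
  explicit PassManager(std::string stage) : stage_(std::move(stage)) {}
  void AddPass(std::shared_ptr<Pass> pass) { passes_.push_back(std::move(pass)); }
  bool Run(Graph *graph) const {
    bool changed = false;
    for (const auto &pass : passes_) {
      std::cout << stage_ << ": running " << pass->name() << "\n";
      changed = pass->Run(graph) || changed;
    }
    return changed;
  }

 private:
  std::string stage_;
  std::vector<std::shared_ptr<Pass>> passes_;
};

// Runs the stages in the order they were added.
class GraphOptimizer {
 public:
  void AddPassManager(PassManager stage) { stages_.push_back(std::move(stage)); }
  void Optimize(Graph *graph) const {
    for (const auto &stage : stages_) {
      stage.Run(graph);
    }
  }

 private:
  std::vector<PassManager> stages_;
};

// Trivial pass used only to show the wiring.
class NoopPass : public Pass {
 public:
  using Pass::Pass;
  bool Run(Graph *) override { return false; }
};

int main() {
  GraphOptimizer optimizer;

  PassManager preprocess("stage1_preprocess");
  preprocess.AddPass(std::make_shared<NoopPass>("split_assign"));

  PassManager cluster("stage2_cluster");
  cluster.AddPass(std::make_shared<NoopPass>("basic_ops_fusion"));

  optimizer.AddPassManager(std::move(preprocess));
  optimizer.AddPassManager(std::move(cluster));

  Graph graph;
  optimizer.Optimize(&graph);
  return 0;
}
```

Because each stage owns its pass list, a backend difference can be handled by adding or skipping passes inside a single stage (as the is_gpu/is_ascend checks above do) without disturbing the ordering of the other stages.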
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h
new file mode 100644
index 0000000000..0014666053
--- /dev/null
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_optimization.h
@@ -0,0 +1,54 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
+
+#include "ir/anf.h"
+#include "ir/func_graph.h"
+#include "backend/session/kernel_graph.h"
+#include "backend/optimizer/common/optimizer.h"
+#include "backend/optimizer/common/pass_manager.h"
+
+namespace mindspore {
+namespace opt {
+class GraphKernelOptimizer {
+ public:
+  void Run(const KernelGraphPtr &kernel_graph);
+
+ private:
+  // Pre-process
+  PassManagerPtr PreProcess();
+  // Cluster kernels
+  PassManagerPtr Cluster();
+  // High level optimize 1
+  PassManagerPtr HighLevelOpt1();
+  // Split kernels
+  PassManagerPtr Split();
+  // High level optimize 2
+  PassManagerPtr HighLevelOpt2();
+  // Combine kernels
+  PassManagerPtr Combine();
+  // Post-process
+  PassManagerPtr PostProcess();
+
+  bool is_gpu{false};
+  bool is_ascend{false};
+};
+
+void GraphKernelOptimize(const KernelGraphPtr &kernel_graph);
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_OPTIMIZATION_H_
diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.h
index 36a030dfdc..4522e80dec 100644
--- a/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.h
+++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/shape_ops_splitter.h
@@ -17,6 +17,7 @@
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_SHAPE_OPS_SPLITTER_H_
 #include <memory>
 #include <vector>
+#include <utility>
 
 #include "ir/func_graph.h"
 #include "backend/optimizer/common/pass.h"
@@ -24,15 +25,15 @@ namespace mindspore {
 namespace opt {
 class ShapeOpsSplitter : public Pass {
  public:
-  explicit ShapeOpsSplitter(const std::vector<PrimitivePtr> &shape_ops)
-      : Pass("shape_ops_splitter"), shape_ops_(shape_ops) {}
+  explicit ShapeOpsSplitter(std::vector<PrimitivePtr> shape_ops)
+      : Pass("shape_ops_splitter"), shape_ops_(std::move(shape_ops)) {}
   ~ShapeOpsSplitter() override = default;
   bool Run(const FuncGraphPtr &func_graph);
 
  private:
   bool Process(const FuncGraphPtr &func_graph);
   bool IsMultiUserShapeOps(const AnfNodePtr &node, const FuncGraphManagerPtr &mng);
-  const std::vector<PrimitivePtr> &shape_ops_;
+  std::vector<PrimitivePtr> shape_ops_;
 };
 using ShapeOpsSplitterPtr = std::shared_ptr<ShapeOpsSplitter>;
 }  // namespace opt
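Editor's note: the ShapeOpsSplitter change above swaps a stored `const std::vector<PrimitivePtr> &` member for an owned vector taken by value and moved into place. This matters now that the pass is constructed inside GraphKernelOptimizer::Split(), where `duplicated_ops` is a local that has already been destroyed by the time the optimizer actually runs the pass: a reference member would dangle, while the owned copy stays valid. A minimal sketch of the difference, using hypothetical RefHolder/ValueHolder classes rather than the real pass:

```cpp
#include <string>
#include <utility>
#include <vector>

// Risky: binds a reference member to whatever the caller passed. If the
// caller's vector (or a temporary) is destroyed first, ops_ dangles.
class RefHolder {
 public:
  explicit RefHolder(const std::vector<std::string> &ops) : ops_(ops) {}
  const std::vector<std::string> &ops_;
};

// Safer: take by value and move into an owned member. Callers with an
// lvalue pay one copy; callers with a temporary pay only a move.
class ValueHolder {
 public:
  explicit ValueHolder(std::vector<std::string> ops) : ops_(std::move(ops)) {}
  std::vector<std::string> ops_;
};

int main() {
  // The braced list here is a temporary: a RefHolder would be left holding a
  // dangling reference, while ValueHolder moves the data in and owns it.
  ValueHolder holder{{"Reshape", "ExpandDims", "Cast"}};
  return holder.ops_.size() == 3 ? 0 : 1;
}
```

Taking the parameter by value is the usual idiom when the callee needs its own copy anyway, which is exactly the situation once the pass outlives the argument it was built from.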
"backend/optimizer/graph_kernel/graph_kernel_cse.h" -#include "backend/optimizer/graph_kernel/value_graph_binder.h" -#include "backend/optimizer/graph_kernel/add_atomic_clean.h" -#include "backend/optimizer/pass/getitem_tuple.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/session/ascend_auto_monad.h" #include "debug/data_dump/e2e_dump_util.h" #include "debug/anf_ir_dump.h" @@ -843,22 +834,8 @@ void AscendSession::GraphKernelOptimize(const std::shared_ptr &kern if (!(context_ptr->get_param(MS_CTX_ENABLE_GRAPH_KERNEL))) { return; } - auto optimizer = std::make_shared(); - auto pm = std::make_shared("graph_kernel_pm"); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - // After Simplify and Splitter, a lot of redundant getitem/maketuple - // will be exposed, use GetitemTuple Pass to delete them. - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); - optimizer->AddPassManager(pm); - (void)optimizer->Optimize(kernel_graph); + opt::GraphKernelOptimize(kernel_graph); + kernel_graph->SetExecOrderByDefault(); } void AscendSession::AdjustKernel(const std::shared_ptr &kernel_graph) const { diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 0b4d00e586..e3f6f48997 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -42,23 +42,7 @@ #include "backend/optimizer/gpu/relu_v2_pass.h" #include "backend/optimizer/gpu/add_relu_v2_fusion.h" #include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h" -#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" -#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h" -#include "backend/optimizer/graph_kernel/arithmetic_simplify.h" -#include "backend/optimizer/graph_kernel/basic_ops_fusion.h" -#include "backend/optimizer/graph_kernel/clean_all_in_once.h" -#include "backend/optimizer/graph_kernel/depend_formater.h" -#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h" -#include "backend/optimizer/graph_kernel/tensor_promotion.h" -#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h" -#include "backend/optimizer/graph_kernel/graph_kernel_expander.h" -#include "backend/optimizer/graph_kernel/raise_reduction_precision.h" -#include "backend/optimizer/graph_kernel/graph_kernel_cse.h" -#include "backend/optimizer/graph_kernel/shape_ops_splitter.h" -#include "backend/optimizer/graph_kernel/value_graph_binder.h" -#include "backend/optimizer/graph_kernel/parallel_fusion.h" -#include "backend/optimizer/graph_kernel/optimize_assign.h" -#include "backend/optimizer/graph_kernel/split_assign.h" +#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h" #include "backend/optimizer/pass/communication_op_fusion.h" #include "backend/optimizer/pass/getitem_tuple.h" #include "common/trans.h" @@ -197,36 +181,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_ if (!(context_ptr->get_param(MS_CTX_ENABLE_GRAPH_KERNEL))) { return; } - auto optimizer = std::make_shared(); - auto pm = std::make_shared("graph_kernel_pm"); - std::vector duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast}; - pm->AddPass(std::make_shared()); - pm->AddPass(std::make_shared()); // Make more 
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 0b4d00e586..e3f6f48997 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -42,23 +42,7 @@
 #include "backend/optimizer/gpu/relu_v2_pass.h"
 #include "backend/optimizer/gpu/add_relu_v2_fusion.h"
 #include "backend/optimizer/gpu/add_relu_grad_v2_fusion.h"
-#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
-#include "backend/optimizer/graph_kernel/add_stitch_atomic_clean_gpu.h"
-#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
-#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
-#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
-#include "backend/optimizer/graph_kernel/depend_formater.h"
-#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
-#include "backend/optimizer/graph_kernel/tensor_promotion.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
-#include "backend/optimizer/graph_kernel/raise_reduction_precision.h"
-#include "backend/optimizer/graph_kernel/graph_kernel_cse.h"
-#include "backend/optimizer/graph_kernel/shape_ops_splitter.h"
-#include "backend/optimizer/graph_kernel/value_graph_binder.h"
-#include "backend/optimizer/graph_kernel/parallel_fusion.h"
-#include "backend/optimizer/graph_kernel/optimize_assign.h"
-#include "backend/optimizer/graph_kernel/split_assign.h"
+#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
 #include "backend/optimizer/pass/communication_op_fusion.h"
 #include "backend/optimizer/pass/getitem_tuple.h"
 #include "common/trans.h"
@@ -197,36 +181,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_
   if (!(context_ptr->get_param(MS_CTX_ENABLE_GRAPH_KERNEL))) {
     return;
   }
-  auto optimizer = std::make_shared();
-  auto pm = std::make_shared("graph_kernel_pm");
-  std::vector duplicated_ops = {prim::kPrimReshape, prim::kPrimExpandDims, prim::kPrimCast};
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());  // Make more fusion opportunity.
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared(duplicated_ops));
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  // The CSE may output a graph with repeated outputs.
-  pm->AddPass(std::make_shared());
-  // After Simplify and Splitter, a lot of redundant getitem/maketuple
-  // will be exposed, use GetitemTuple Pass to delete them.
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());
-  pm->AddPass(std::make_shared());  // Prevent fake loop in parallel fusion.
-  pm->AddPass(std::make_shared(kGPUDevice, opt::ParallelConfig(7)));
-  pm->AddPass(std::make_shared());
-  optimizer->AddPassManager(pm);
-  (void)optimizer->Optimize(kernel_graph);
+  opt::GraphKernelOptimize(kernel_graph);
   kernel_graph->SetExecOrderByDefault();
 }
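Editor's note: with the pass list moved into opt::GraphKernelOptimize, both session hunks above collapse to the same shape: check the graph-kernel flag, delegate to the shared pipeline, then refresh the execution order because the graph may have been rewritten. The sketch below restates that call shape with stand-in types (the KernelGraph struct, the opt::GraphKernelOptimize stub, and the bool flag parameter are placeholders for the real MsContext-based check, not MindSpore code):

```cpp
#include <iostream>
#include <memory>

// Stand-ins for the real MindSpore types; only the call shape matters here.
struct KernelGraph {
  void SetExecOrderByDefault() { std::cout << "execution order recomputed\n"; }
};
using KernelGraphPtr = std::shared_ptr<KernelGraph>;

namespace opt {
void GraphKernelOptimize(const KernelGraphPtr &) { std::cout << "7-stage pipeline ran\n"; }
}  // namespace opt

// What both AscendSession::GraphKernelOptimize and GPUSession::GraphKernelOptimize
// reduce to after this patch: guard on the feature flag, delegate to the shared
// pipeline, then recompute the execution order since the graph may have changed.
void GraphKernelOptimize(const KernelGraphPtr &kernel_graph, bool graph_kernel_enabled) {
  if (!graph_kernel_enabled) {
    return;
  }
  opt::GraphKernelOptimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
}

int main() {
  auto graph = std::make_shared<KernelGraph>();
  GraphKernelOptimize(graph, true);
  return 0;
}
```

Keeping the SetExecOrderByDefault() call at the call site, as both hunks above do, leaves each session in charge of its own scheduling while the optimizer stays device-agnostic apart from the is_gpu/is_ascend gating inside its stages.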