| @@ -23,6 +23,7 @@ | |||
| namespace mindspore::graphkernel { | |||
| class CallbackImpl : public Callback { | |||
| public: | |||
| virtual ~CallbackImpl() = default; | |||
| ShapeVector GetInputShape(const AnfNodePtr &node, size_t i) override; | |||
| ShapeVector GetOutputShape(const AnfNodePtr &node, size_t i) override; | |||
| ShapeVector GetInputInferShape(const AnfNodePtr &node, size_t i) override; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2021-2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -19,7 +19,7 @@ | |||
| namespace mindspore::graphkernel { | |||
| namespace { | |||
| class AbstractShapeCreator { | |||
| class FakeAbstractShape { | |||
| public: | |||
| using AbstractShapeTransferFunc = std::function<ShapeVector(const ShapeVector &)>; | |||
| /** | |||
| @@ -49,10 +49,14 @@ class AbstractShapeCreator { | |||
// Returns the given device shape unchanged.
| static ShapeVector NchwAbstractShape(const ShapeVector &device_shape) { return device_shape; } | |||
// Reorders a 4-element device shape into {shape[0], shape[3], shape[1], shape[2]},
// i.e. (n, c, h, w) in terms of the index_* constants below.
// Reports through MS_LOG(EXCEPTION) when the shape does not have exactly 4 elements.
// NOTE(review): two return rows are shown (literal indices, then named indices); this looks like
// the removed line followed by its replacement. Both select the same elements in the same order.
| static ShapeVector NhwcAbstractShape(const ShapeVector &device_shape) { | |||
| const size_t nhwc_size = 4; | |||
| const size_t index_n = 0; | |||
| const size_t index_h = 1; | |||
| const size_t index_w = 2; | |||
| const size_t index_c = 3; | |||
| if (device_shape.size() != nhwc_size) { | |||
| MS_LOG(EXCEPTION) << "Shape size of NHWC should be 4, but got " << device_shape.size(); | |||
| } | |||
| return {device_shape[0], device_shape[3], device_shape[1], device_shape[2]}; | |||
| return {device_shape[index_n], device_shape[index_c], device_shape[index_h], device_shape[index_w]}; | |||
| } | |||
| static ShapeVector FractalNzAbstractShape(const ShapeVector &device_shape) { | |||
| if (device_shape.size() == 1 && (device_shape[0] == 1 || static_cast<size_t>(device_shape[0]) % kCubeSize == 0)) { | |||
| @@ -64,12 +68,16 @@ class AbstractShapeCreator { | |||
| } | |||
| ShapeVector shape; | |||
| size_t dims = device_shape.size(); | |||
| size_t batch = dims - 4; | |||
| size_t batch = dims - nz_size; | |||
| for (size_t i = 0; i < batch; ++i) { | |||
| shape.push_back(device_shape[i]); | |||
| } | |||
| int64_t m = device_shape[dims - 3] * device_shape[dims - 2]; | |||
| int64_t n = device_shape[dims - 4] * device_shape[dims - 1]; | |||
| const size_t index_m1 = 3; | |||
| const size_t index_m2 = 2; | |||
| const size_t index_n1 = 4; | |||
| const size_t index_n2 = 1; | |||
| int64_t m = device_shape[dims - index_m1] * device_shape[dims - index_m2]; | |||
| int64_t n = device_shape[dims - index_n1] * device_shape[dims - index_n2]; | |||
| shape.push_back(m); | |||
| shape.push_back(n); | |||
| @@ -79,6 +87,6 @@ class AbstractShapeCreator { | |||
| } // namespace | |||
// Namespace-level entry point: forwards device_shape and format to the static
// GetFakeAbstractShape of the helper class declared in the anonymous namespace above.
// NOTE(review): two return rows are shown, one per helper class name
// (AbstractShapeCreator, then FakeAbstractShape); this appears to be the call before and after
// the class rename.
| ShapeVector GetFakeAbstractShape(const ShapeVector &device_shape, const std::string &format) { | |||
| return AbstractShapeCreator::GetFakeAbstractShape(device_shape, format); | |||
| return FakeAbstractShape::GetFakeAbstractShape(device_shape, format); | |||
| } | |||
| } // namespace mindspore::graphkernel | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2021-2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -24,12 +24,18 @@ | |||
| namespace mindspore::graphkernel { | |||
// Expander derived from DefaultExpander.
// Declares a virtual destructor, a virtual ExpandJsonInfo hook that derived classes may replace,
// and an override of CreateExpandFuncGraph.
// NOTE(review): the name suggests a Python-backed implementation; that is not visible here - confirm
// against the .cc file.
| class PyExpander : public DefaultExpander { | |||
| public: | |||
| virtual ~PyExpander() = default; | |||
| protected: | |||
| virtual bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json); | |||
| FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node) override; | |||
| }; | |||
// Expander derived from PyExpander that supplies its own ExpandJsonInfo.
// Both members below replace virtual members of PyExpander, so they are marked `override`:
// the compiler then rejects the class if either signature ever drifts from the base declaration.
| class ComplexOpExpander : public PyExpander { | |||
| public: | |||
| ~ComplexOpExpander() override = default; | |||
| protected: | |||
| bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json) override; | |||
| }; | |||
| @@ -67,21 +67,21 @@ inline unsigned int GetPassLevelByFlag(bool flag) { return flag ? OptLevel_1 : O | |||
// Builds and returns the GraphKernelPassManager named "preprocess".
// Every pass is registered at OptLevel_1; SplitAssign additionally passes is_gpu as the third
// argument, the others rely on that argument's default.
// NOTE(review): each registration is shown twice (pm->AddPass, then pm->Add); this appears to be
// the removed call followed by its replacement after the method rename.
| PassManagerPtr GraphKernelOptimizer::PreProcess() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(0, "preprocess"); | |||
| // Do DependElimination before all passes of graphkernel | |||
| pm->AddPass(std::make_shared<DependElimination>(), OptLevel_1); | |||
| pm->Add(std::make_shared<DependElimination>(), OptLevel_1); | |||
| // Do cse before all passes of graphkernel | |||
| pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1); | |||
| pm->Add(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1); | |||
| // Save the original output info | |||
| pm->AddPass(std::make_shared<SaveOutputShape>(), OptLevel_1); | |||
| pm->Add(std::make_shared<SaveOutputShape>(), OptLevel_1); | |||
| // Change Assign(p, a, U) to Assign(Depend(p, U), a) | |||
| pm->AddPass(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu); | |||
| pm->Add(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu); | |||
| // Spread the MakeTuple input of UpdateState | |||
| pm->AddPass(std::make_shared<SpreadUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<SpreadUpdateState>(), OptLevel_1); | |||
| // Eliminate the common nodes that are generated in SpreadUpdateState | |||
| pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1); | |||
| pm->Add(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -89,16 +89,16 @@ PassManagerPtr GraphKernelOptimizer::Cluster() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(1, "cluster"); | |||
| // Expand complex op to composite kernels | |||
| pm->AddPass(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu); | |||
| pm->Add(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu); | |||
| // Expand complex basic kernels to composite kernels | |||
| pm->AddPass(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1); | |||
| // Cluster basic kernels and composite kernels | |||
| pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1); | |||
| // Eliminate the outputs without external user | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -106,29 +106,29 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(2, "highlevelopt1"); | |||
| // Remove redundant Cast(bias, fp16) for Matmul input | |||
| pm->AddPass(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend); | |||
| pm->Add(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend); | |||
| // Reorder Cast and Type-insensitive node | |||
| pm->AddPass(std::make_shared<ReorderOps>(), OptLevel_2); | |||
| pm->Add(std::make_shared<ReorderOps>(), OptLevel_2); | |||
| // normalize the Reduce axis | |||
| pm->AddPass(std::make_shared<AxisNormalizer>(), OptLevel_1); | |||
| pm->Add(std::make_shared<AxisNormalizer>(), OptLevel_1); | |||
| // Cast the input of ReduceSum from float16 to float32 for higher precision | |||
| pm->AddPass(std::make_shared<RaiseReductionPrecision>(), OptLevel_2); | |||
| pm->Add(std::make_shared<RaiseReductionPrecision>(), OptLevel_2); | |||
| // Insert PadAkg and UnPadAkg Ops for MatMul | |||
| pm->AddPass(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu); | |||
| pm->Add(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu); | |||
| // Universal arithmetic simplify | |||
| pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu); | |||
| pm->Add(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu); | |||
| // Common subexpression elimination | |||
| pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_2); | |||
| pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_2); | |||
| // Eliminate unnecessary transform ops | |||
| auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_trans_op_optimize); | |||
| pm->AddPass(std::make_shared<TransformOpOptimizer>(), level, is_gpu); | |||
| pm->Add(std::make_shared<TransformOpOptimizer>(), level, is_gpu); | |||
| return pm; | |||
| } | |||
| @@ -137,21 +137,21 @@ PassManagerPtr GraphKernelOptimizer::Split() const { | |||
| // Make certain nodes redundant so that they are used by only one user, | |||
| // which can avoid unnecessary input-output and get better performance. | |||
| // preprocess for ShapeOpsSplitter | |||
| pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1); | |||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape}; | |||
| pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1); | |||
| pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1); | |||
| // Split kernel according to costmodel | |||
| pm->AddPass(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1); | |||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | |||
| // will be exposed, use GetitemTuple Pass to delete them. | |||
| pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter | |||
| pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1); | |||
| pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_1); | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1); | |||
| pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -161,30 +161,30 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const { | |||
| auto &flags = GraphKernelFlags::GetInstance(); | |||
| // Auto recompute according to local memory burst. | |||
| auto recompute_lv = GetPassLevelByFlag(flags.recompute_increment_threshold > 0 || flags.recompute_peak_threshold > 0); | |||
| pm->AddPass(std::make_shared<GraphKernelRecompute>(), recompute_lv); | |||
| pm->Add(std::make_shared<GraphKernelRecompute>(), recompute_lv); | |||
| // Replace Assign with InplaceAssign, and replace original output with overridden parameters | |||
| pm->AddPass(std::make_shared<OptimizeAssign>(), OptLevel_2); | |||
| pm->Add(std::make_shared<OptimizeAssign>(), OptLevel_2); | |||
| pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2)); | |||
| pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2)); | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2)); | |||
| pm->Add(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2)); | |||
| pm->Add(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2)); | |||
| pm->Add(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2)); | |||
| // Enable atomic add | |||
| pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend); | |||
| pm->Add(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend); | |||
| // Enable atomic add for stitch nodes. | |||
| auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_stitch_fusion); | |||
| pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu); | |||
| pm->Add(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu); | |||
| // Enable low precision | |||
| auto level_low_precision = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_low_precision); | |||
| pm->AddPass(std::make_shared<DecreaseTransferPrecision>(), level_low_precision); | |||
| pm->AddPass(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend); | |||
| pm->Add(std::make_shared<DecreaseTransferPrecision>(), level_low_precision); | |||
| pm->Add(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend); | |||
| // Enable tsa and uss | |||
| pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu); | |||
| pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu); | |||
| pm->Add(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu); | |||
| pm->Add(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu); | |||
| return pm; | |||
| } | |||
| @@ -197,9 +197,8 @@ PassManagerPtr GraphKernelOptimizer::Combine() const { | |||
| auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET); | |||
| auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_fusion); | |||
| // Atomic-add GraphKernel node may be linked directly to UpdateState, it should be spread before parallel fusion! | |||
| pm->AddPass(std::make_shared<SpreadUpdateState>(), level); | |||
| pm->AddPass(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, | |||
| is_gpu || is_ascend); | |||
| pm->Add(std::make_shared<SpreadUpdateState>(), level); | |||
| pm->Add(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, is_gpu || is_ascend); | |||
| return pm; | |||
| } | |||
| @@ -207,17 +206,17 @@ PassManagerPtr GraphKernelOptimizer::Combine() const { | |||
// Builds and returns the GraphKernelPassManager named "postprocess".
// All passes are registered at OptLevel_1 with the default third argument.
// NOTE(review): each registration is shown as pm->AddPass rows followed by pm->Add rows; this
// appears to be the removed calls followed by their replacements after the method rename.
| PassManagerPtr GraphKernelOptimizer::PostProcess() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(6, "postprocess"); | |||
| // Make Tuple for the inputs of UpdateState (the reverse of SpreadUpdateState). | |||
| pm->AddPass(std::make_shared<ShrinkUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<ShrinkUpdateState>(), OptLevel_1); | |||
| // Recover the original output info | |||
| pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| pm->AddPass(std::make_shared<RewriteOutputShape>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| pm->Add(std::make_shared<RewriteOutputShape>(), OptLevel_1); | |||
| // Reduce fake output memory. | |||
| pm->AddPass(std::make_shared<ReduceFakeOutMem>(), OptLevel_1); | |||
| pm->Add(std::make_shared<ReduceFakeOutMem>(), OptLevel_1); | |||
| // Add the new tensors to the kernel_graph | |||
| pm->AddPass(std::make_shared<BindValueToGraph>(), OptLevel_1); | |||
| pm->Add(std::make_shared<BindValueToGraph>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -242,5 +241,8 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) { | |||
| (void)optimizer->Optimize(kernel_graph); | |||
| } | |||
// Free-function entry point: constructs a GraphKernelOptimizer and calls Run on kernel_graph.
// NOTE(review): two forms are shown - a one-liner that calls Run on a temporary, then a version
// that uses a named local; this appears to be the removed line followed by its replacement.
| void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); } | |||
| void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { | |||
| GraphKernelOptimizer graph_kernel_optimizer; | |||
| graph_kernel_optimizer.Run(kernel_graph); | |||
| } | |||
| } // namespace mindspore::graphkernel | |||
| @@ -24,7 +24,7 @@ | |||
| #include "debug/anf_ir_dump.h" | |||
| namespace mindspore::graphkernel { | |||
| void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) { | |||
| void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) { | |||
| MS_EXCEPTION_IF_NULL(pass); | |||
| auto pass_id = passes_.size(); | |||
| auto pass_name = pass->name(); | |||
| @@ -33,7 +33,7 @@ class GraphKernelPassManager : public PassManager { | |||
| ~GraphKernelPassManager() = default; | |||
| // Add graph pass, the pass object will be freed when pass manager freed. | |||
| virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true); | |||
| void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true); | |||
| // Run passes on the func_graph | |||
| bool Run(const FuncGraphPtr &func_graph) const override; | |||
| @@ -26,6 +26,7 @@ | |||
| #include "include/common/utils/python_adapter.h" | |||
| #include "kernel/akg/akg_kernel_json_generator.h" | |||
| #include "kernel/common_utils.h" | |||
| #include "common/graph_kernel/core/graph_kernel_utils.h" | |||
| #include "common/graph_kernel/graph_kernel_helper.h" | |||
| #include "include/common/utils/context/graph_kernel_flags.h" | |||
| @@ -107,13 +108,13 @@ class CostModelSplitSchemer : public SplitSchemer { | |||
| MS_LOG(ERROR) << "Failed decode sub graph, " << graph_desc; | |||
| return false; | |||
| } | |||
| split_plan_.emplace_back(std::move(res_graph)); | |||
| (void)split_plan_.emplace_back(std::move(res_graph)); | |||
| } | |||
| // ops to be inlined. | |||
| need_inline_.clear(); | |||
| std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_), | |||
| [](const std::string &mode) { return mode == "basic" ? 1 : 0; }); | |||
| (void)std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_), | |||
| [](const std::string &mode) { return mode == "basic" ? 1 : 0; }); | |||
| return true; | |||
| } | |||
| @@ -151,8 +152,8 @@ class CostModelSplitSchemer : public SplitSchemer { | |||
// Stores TopoSort(func_graph_->get_return()) in topo_all_nodes_, then rebuilds topo_valid_nodes_
// with the entries of topo_all_nodes_ for which IsValidKernelNode returns true (order preserved
// by std::copy_if).
// NOTE(review): the copy_if call is shown twice, without and with a leading (void) cast that
// discards the returned iterator; this appears to be the removed call followed by its replacement.
| virtual void GetValidKernelNodes() { | |||
| topo_all_nodes_ = TopoSort(func_graph_->get_return()); | |||
| topo_valid_nodes_.clear(); | |||
| std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_), | |||
| [this](const AnfNodePtr &node) { return IsValidKernelNode(node); }); | |||
| (void)std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_), | |||
| [this](const AnfNodePtr &node) { return IsValidKernelNode(node); }); | |||
| } | |||
| void MapNodeGroup() { | |||
| @@ -175,14 +176,14 @@ class CostModelSplitSchemer : public SplitSchemer { | |||
| if (IsValidKernelNode(output)) { | |||
| auto group_id = node_group_[output]; | |||
| node_group_[ret_node] = group_id; | |||
| split_plan_[group_id].emplace_back(ret_node); | |||
| (void)split_plan_[group_id].emplace_back(ret_node); | |||
| return; | |||
| } | |||
| // assign the make_tuple node to a new group. | |||
| if (common::AnfAlgo::CheckPrimitiveType(output, prim::kPrimMakeTuple)) { | |||
| auto group_id = split_plan_.size(); | |||
| split_plan_.emplace_back(AnfNodePtrList{output, ret_node}); | |||
| need_inline_.emplace_back(1); | |||
| (void)split_plan_.emplace_back(AnfNodePtrList{output, ret_node}); | |||
| (void)need_inline_.emplace_back(1); | |||
| node_group_[output] = group_id; | |||
| node_group_[ret_node] = group_id; | |||
| return; | |||
| @@ -52,8 +52,9 @@ std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = fal | |||
| } | |||
| auto axis_vec = GetReduceAxis(node); | |||
| if (axis_vec.empty()) { | |||
| axis_vec.resize(src_shape_vec.size()); | |||
| for (size_t i = 0; i < src_shape_vec.size(); ++i) { | |||
| (void)axis_vec.emplace_back(i); | |||
| axis_vec[i] = i; | |||
| } | |||
| } else { | |||
| (void)std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), [&src_shape_vec](int64_t axis) -> int64_t { | |||
| @@ -281,15 +282,8 @@ void AtomicCleanInsertter::CorrectKernelBuildInfo( | |||
| new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); | |||
| } | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; | |||
| new_info_builder.SetInputsFormat(new_inputs_format); | |||
| new_info_builder.SetInputsDeviceType(new_inputs_type); | |||
| new_info_builder.SetOutputsFormat(new_outputs_format); | |||
| new_info_builder.SetOutputsDeviceType(new_outputs_type); | |||
| new_info_builder.SetProcessor(origin_processor); | |||
| new_info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||
| new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||
| auto new_selected_info = new_info_builder.Build(); | |||
| auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format, | |||
| new_outputs_type, origin_processor); | |||
| AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2020-2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2020-2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -42,7 +42,7 @@ class AtomicAddChecker { | |||
| std::vector<AtomicAddInfo> GetAtomicAddInfo() { return atomic_add_infos_; } | |||
| protected: | |||
| virtual bool SuitableForAtomicAdd(const AnfNodePtr &node) { return false; } | |||
| virtual bool SuitableForAtomicAdd(const AnfNodePtr &) { return false; } | |||
| virtual bool FindCandidate(const AnfNodePtr &anf_node); | |||
| virtual bool CanActivateAtomicAdd(const AnfNodePtr &anf_node); | |||
| std::vector<AtomicAddInfo> atomic_add_infos_; | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2021-2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -52,15 +52,8 @@ void StitchAtomicCleanInsertter::CorrectKernelBuildInfo( | |||
| new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second)); | |||
| new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; | |||
| new_info_builder.SetInputsFormat(new_inputs_format); | |||
| new_info_builder.SetInputsDeviceType(new_inputs_type); | |||
| new_info_builder.SetOutputsFormat(new_outputs_format); | |||
| new_info_builder.SetOutputsDeviceType(new_outputs_type); | |||
| new_info_builder.SetProcessor(origin_processor); | |||
| new_info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||
| new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||
| auto new_selected_info = new_info_builder.Build(); | |||
| auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format, | |||
| new_outputs_type, origin_processor); | |||
| AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); | |||
| } | |||
| @@ -124,7 +117,7 @@ void StitchAtomicCleanInsertter::ProcessOriginCNode( | |||
| for (const auto &[user_node, index] : reduce_user_nodes) { | |||
| auto user_cnode = user_node->cast<CNodePtr>(); | |||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||
| user_cnode->set_input(static_cast<size_t>(index), parameter); | |||
| user_cnode->set_input(IntToSize(index), parameter); | |||
| if (!connected) { | |||
| std::vector<std::pair<AnfNodePtr, int>> user_user = FindInnerCNodeUsers(stitch_node_, user_cnode); | |||
| if (!user_user.empty()) { | |||
| @@ -154,8 +147,8 @@ std::vector<std::pair<AnfNodePtr, int>> StitchAtomicCleanInsertter::FindInnerCNo | |||
| } | |||
| std::vector<std::pair<AnfNodePtr, int>> inner_user_nodes; | |||
| auto users = mng_sub->node_users()[target]; | |||
| std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes), | |||
| [](const std::pair<AnfNodePtr, int> &pair) { return pair; }); | |||
| (void)std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes), | |||
| [](const std::pair<AnfNodePtr, int> &pair) { return pair; }); | |||
| return inner_user_nodes; | |||
| } | |||
| @@ -1,5 +1,5 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * Copyright 2021-2022 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| @@ -32,16 +32,18 @@ class StitchAtomicCleanInsertter : public AtomicCleanInsertter { | |||
| ~StitchAtomicCleanInsertter() override = default; | |||
| bool Run(const FuncGraphPtr &func_graph) override; | |||
| private: | |||
| protected: | |||
| void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, | |||
| const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &clean_infos) override; | |||
| void ProcessOriginCNode( | |||
| const AnfNodePtr &composite_node, | |||
| const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override; | |||
| private: | |||
| CNodePtr CreateInplaceAssignNode(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter, | |||
| const AtomicAddInfo &info) const; | |||
| std::vector<std::pair<AnfNodePtr, int>> FindInnerCNodeUsers(const AnfNodePtr &inner_node, | |||
| const CNodePtr &target) const; | |||
| void ProcessOriginCNode( | |||
| const AnfNodePtr &composite_node, | |||
| const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override; | |||
| std::pair<bool, AtomicAddInfo> IsStitchWithAtomic(const AnfNodePtr &anf_node); | |||
| void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node, | |||
| @@ -33,8 +33,8 @@ void UpdateBuildInfo(const AnfNodePtr &matmul_node, const AnfNodePtr &cast_node) | |||
| input_types.push_back(cast_types.front()); | |||
| std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node); | |||
| std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node); | |||
| auto graph_sel_info = | |||
| BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node); | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, | |||
| AnfAlgo::GetProcessor(matmul_node)); | |||
| AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get()); | |||
| } | |||
| @@ -169,6 +169,6 @@ class CallbackImplRegister { | |||
| }; | |||
// Expands to the definition of a const CallbackImplRegister object named g_graphkernel_callback,
// constructed from a lambda that returns `new cls()` converted to Callback *.
// NOTE(review): two expansion rows are shown, with and without a leading `static`; this appears to
// be the removed line followed by its replacement. Presumably the macro is used at namespace scope,
// where a const object already has internal linkage - confirm at the use sites.
| #define GRAPH_KERNEL_CALLBACK_REGISTER(cls) \ | |||
| static const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); }) | |||
| const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); }) | |||
| } // namespace mindspore::graphkernel | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CORE_GRAPH_KERNEL_CALLBACK_H_ | |||
| @@ -74,7 +74,8 @@ void UpdateOutputInfo(const AnfNodePtr &cnode) { | |||
| } | |||
| std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(cnode); | |||
| std::vector<TypeId> output_types = {TypeId::kNumberTypeFloat16}; | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode); | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, | |||
| AnfAlgo::GetProcessor(cnode)); | |||
| AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, cnode.get()); | |||
| } | |||
| } | |||
| @@ -91,33 +91,27 @@ AbstractBasePtr GetOutputAbstract(const AnfNodePtr &node, size_t output_idx) { | |||
| return out_spec; | |||
| } | |||
// NOTE(review): this span interleaves two versions of the first BuildSelectKernelBuildInfo overload.
//  - Rows taking `const AnfNodePtr &node`: fill a KernelBuildInfoBuilder inline, with the processor
//    from AnfAlgo::GetProcessor(node), kernel type AKG_KERNEL and fusion type OPAQUE.
//  - Rows ending the parameter list at `output_types`: forward all four arguments plus
//    kernel::GetProcessorFromContext() to the overload that takes an explicit processor.
// The former appear to be the removed lines and the latter their replacement.
| // Rebuild as node inputs or outputs have changed, processor comes from node itself | |||
| // Build for new node, processor comes from context | |||
| kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format, | |||
| const std::vector<TypeId> &inputs_type, | |||
| const std::vector<std::string> &output_formats, | |||
| const std::vector<TypeId> &output_types, const AnfNodePtr &node) { | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; | |||
| graph_info_builder.SetInputsFormat(inputs_format); | |||
| graph_info_builder.SetInputsDeviceType(inputs_type); | |||
| graph_info_builder.SetOutputsFormat(output_formats); | |||
| graph_info_builder.SetOutputsDeviceType(output_types); | |||
| graph_info_builder.SetProcessor(AnfAlgo::GetProcessor(node)); | |||
| graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||
| graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||
| return graph_info_builder.Build(); | |||
| const std::vector<TypeId> &output_types) { | |||
| return BuildSelectKernelBuildInfo(inputs_format, inputs_type, output_formats, output_types, | |||
| kernel::GetProcessorFromContext()); | |||
| } | |||
| // Build for new node, processor comes from context | |||
| // Build for new node with given processor | |||
| kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format, | |||
| const std::vector<TypeId> &inputs_type, | |||
| const std::vector<std::string> &output_formats, | |||
| const std::vector<TypeId> &output_types) { | |||
| const std::vector<TypeId> &output_types, | |||
| const kernel::Processor &processor) { | |||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder; | |||
| graph_info_builder.SetInputsFormat(inputs_format); | |||
| graph_info_builder.SetInputsDeviceType(inputs_type); | |||
| graph_info_builder.SetOutputsFormat(output_formats); | |||
| graph_info_builder.SetOutputsDeviceType(output_types); | |||
| graph_info_builder.SetProcessor(kernel::GetProcessorFromContext()); | |||
| graph_info_builder.SetProcessor(processor); | |||
| graph_info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||
| graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||
| return graph_info_builder.Build(); | |||
| @@ -57,11 +57,12 @@ void SetNewKernelInfo(const AnfNodePtr &new_node, const FuncGraphPtr &fg, const | |||
// Declarations of the BuildSelectKernelBuildInfo overloads. All take input formats, input types,
// output formats and output types; they differ only in the trailing parameter:
//  - `const AnfNodePtr &node`
//  - no trailing parameter
//  - `const kernel::Processor &processor`
// NOTE(review): the rows interleave old and new parameter-list endings; the node-taking form
// appears to be the removed one and the processor-taking form the added one.
| kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format, | |||
| const std::vector<TypeId> &inputs_type, | |||
| const std::vector<std::string> &output_formats, | |||
| const std::vector<TypeId> &output_types, const AnfNodePtr &node); | |||
| const std::vector<TypeId> &output_types); | |||
| kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format, | |||
| const std::vector<TypeId> &inputs_type, | |||
| const std::vector<std::string> &output_formats, | |||
| const std::vector<TypeId> &output_types); | |||
| const std::vector<TypeId> &output_types, | |||
| const kernel::Processor &processor); | |||
| bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc); | |||
| bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc, | |||
| std::map<std::string, AnfNodePtr> *address_node_map); | |||
| @@ -229,8 +229,8 @@ void UpdateMatmulInfo(const AnfNodePtr &matmul_node, const vec &unpad_shape, con | |||
| std::vector<TypeId> input_types = AnfAlgo::GetAllInputDeviceTypes(matmul_node); | |||
| std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node); | |||
| std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node); | |||
| auto graph_sel_info = | |||
| BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node); | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, | |||
| AnfAlgo::GetProcessor(matmul_node)); | |||
| AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get()); | |||
| } | |||
| @@ -39,14 +39,14 @@ using opt::GraphOptimizer; | |||
// Builds and returns the GraphKernelPassManager named "cluster".
// Registers, in order: GraphKernelExpanderLite, GraphKernelCluster, ConvertConstInputToAttr and
// EliminateRedundantOutput, all at OptLevel_1 with the default third argument.
// NOTE(review): each registration is shown as pm->AddPass rows followed by pm->Add rows; this
// appears to be the removed calls followed by their replacements after the method rename.
| PassManagerPtr GraphKernelOptimizer::Cluster() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(0, "cluster"); | |||
| // Expand complex basic kernels to composite kernels | |||
| pm->AddPass(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1); | |||
| // Cluster basic kernels and composite kernels | |||
| pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1); | |||
| pm->AddPass(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1); | |||
| pm->Add(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1); | |||
| // Eliminate the outputs without external user | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -55,27 +55,27 @@ PassManagerPtr GraphKernelOptimizer::Split() const { | |||
| // Make certain nodes redundant so that they are used by only one user, | |||
| // which can avoid unnecessary input-output and get better performance. | |||
| // preprocess for ShapeOpsSplitter | |||
| pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1); | |||
| std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape}; | |||
| pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1); | |||
| pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1); | |||
| // Split kernel according to costmodel | |||
| pm->AddPass(std::make_shared<GraphKernelSplitter>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GraphKernelSplitter>(), OptLevel_1); | |||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | |||
| // will be exposed, use GetitemTuple Pass to delete them. | |||
| pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1); | |||
| // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter | |||
| pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1); | |||
| pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1); | |||
| pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
// Builds and returns the pass manager constructed with (2, "postprocess").
// It registers a single pass, KernelBuilder, at OptLevel_1.
// NOTE(review): this text is a rendered diff hunk. The `pm->AddPass(...)` row is the removed
// spelling and the `pm->Add(...)` row is its replacement, so the pass is registered once.
| PassManagerPtr GraphKernelOptimizer::PostProcess() const { | |||
| auto pm = std::make_shared<GraphKernelPassManager>(2, "postprocess"); | |||
| // build akg and replace graph kernel nodes | |||
| pm->AddPass(std::make_shared<KernelBuilder>(), OptLevel_1); | |||
| pm->Add(std::make_shared<KernelBuilder>(), OptLevel_1); | |||
| return pm; | |||
| } | |||
| @@ -23,26 +23,26 @@ | |||
| #include "debug/anf_ir_dump.h" | |||
| namespace mindspore::graphkernel { | |||
// Appends `pass` to this manager and records, in parallel, whether it is enabled.
//   pass             - pass to register; a null pointer is rejected by MS_EXCEPTION_IF_NULL.
//   pass_level       - the pass is on by default only when flags_.opt_level >= pass_level.
//   supported_device - when false, the pass is off by default.
// The default decision can be overridden by the flag lists: a pass that is on by default is
// switched off if it is named in flags_.disable_pass, and a pass that is off by default is
// switched on if it is named in flags_.enable_pass.
// NOTE(review): this text is a rendered diff hunk (-23,26 +23,26). The first signature row
// (AddPass) is the removed name and the second (Add) is the new one. The duplicated
// `auto pass_id` and `passes_.push_back(pass)` rows are the same statement shown at its old and
// new position; the duplicated comment rows are old and new wording.
// NOTE(review): the class declaration names the third parameter `default_enable`, whereas this
// definition names it `supported_device`.
| void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) { | |||
| void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) { | |||
| MS_EXCEPTION_IF_NULL(pass); | |||
// pass_id is read before the push_back below, so it is the index this pass will occupy in passes_.
| auto pass_id = passes_.size(); | |||
| auto pass_name = pass->name(); | |||
| auto pass_id = passes_.size(); | |||
// Returns true when pass_list contains either "<stage_>.<pass_id>" or "<name_>.<pass_name>".
| auto pass_in_list = [this, pass_id, &pass_name](const std::vector<std::string> &pass_list) { | |||
| // the config format can be "stage_id.pass_id" or "stage_name.pass_name" | |||
| // config format can be "stage_id.pass_id" or "stage_name.pass_name" | |||
| return std::find(pass_list.begin(), pass_list.end(), | |||
| std::to_string(this->stage_) + "." + std::to_string(pass_id)) != pass_list.end() || | |||
| std::find(pass_list.begin(), pass_list.end(), this->name_ + "." + pass_name) != pass_list.end(); | |||
| }; | |||
// Default decision: the device must be supported and the configured opt level must be high enough.
| bool enable = supported_device && flags_.opt_level >= pass_level; | |||
| if (enable) { | |||
| // if it meets the condition to enable, check whether it's in the disabled list. | |||
| // if it meets the condition to enable, check whether it's in the disabled pass list. | |||
| enable = !pass_in_list(flags_.disable_pass); | |||
| } else { | |||
| // if it doesn't meet the condition to enable, check whether it's in the enabled list. | |||
| // if it doesn't meet the condition to enable, check whether it's in the enabled pass list. | |||
| enable = pass_in_list(flags_.enable_pass); | |||
| } | |||
// passes_ and enabled_ are appended together so that index i of one matches index i of the other.
| passes_.push_back(pass); | |||
| enabled_.push_back(enable); | |||
| passes_.push_back(pass); | |||
| } | |||
| bool GraphKernelPassManager::RunPass(const FuncGraphPtr &func_graph, size_t pass_id, const opt::PassPtr &pass) const { | |||
| @@ -59,10 +59,10 @@ bool GraphKernelPassManager::Run(const FuncGraphPtr &func_graph) const { | |||
| for (size_t i = 0; i < passes_.size(); i++) { | |||
| if (enabled_[i]) { | |||
| changed = RunPass(func_graph, i, passes_[i]) || changed; | |||
| // dump ir to a graph_kernel subdir, and set a global id in front of the filename | |||
| // dump ir to a graph_kernel subdir, and set a global id in front of the filenames | |||
| std::ostringstream oss; | |||
| static int g_id = 0; | |||
| constexpr int id_length = 4; | |||
| static int g_id = 0; | |||
| oss << "graph_kernel/" << std::setfill('0') << std::setw(id_length) << g_id++ << "_" | |||
| << GetPassFullname(i, passes_[i]); | |||
| DumpPassIR(func_graph, oss.str()); | |||
| @@ -21,8 +21,8 @@ | |||
| #include <string> | |||
| #include <memory> | |||
| #include "include/common/utils/context/graph_kernel_flags.h" | |||
| #include "backend/common/optimizer/pass_manager.h" | |||
| #include "include/common/utils/context/graph_kernel_flags.h" | |||
| namespace mindspore::graphkernel { | |||
| using opt::PassManager; | |||
| @@ -32,10 +32,10 @@ class GraphKernelPassManager : public PassManager { | |||
| : PassManager(name, true), stage_(stage), flags_(GraphKernelFlags::GetInstance()) {} | |||
| ~GraphKernelPassManager() = default; | |||
| // Add graph pass, the pass object will be freed when pass manager freed. | |||
| virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true); | |||
| // Add graph pass for lite, the pass object will be freed when pass manager freed. | |||
| void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true); | |||
| // Run passes on the func_graph | |||
| // Run passes for lite on the func_graph | |||
| bool Run(const FuncGraphPtr &func_graph) const override; | |||
| protected: | |||
| @@ -178,7 +178,8 @@ bool ReplaceAssignByInplaceAssignInGraphkernel(const FuncGraphPtr &func_graph) { | |||
| input_types.push_back(input_types.back()); | |||
| std::vector<std::string> output_formats = {input_formats.back()}; | |||
| std::vector<TypeId> output_types = {input_types.back()}; | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode); | |||
| auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, | |||
| AnfAlgo::GetProcessor(cnode)); | |||
| AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, new_cnode.get()); | |||
| mng->Replace(cnode, new_cnode); | |||
| } | |||