| @@ -14,10 +14,8 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/ir_fission/transdata_split.h" | |||
| #include "backend/optimizer/ascend/ascend_helper.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include <set> | |||
| #include "debug/anf_ir_dump.h" | |||
| #include "utils/trace_base.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -14,15 +14,9 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/mindir/bn_grad_unify_mindir.h" | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "utils/utils.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/trace_base.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -14,15 +14,9 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/mindir/dynamic_reshape_unify_mindir.h" | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "utils/utils.h" | |||
| #include "utils/ms_context.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/trace_base.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -1,149 +0,0 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/gpu/adjust_depend_for_parallel_optimizer_recompute_all_gather_fusion.h" | |||
| #include <algorithm> | |||
| #include "utils/hash_map.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/utils.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| bool AdjustDependForParallelOptimizerRecomputeAllGatherFusion::Run(const FuncGraphPtr &graph) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| mindspore::HashMap<int64_t, bool> forward_allgather_recompute_value_in_fusion_group; | |||
| std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return()); | |||
| std::vector<int64_t> parallel_optimizer_recompute_allgather_fusion_ids; | |||
| std::vector<AnfNodePtr> parallel_optimizer_recompute_allgathers; | |||
| std::vector<AnfNodePtr> parallel_optimizer_recompute_first_fusion_allgathers; | |||
| int64_t unrecompute_max_fusion_id = -1; | |||
| int64_t recompute_min_fusion_id = 0; | |||
| for (auto &node : node_list) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->cast<CNodePtr>() || !AnfUtils::IsRealKernel(node)) { | |||
| continue; | |||
| } | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| if (!AnfAlgo::IsAllgather(cnode) || !AnfAlgo::IsFusion(cnode) || !AnfAlgo::IsFromParallelOptimizer(cnode)) { | |||
| continue; | |||
| } | |||
| if (AnfAlgo::IsRecompute(cnode)) { | |||
| int64_t fusion_id = AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrFusion); | |||
| if (std::find(parallel_optimizer_recompute_allgather_fusion_ids.begin(), | |||
| parallel_optimizer_recompute_allgather_fusion_ids.end(), | |||
| fusion_id) == parallel_optimizer_recompute_allgather_fusion_ids.end()) { | |||
| parallel_optimizer_recompute_allgather_fusion_ids.push_back(fusion_id); | |||
| if (recompute_min_fusion_id == 0 || fusion_id < recompute_min_fusion_id) { | |||
| recompute_min_fusion_id = fusion_id; | |||
| } | |||
| parallel_optimizer_recompute_first_fusion_allgathers.push_back(node); | |||
| } else { | |||
| parallel_optimizer_recompute_allgathers.push_back(node); | |||
| } | |||
| } else { | |||
| int64_t unrecompute_fusion_id = AnfAlgo::GetNodeAttr<int64_t>(cnode, kAttrFusion); | |||
| unrecompute_max_fusion_id = std::max(unrecompute_fusion_id, unrecompute_max_fusion_id); | |||
| bool would_be_recomputed = | |||
| AnfAlgo::HasNodeAttr(kAttrRecompute, cnode) && AnfAlgo::GetNodeAttr<bool>(cnode, kAttrRecompute); | |||
| auto [iter, inserted] = | |||
| forward_allgather_recompute_value_in_fusion_group.emplace(unrecompute_fusion_id, would_be_recomputed); | |||
| if (!inserted && iter->second != would_be_recomputed) { | |||
| MS_LOG(EXCEPTION) << "In same fusion group, the allgather recompute attribute should be equal. " | |||
| "The normal node is:" | |||
| << cnode->fullname_with_scope(); | |||
| } | |||
| } | |||
| } | |||
| IncreaseAllgatherFusionId(parallel_optimizer_recompute_allgathers, | |||
| parallel_optimizer_recompute_first_fusion_allgathers, unrecompute_max_fusion_id, | |||
| recompute_min_fusion_id); | |||
| return AdjustAllgatherDepend(graph, parallel_optimizer_recompute_allgathers); | |||
| } | |||
| void AdjustDependForParallelOptimizerRecomputeAllGatherFusion::IncreaseAllgatherFusionId( | |||
| const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers, | |||
| const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_first_fusion_allgathers, | |||
| int64_t unrecompute_max_fusion_id, int64_t recompute_min_fusion_id) { | |||
| // means that there may some forward allgather and duplicated allgather would be fused. | |||
| if (recompute_min_fusion_id <= unrecompute_max_fusion_id) { | |||
| MS_LOG(WARNING) << "Increase the duplicated allgather fusion id"; | |||
| for (auto &adjust_node : parallel_optimizer_recompute_first_fusion_allgathers) { | |||
| int64_t current_fusion_id = AnfAlgo::GetNodeAttr<int64_t>(adjust_node, kAttrFusion); | |||
| int64_t destination_fusion_id = current_fusion_id + unrecompute_max_fusion_id - recompute_min_fusion_id + 2; | |||
| AnfAlgo::SetNodeAttr(kAttrFusion, MakeValue(destination_fusion_id), adjust_node); | |||
| } | |||
| for (auto &adjust_node : parallel_optimizer_recompute_allgathers) { | |||
| int64_t current_fusion_id = AnfAlgo::GetNodeAttr<int64_t>(adjust_node, kAttrFusion); | |||
| int64_t destination_fusion_id = current_fusion_id + unrecompute_max_fusion_id - recompute_min_fusion_id + 2; | |||
| AnfAlgo::SetNodeAttr(kAttrFusion, MakeValue(destination_fusion_id), adjust_node); | |||
| } | |||
| } | |||
| } | |||
| bool AdjustDependForParallelOptimizerRecomputeAllGatherFusion::AdjustAllgatherDepend( | |||
| const FuncGraphPtr &graph, const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers) { | |||
| FuncGraphManagerPtr manager = graph->manager(); | |||
| bool changed = false; | |||
| for (auto &node : parallel_optimizer_recompute_allgathers) { | |||
| auto cnode = node->cast<CNodePtr>(); | |||
| auto depend_node = AnfAlgo::GetInputNode(cnode, 0); | |||
| if (IsPrimitiveCNode(depend_node, prim::kPrimDepend)) { | |||
| auto depend_cnode = depend_node->cast<CNodePtr>(); | |||
| AnfNodeIndexSet allgather_node_set = manager->node_users()[cnode]; | |||
| for (auto &node_pair : allgather_node_set) { | |||
| auto allgather_next_node = node_pair.first; | |||
| CNodePtr allgather_next_cnode = node_pair.first->cast<CNodePtr>(); | |||
| if (allgather_next_cnode == nullptr || !IsValueNode<Primitive>(allgather_next_cnode->input(0))) { | |||
| continue; | |||
| } | |||
| std::vector<AnfNodePtr> inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimDepend->name())), | |||
| allgather_next_node, AnfAlgo::GetInputNode(depend_cnode, 1)}; | |||
| auto new_depend = graph->NewCNode(inputs); | |||
| new_depend->set_abstract(depend_node->abstract()); | |||
| manager->SetEdge(node, 1, AnfAlgo::GetInputNode(depend_cnode, 0)); | |||
| (void)manager->Replace(allgather_next_node, new_depend); | |||
| changed = true; | |||
| } | |||
| } else if (IsPrimitiveCNode(depend_node, prim::kPrimCast) && | |||
| IsPrimitiveCNode(AnfAlgo::GetInputNode(depend_node->cast<CNodePtr>(), 0), prim::kPrimDepend)) { | |||
| auto cast_cnode = depend_node->cast<CNodePtr>(); | |||
| auto cast_depend_node = AnfAlgo::GetInputNode(cast_cnode, 0); | |||
| auto cast_depend_cnode = cast_depend_node->cast<CNodePtr>(); | |||
| AnfNodeIndexSet allgather_node_set = manager->node_users()[cnode]; | |||
| for (auto &node_pair : allgather_node_set) { | |||
| auto allgather_next_node = node_pair.first; | |||
| CNodePtr allgather_next_cnode = node_pair.first->cast<CNodePtr>(); | |||
| if (allgather_next_cnode == nullptr || !IsValueNode<Primitive>(allgather_next_cnode->input(0))) { | |||
| continue; | |||
| } | |||
| std::vector<AnfNodePtr> inputs = {NewValueNode(std::make_shared<Primitive>(prim::kPrimDepend->name())), | |||
| allgather_next_node, AnfAlgo::GetInputNode(cast_depend_cnode, 1)}; | |||
| auto new_depend = graph->NewCNode(inputs); | |||
| new_depend->set_abstract(cast_depend_node->abstract()); | |||
| manager->SetEdge(depend_node, 1, AnfAlgo::GetInputNode(cast_depend_cnode, 0)); | |||
| (void)manager->Replace(allgather_next_node, new_depend); | |||
| changed = true; | |||
| } | |||
| } else { | |||
| MS_LOG(WARNING) << "The parallel optimizer recompute allgather has no depend edge"; | |||
| } | |||
| } | |||
| return changed; | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -1,44 +0,0 @@ | |||
| /** | |||
| * Copyright 2021 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_ADJUST_DEPEND_FOR_PARALLEL_OPTIMIZER_RECOMPUTE_ALL_GATHER_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_ADJUST_DEPEND_FOR_PARALLEL_OPTIMIZER_RECOMPUTE_ALL_GATHER_H_ | |||
| #include <vector> | |||
| #include <string> | |||
| #include <utility> | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class AdjustDependForParallelOptimizerRecomputeAllGatherFusion : public Pass { | |||
| public: | |||
| explicit AdjustDependForParallelOptimizerRecomputeAllGatherFusion(const std::string &name) | |||
| : Pass("adjust_depend_for_parallel_optimizer_recompute_all_gather") {} | |||
| ~AdjustDependForParallelOptimizerRecomputeAllGatherFusion() override = default; | |||
| bool Run(const FuncGraphPtr &graph) override; | |||
| private: | |||
| bool AdjustAllgatherDepend(const FuncGraphPtr &graph, | |||
| const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers); | |||
| void IncreaseAllgatherFusionId(const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers, | |||
| const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_first_fusion_allgathers, | |||
| int64_t unrecompute_max_fusion_id, int64_t recompute_min_fusion_id); | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_PASS_ADJUST_DEPEND_FOR_PARALLEL_OPTIMIZER_RECOMPUTE_ALL_GATHER_H_ | |||
| @@ -15,7 +15,7 @@ | |||
| */ | |||
| #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h" | |||
| #include "utils/utils.h" | |||
| #include <algorithm> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| namespace mindspore { | |||
| @@ -26,20 +26,17 @@ | |||
| #include "ir/anf.h" | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "backend/optimizer/ascend/ascend_helper.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class AdjustDependForParallelOptimizerRecomputeAllGather : public Pass { | |||
| public: | |||
| AdjustDependForParallelOptimizerRecomputeAllGather() | |||
| : Pass("adjust_depend_for_parallel_optimizer_recompute_all_gather"), | |||
| kernel_select_(std::make_shared<KernelSelect>()) {} | |||
| : Pass("adjust_depend_for_parallel_optimizer_recompute_all_gather") {} | |||
| ~AdjustDependForParallelOptimizerRecomputeAllGather() override = default; | |||
| bool Run(const FuncGraphPtr &graph) override; | |||
| private: | |||
| KernelSelectPtr kernel_select_; | |||
| bool AdjustAllgatherDepend(const FuncGraphPtr &graph, | |||
| const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers); | |||
| void IncreaseAllgatherFusionId(const std::vector<AnfNodePtr> ¶llel_optimizer_recompute_allgathers, | |||
| @@ -16,15 +16,7 @@ | |||
| #include "backend/optimizer/pass/reduce_sum_optimizer.h" | |||
| #include <vector> | |||
| #include "backend/optimizer/common/helper.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "utils/utils.h" | |||
| #include "abstract/abstract_value.h" | |||
| #include "base/core_ops.h" | |||
| #include "ir/anf.h" | |||
| #include "ir/dtype.h" | |||
| #include "ir/scalar.h" | |||
| #include "utils/anf_utils.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| @@ -119,7 +111,6 @@ AnfNodePtr ReduceSumOptimizer::InsertAssistNode(const CNodePtr &cnode, const Ker | |||
| // create a new assist value node to deal with the following two cases: | |||
| // 1: the axis_input is empty, the new tensor of the new value node should be 'range(shape.size())', | |||
| // the shape is the first input's shape of ReduceSum; | |||
| // | |||
| // 2: the value of axis_input contains a value less than 0, | |||
| // the new tensor of the new value node should be "shape.size() + the_old_value_less_0", | |||
| // the shape is the first input's shape of ReduceSum; | |||
| @@ -57,7 +57,7 @@ | |||
| #include "backend/optimizer/gpu/concat_outputs_for_all_gather.h" | |||
| #include "backend/optimizer/pass/getitem_tuple.h" | |||
| #include "backend/optimizer/pass/optimize_updatestate.h" | |||
| #include "backend/optimizer/gpu/adjust_depend_for_parallel_optimizer_recompute_all_gather_fusion.h" | |||
| #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h" | |||
| #include "utils/ms_device_shape_transfer.h" | |||
| #include "debug/anf_ir_dump.h" | |||
| #include "debug/dump_proto.h" | |||
| @@ -200,8 +200,7 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_gra | |||
| pm->AddPass(std::make_shared<opt::AddReluV2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::AddReluGradV2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::AllReduceFusion>()); | |||
| pm->AddPass(std::make_shared<opt::AdjustDependForParallelOptimizerRecomputeAllGatherFusion>( | |||
| "adjust_depend_for_parallel_optimizer_recompute_all_gather_fusion")); | |||
| pm->AddPass(std::make_shared<opt::AdjustDependForParallelOptimizerRecomputeAllGather>()); | |||
| pm->AddPass(std::make_shared<opt::AllGatherFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ConcatOutputsForAllGather>()); | |||
| pm->AddPass(std::make_shared<opt::GetitemTuple>()); | |||
| @@ -292,8 +292,7 @@ void GPUDeviceContext::OptimizeGraphWithDeviceInfo(const KernelGraphPtr &graph) | |||
| pm->AddPass(std::make_shared<opt::AddReluV2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::AddReluGradV2Fusion>()); | |||
| pm->AddPass(std::make_shared<opt::AllReduceFusion>()); | |||
| pm->AddPass(std::make_shared<opt::AdjustDependForParallelOptimizerRecomputeAllGatherFusion>( | |||
| "adjust_depend_for_parallel_optimizer_recompute_all_gather_fusion")); | |||
| pm->AddPass(std::make_shared<opt::AdjustDependForParallelOptimizerRecomputeAllGather>()); | |||
| pm->AddPass(std::make_shared<opt::AllGatherFusion>()); | |||
| pm->AddPass(std::make_shared<opt::ConcatOutputsForAllGather>()); | |||
| pm->AddPass(std::make_shared<opt::GetitemTuple>()); | |||
| @@ -21,6 +21,7 @@ | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "backend/optimizer/common/pass_manager.h" | |||
| #include "backend/optimizer/common/common_backend_optimization.h" | |||
| #include "backend/optimizer/pass/adjust_depend_for_parallel_optimizer_recompute_all_gather.h" | |||
| #include "backend/optimizer/gpu/adam_weight_decay_fusion.h" | |||
| #include "backend/optimizer/gpu/adam_fusion.h" | |||
| #include "backend/optimizer/gpu/alltoall_fusion.h" | |||
| @@ -52,7 +53,6 @@ | |||
| #include "backend/optimizer/gpu/matmul_biasadd_fusion.h" | |||
| #include "backend/optimizer/gpu/bce_with_logits_loss_fusion.h" | |||
| #include "backend/optimizer/gpu/insert_cast_gpu.h" | |||
| #include "backend/optimizer/gpu/adjust_depend_for_parallel_optimizer_recompute_all_gather_fusion.h" | |||
| #include "backend/optimizer/gpu/neighbor_exchange_v2_fusion.h" | |||
| #endif // MINDSPORE_CCSRC_RUNTIME_HARDWARE_GPU_OPTIMIZER_H_ | |||