| @@ -100,13 +100,13 @@ | |||
| #include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h" | |||
| #include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h" | |||
| #include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h" | |||
| #include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h" | |||
| #include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h" | |||
| #include "backend/optimizer/ascend/ir_fission/addn_fission.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h" | |||
| #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h" | |||
| #include "backend/optimizer/ascend/ir_fission/split_fission.h" | |||
| #include "backend/optimizer/ascend/ir_fission/splitv_fission.h" | |||
| @@ -292,11 +292,11 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) && | |||
| ConfigManager::GetInstance().iter_num() > 1) { | |||
| ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForGetNext>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| } | |||
| ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForHcclOp>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>()); | |||
| ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>()); | |||
| @@ -370,7 +370,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern | |||
| other_pm->AddPass(std::make_shared<ReduceScatterFusion>()); | |||
| other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>()); | |||
| other_pm->AddPass(std::make_shared<BroadcastFusion>()); | |||
| other_pm->AddPass(std::make_shared<InsertMemcpyAsyncForCascade>()); | |||
| other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>()); | |||
| other_pm->AddPass(std::make_shared<ParameterTransOpFusion>()); | |||
| other_pm->AddPass(std::make_shared<RefreshParameterFormat>()); | |||
| other_pm->AddPass(std::make_shared<SplitOpOptimizer>()); | |||
| @@ -387,7 +387,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern | |||
| other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>()); | |||
| if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) && | |||
| ConfigManager::GetInstance().iter_num() > 1) { | |||
| other2_pm->AddPass(std::make_shared<GetnextMemcpyElimination>()); | |||
| other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>()); | |||
| } | |||
| other2_pm->AddPass(std::make_shared<CheckConsistency>()); | |||
| optimizer2->AddPassManager(other2_pm); | |||
| @@ -383,10 +383,10 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod | |||
| return new_node; | |||
| } | |||
| AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node) { | |||
| AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| auto prim = std::make_shared<Primitive>(kMemCpyAsyncOpName); | |||
| auto prim = std::make_shared<Primitive>(kTensorMoveOpName); | |||
| std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim), node}; | |||
| auto new_node = graph->NewCNode(new_node_inputs); | |||
| MS_EXCEPTION_IF_NULL(new_node); | |||
| @@ -108,7 +108,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP | |||
| CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnode); | |||
| AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node); | |||
| AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node); | |||
| AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const KernelSelectPtr &kernel_select, size_t insert_index, bool is_insert_input); | |||
| @@ -14,49 +14,49 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h" | |||
| #include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h" | |||
| #include <memory> | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| #include "frontend/optimizer/opt.h" | |||
| namespace mindspore::opt { | |||
// Builds the match pattern for this pass: a VectorRef of {primitive, x}, where the
// primitive is constructed from kTensorMoveOpName (kMemCpyAsyncOpName in the
// MemcpyAsync-named lines) and x is a SeqVar (presumably matching an arbitrary
// list of inputs -- confirm against the pattern engine).
// Both the MemcpyAsync-named and the TensorMove-named versions of each line appear below.
| const BaseRef GetnextMemcpyElimination::DefinePattern() const { | |||
| auto prim_memcpy = std::make_shared<Primitive>(kMemCpyAsyncOpName); | |||
| const BaseRef GetnextTensorMoveElimination::DefinePattern() const { | |||
| auto prim_tensor_move = std::make_shared<Primitive>(kTensorMoveOpName); | |||
| VarPtr x = std::make_shared<SeqVar>(); | |||
| VectorRef memcpy_async({prim_memcpy, x}); | |||
| return memcpy_async; | |||
| VectorRef tensor_move({prim_tensor_move, x}); | |||
| return tensor_move; | |||
| } | |||
| const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| const AnfNodePtr GetnextTensorMoveElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node, | |||
| const EquivPtr &equiv) const { | |||
| if (graph == nullptr || node == nullptr || equiv == nullptr) { | |||
| return nullptr; | |||
| } | |||
| auto memcpy_cnode = node->cast<CNodePtr>(); | |||
| if (memcpy_cnode == nullptr) { | |||
| auto tensor_move_node = node->cast<CNodePtr>(); | |||
| if (tensor_move_node == nullptr) { | |||
| return nullptr; | |||
| } | |||
| // 1. memcpy has attr kAttrLabelForInsertStreamActive | |||
| if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, memcpy_cnode)) { | |||
| // 1. tensor move has attr kAttrLabelForInsertStreamActive | |||
| if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, tensor_move_node)) { | |||
| MS_LOG(DEBUG) << "node has no label_for_insert_stream_active attr"; | |||
| return nullptr; | |||
| } | |||
| // 2. memcpy's output has only one user next_node | |||
| // 2. tensor move's output has only one user next_node | |||
| auto manager = graph->manager(); | |||
| MS_EXCEPTION_IF_NULL(manager); | |||
| if (manager->node_users().find(memcpy_cnode) == manager->node_users().end()) { | |||
| MS_LOG(EXCEPTION) << "memcpy has no output in manager"; | |||
| if (manager->node_users().find(tensor_move_node) == manager->node_users().end()) { | |||
| MS_LOG(EXCEPTION) << "tensor move has no output in manager"; | |||
| } | |||
| auto next_nodes = manager->node_users()[memcpy_cnode]; | |||
| auto next_nodes = manager->node_users()[tensor_move_node]; | |||
| if (next_nodes.size() > 1) { | |||
| MS_LOG(DEBUG) << "node's output has more than one users"; | |||
| return nullptr; | |||
| } | |||
| // 3. next_node is not nop node, not graph output and it has only one input which is memcpy's output | |||
| // 3. next_node is not nop node, not graph output and it has only one input which is tensor move's output | |||
| for (auto &item : next_nodes) { | |||
| auto next_node = item.first->cast<CNodePtr>(); | |||
| if (opt::IsNopNode(next_node)) { | |||
| @@ -77,6 +77,6 @@ const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, co | |||
| AnfAlgo::SetNodeAttr(kAttrLabelForInsertStreamActive, MakeValue(true), next_node); | |||
| } | |||
| return memcpy_cnode->input(1); | |||
| return tensor_move_node->input(1); | |||
| } | |||
| } // namespace mindspore::opt | |||
| @@ -13,21 +13,21 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
// Pattern-based optimizer pass (derives from PatternProcessPass) that overrides
// DefinePattern() and Process(). The constructor forwards a pass-name string and
// the multigraph flag (default true) to the base class.
// NOTE(review): the TensorMove-named constructor registers the pass as
// "getnext_tensormove_elimination" (no underscore between "tensor" and "move"),
// while the matching header path uses "tensor_move" -- confirm which spelling is intended.
| class GetnextMemcpyElimination : public PatternProcessPass { | |||
| class GetnextTensorMoveElimination : public PatternProcessPass { | |||
| public: | |||
| explicit GetnextMemcpyElimination(bool multigraph = true) | |||
| : PatternProcessPass("getnext_memcpy_elimination", multigraph) {} | |||
| ~GetnextMemcpyElimination() override = default; | |||
| explicit GetnextTensorMoveElimination(bool multigraph = true) | |||
| : PatternProcessPass("getnext_tensormove_elimination", multigraph) {} | |||
| ~GetnextTensorMoveElimination() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H | |||
| @@ -13,7 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h" | |||
| #include <vector> | |||
| #include "utils/utils.h" | |||
| #include "backend/session/anf_runtime_algorithm.h" | |||
| @@ -69,36 +69,36 @@ bool IsPartOutputsOfHcclOp(const AnfNodePtr &node, const CNodePtr &cur_hccl, con | |||
| } | |||
| } // namespace | |||
| AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const { | |||
| AnfNodePtr InsertTensorMoveForCascade::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(hccl_node); | |||
| std::vector<AnfNodePtr> memcpy_async_list; | |||
| std::vector<AnfNodePtr> tensor_move_list; | |||
| std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)}; | |||
| for (size_t i = 1; i < hccl_node->size(); ++i) { | |||
| auto input = hccl_node->input(i); | |||
| MS_EXCEPTION_IF_NULL(input); | |||
| // when input is also a hccl op and just part outputs of it linking with cur_hccl_op | |||
| if (IsPartOutputsOfHcclOp(input, hccl_node, graph)) { | |||
| auto memcpy_async = CreateMemcpyAsyncOp(graph, input); | |||
| if (memcpy_async == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Create memcpy_async op failed." | |||
| auto tensor_move = CreateTensorMoveOp(graph, input); | |||
| if (tensor_move == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Create tensor_move op failed." | |||
| << " trace: " << trace::DumpSourceLines(hccl_node); | |||
| } | |||
| if (AnfAlgo::IsNodeDynamicShape(input)) { | |||
| AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async); | |||
| AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move); | |||
| } | |||
| auto kernel_info = std::make_shared<device::KernelInfo>(); | |||
| memcpy_async->set_kernel_info(kernel_info); | |||
| tensor_move->set_kernel_info(kernel_info); | |||
| MS_EXCEPTION_IF_NULL(kernel_select_); | |||
| kernel_select_->SelectKernel(memcpy_async->cast<CNodePtr>()); | |||
| new_inputs.push_back(memcpy_async); | |||
| memcpy_async_list.push_back(memcpy_async); | |||
| kernel_select_->SelectKernel(tensor_move->cast<CNodePtr>()); | |||
| new_inputs.push_back(tensor_move); | |||
| tensor_move_list.push_back(tensor_move); | |||
| } else { | |||
| new_inputs.push_back(input); | |||
| } | |||
| } | |||
| if (!memcpy_async_list.empty()) { | |||
| if (!tensor_move_list.empty()) { | |||
| CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node); | |||
| new_hccl_node->set_inputs(new_inputs); | |||
| return new_hccl_node; | |||
| @@ -106,8 +106,8 @@ AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &gr | |||
| return nullptr; | |||
| } | |||
| const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| const AnfNodePtr InsertTensorMoveForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) { | |||
| return nullptr; | |||
| } | |||
| @@ -115,7 +115,7 @@ const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_g | |||
| if (!AnfAlgo::IsCommunicationOp(node)) { | |||
| return nullptr; | |||
| } | |||
| return InsertMemcpyAsync(func_graph, cnode); | |||
| return InsertTensorMove(func_graph, cnode); | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_ | |||
| #ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_ | |||
| #define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -22,18 +22,18 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
// Pattern-based optimizer pass (derives from PatternProcessPass) that overrides
// Process() only. The constructor forwards the pass name and the multigraph flag
// (default true) to the base class and creates the KernelSelect instance held in
// kernel_select_.
// The private helper takes the graph and an HCCL node and returns an AnfNodePtr;
// its definition is not part of this declaration.
| class InsertMemcpyAsyncForCascade : public PatternProcessPass { | |||
| class InsertTensorMoveForCascade : public PatternProcessPass { | |||
| public: | |||
| explicit InsertMemcpyAsyncForCascade(bool multigraph = true) | |||
| : PatternProcessPass("insert_memcpy_async_for_cascade", multigraph), | |||
| explicit InsertTensorMoveForCascade(bool multigraph = true) | |||
| : PatternProcessPass("insert_tensor_move_for_cascade", multigraph), | |||
| kernel_select_(std::make_shared<KernelSelect>()) {} | |||
| ~InsertMemcpyAsyncForCascade() override = default; | |||
| ~InsertTensorMoveForCascade() override = default; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| AnfNodePtr InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const; | |||
| AnfNodePtr InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const; | |||
| KernelSelectPtr kernel_select_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_OP_CASCADE_H_ | |||
| #endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_ | |||
| @@ -13,7 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h" | |||
| #include <vector> | |||
| #include <memory> | |||
| #include "backend/optimizer/ascend/ascend_helper.h" | |||
| @@ -22,14 +22,14 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
| AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { | |||
| AnfNodePtr InsertTensorMoveForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) { | |||
| if (func_graph == nullptr || node == nullptr) { | |||
| return nullptr; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(node); | |||
| if (output_num == 0) { | |||
| MS_LOG(DEBUG) << "Output number is zero, no need to insert memcpy_async!"; | |||
| MS_LOG(DEBUG) << "Output number is zero, no need to insert tensor_move!"; | |||
| return node; | |||
| } | |||
| @@ -39,9 +39,9 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co | |||
| for (size_t output_index = 0; output_index < output_num; ++output_index) { | |||
| auto tuple_get_item = CreatTupleGetItemNode(func_graph, node, output_index); | |||
| auto new_node = CreateMemcpyAsyncOp(func_graph, tuple_get_item); | |||
| auto new_node = CreateTensorMoveOp(func_graph, tuple_get_item); | |||
| if (new_node == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Create memcpy_async op failed!"; | |||
| MS_LOG(EXCEPTION) << "Create tensor move op failed!"; | |||
| } | |||
| if (AnfAlgo::IsNodeDynamicShape(tuple_get_item)) { | |||
| AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), new_node); | |||
| @@ -53,15 +53,15 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co | |||
| return make_tuple; | |||
| } | |||
// Builds the match pattern for this pass: a VectorRef of {primitive, Xs}, where the
// primitive is constructed from kGetNextOpName and Xs is a SeqVar (presumably
// matching an arbitrary list of inputs -- confirm against the pattern engine).
| const BaseRef InsertMemcpyAsyncForGetNext::DefinePattern() const { | |||
| const BaseRef InsertTensorMoveForGetNext::DefinePattern() const { | |||
| std::shared_ptr<Var> Xs = std::make_shared<SeqVar>(); | |||
| auto prim = std::make_shared<Primitive>(kGetNextOpName); | |||
| return VectorRef({prim, Xs}); | |||
| } | |||
| const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| const AnfNodePtr InsertTensorMoveForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| if (func_graph == nullptr || node == nullptr || !AnfAlgo::IsRealKernel(node)) { | |||
| return nullptr; | |||
| } | |||
| @@ -73,7 +73,7 @@ const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_g | |||
| } | |||
| AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cnode); | |||
| return InsertMemcpyAsyncForGetNextOutputs(func_graph, cnode); | |||
| return InsertTensorMoveForGetNextOutputs(func_graph, cnode); | |||
| } | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| @@ -14,22 +14,22 @@ | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_ | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
// Pattern-based optimizer pass (derives from PatternProcessPass) that overrides
// DefinePattern() and Process(). The constructor forwards the pass name and the
// multigraph flag (default true) to the base class; the class declares no data
// members of its own.
| class InsertMemcpyAsyncForGetNext : public PatternProcessPass { | |||
| class InsertTensorMoveForGetNext : public PatternProcessPass { | |||
| public: | |||
| explicit InsertMemcpyAsyncForGetNext(bool multigraph = true) | |||
| : PatternProcessPass("insert_memcpy_async_for_getnext", multigraph) {} | |||
| ~InsertMemcpyAsyncForGetNext() override = default; | |||
| explicit InsertTensorMoveForGetNext(bool multigraph = true) | |||
| : PatternProcessPass("insert_tensor_move_for_getnext", multigraph) {} | |||
| ~InsertTensorMoveForGetNext() override = default; | |||
| const BaseRef DefinePattern() const override; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_ | |||
| @@ -13,7 +13,7 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h" | |||
| #include <vector> | |||
| #include <set> | |||
| #include <string> | |||
| @@ -25,9 +25,9 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
| namespace { | |||
| // insert memcpy for some cnode even if not a Ref cnode | |||
| const std::set<std::string> kNeedInsertMemcpyOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName, | |||
| kLambUpdateWithLROpName}; | |||
| // Names of ops whose outputs always get a TensorMove inserted when they feed the current node, even if the producer is not a Ref cnode. | |||
| const std::set<std::string> kNeedInsertTensorMoveOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName, | |||
| kLambUpdateWithLROpName}; | |||
| bool IsParameterOrValueNode(const AnfNodePtr &node) { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| @@ -43,7 +43,7 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) { | |||
| // NodeUsersMap, for node B input i use node A, it will be one item in map with key: A, and value: (B, i) | |||
| bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) { | |||
| if (node_users.size() == 1) { | |||
| MS_LOG(INFO) << "This node only used once, no need to insert memcpy node."; | |||
| MS_LOG(INFO) << "This node only used once, no need to insert tensormove node."; | |||
| return false; | |||
| } | |||
| for (const auto &node_pair : node_users) { | |||
| @@ -53,13 +53,13 @@ bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) { | |||
| return true; | |||
| } | |||
| } | |||
| MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert memcpy node."; | |||
| MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert tensormove node."; | |||
| return false; | |||
| } | |||
| } // namespace | |||
| bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input, | |||
| const CNodePtr &cur_node) const { | |||
| bool InsertTensorMoveForHcclOp::NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input, | |||
| const CNodePtr &cur_node) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(input); | |||
| MS_EXCEPTION_IF_NULL(cur_node); | |||
| @@ -79,7 +79,7 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con | |||
| } | |||
| // when input is some special cnodes | |||
| if (kNeedInsertMemcpyOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertMemcpyOpSet.end()) { | |||
| if (kNeedInsertTensorMoveOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertTensorMoveOpSet.end()) { | |||
| return true; | |||
| } | |||
| @@ -96,29 +96,29 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con | |||
| return false; | |||
| } | |||
| void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const { | |||
| void InsertTensorMoveForHcclOp::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const { | |||
| MS_EXCEPTION_IF_NULL(graph); | |||
| MS_EXCEPTION_IF_NULL(hccl_node); | |||
| bool need_memcpy_async = false; | |||
| bool need_tensor_move_async = false; | |||
| std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)}; | |||
| for (size_t i = 1; i < hccl_node->size(); ++i) { | |||
| auto input = hccl_node->input(i); | |||
| if (NeedInsertMemcpy(graph, input, hccl_node)) { | |||
| auto memcpy_async = CreateMemcpyAsyncOp(graph, input); | |||
| if (memcpy_async == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Create memcpy_async op failed."; | |||
| if (NeedInsertTensorMove(graph, input, hccl_node)) { | |||
| auto tensor_move = CreateTensorMoveOp(graph, input); | |||
| if (tensor_move == nullptr) { | |||
| MS_LOG(EXCEPTION) << "Create tensor_move op failed."; | |||
| } | |||
| if (input->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(input)) { | |||
| AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async); | |||
| AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move); | |||
| } | |||
| new_inputs.push_back(memcpy_async); | |||
| need_memcpy_async = true; | |||
| new_inputs.push_back(tensor_move); | |||
| need_tensor_move_async = true; | |||
| } else { | |||
| new_inputs.push_back(input); | |||
| } | |||
| } | |||
| if (need_memcpy_async) { | |||
| if (need_tensor_move_async) { | |||
| CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node); | |||
| new_hccl_node->set_inputs(new_inputs); | |||
| auto manager = graph->manager(); | |||
| @@ -129,15 +129,15 @@ void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, co | |||
| } | |||
| } | |||
// Pass entry point for one candidate node.
// Returns nullptr without doing anything when func_graph or node is null, when node
// is not a CNode, or when AnfAlgo::IsCommunicationOp(node) is false.
// Otherwise it calls the insertion helper on the node cast to CNodePtr and still
// returns nullptr: the helper returns void, so any graph change is made inside it
// (presumably through the graph manager -- confirm in the helper).
| const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| const AnfNodePtr InsertTensorMoveForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node, | |||
| const EquivPtr &) const { | |||
| if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) { | |||
| return nullptr; | |||
| } | |||
| if (!AnfAlgo::IsCommunicationOp(node)) { | |||
| return nullptr; | |||
| } | |||
| InsertMemcpyAsync(func_graph, node->cast<CNodePtr>()); | |||
| InsertTensorMove(func_graph, node->cast<CNodePtr>()); | |||
| return nullptr; | |||
| } | |||
| } // namespace opt | |||
| @@ -13,8 +13,8 @@ | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_ | |||
| #include <memory> | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| @@ -22,19 +22,19 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
// Pattern-based optimizer pass (derives from PatternProcessPass) that overrides
// Process() only. The constructor forwards the pass name and the multigraph flag
// (default true) to the base class and creates the KernelQuery instance held in
// kernel_query_.
// Private helpers: one performs the insertion for an HCCL node (returns void), the
// other decides per input whether an insertion is needed (returns bool).
| class InsertMemcpyAsyncForHcclOp : public PatternProcessPass { | |||
| class InsertTensorMoveForHcclOp : public PatternProcessPass { | |||
| public: | |||
| explicit InsertMemcpyAsyncForHcclOp(bool multigraph = true) | |||
| : PatternProcessPass("insert_memcpy_async_for_hccl_op", multigraph), | |||
| explicit InsertTensorMoveForHcclOp(bool multigraph = true) | |||
| : PatternProcessPass("insert_tensor_move_for_hccl_op", multigraph), | |||
| kernel_query_(std::make_shared<KernelQuery>()) {} | |||
| ~InsertMemcpyAsyncForHcclOp() override = default; | |||
| ~InsertTensorMoveForHcclOp() override = default; | |||
| const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override; | |||
| private: | |||
| void InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const; | |||
| bool NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const; | |||
| void InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const; | |||
| bool NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const; | |||
| KernelQueryPtr kernel_query_; | |||
| }; | |||
| } // namespace opt | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_ | |||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_ | |||
| @@ -22,75 +22,75 @@ | |||
| #include "utils/utils.h" | |||
| #include "backend/kernel_compiler/kernel_build_info.h" | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h" | |||
| #include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
// Test fixture (derives from BackendCommon) for the getnext elimination pass tests.
// get_py_fun_ is a UT::PyFuncGraphFetcher constructed with the dotted name below
// (presumably the Python module that defines the before/after graphs -- confirm)
// and the flag `true`.
| class TestGetNextMemcpyElimination : public BackendCommon { | |||
| class TestGetNextTensorMoveElimination : public BackendCommon { | |||
| public: | |||
| TestGetNextMemcpyElimination() : get_py_fun_("gtest_input.pre_activate.getnext_memcpy_elimination_test", true) {} | |||
| TestGetNextTensorMoveElimination() : get_py_fun_("gtest_input.pre_activate.getnext_tensor_move_elimination_test", true) {} | |||
| public: | |||
| UT::PyFuncGraphFetcher get_py_fun_; | |||
| }; | |||
// Fetches the "before" graph, runs a GraphOptimizer whose single PassManager holds
// only the elimination pass, and expects the optimized graph to equal the "after" graph.
// ASSERT_TRUE aborts the test early if the "before" graph could not be built.
// NOTE(review): the TensorMove-named test is spelled "test_getnext_tensormove_elimination",
// while the graph name it fetches and the other tests in this file use "tensor_move" --
// confirm whether the test name should match.
| TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "before"); | |||
| TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensormove_elimination) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "before"); | |||
| ASSERT_TRUE(g_before != nullptr); | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::GetnextMemcpyElimination>(); | |||
| auto pass = std::make_shared<opt::GetnextTensorMoveElimination>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(g_before); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
// "no_attr" case: fetches the "before" graph, runs a GraphOptimizer whose single
// PassManager holds only the elimination pass, and expects the optimized graph to
// equal the "after" graph. What the Python-side graphs contain is not shown here.
| TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_no_attr) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "before"); | |||
| TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_no_attr) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "before"); | |||
| ASSERT_TRUE(g_before != nullptr); | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::GetnextMemcpyElimination>(); | |||
| auto pass = std::make_shared<opt::GetnextTensorMoveElimination>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(g_before); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
// "multi_users" case: fetches the "before" graph, runs a GraphOptimizer whose single
// PassManager holds only the elimination pass, and expects the optimized graph to
// equal the "after" graph. What the Python-side graphs contain is not shown here.
| TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_memcpy_multi_users) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "before"); | |||
| TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_tensor_move_multi_users) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "before"); | |||
| ASSERT_TRUE(g_before != nullptr); | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::GetnextMemcpyElimination>(); | |||
| auto pass = std::make_shared<opt::GetnextTensorMoveElimination>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(g_before); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_next_multi_inputs) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "before"); | |||
| TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_next_multi_inputs) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "before"); | |||
| ASSERT_TRUE(g_before != nullptr); | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::GetnextMemcpyElimination>(); | |||
| auto pass = std::make_shared<opt::GetnextTensorMoveElimination>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(g_before); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| @@ -24,23 +24,23 @@ | |||
| #include "utils/utils.h" | |||
| #include "backend/kernel_compiler/kernel_build_info.h" | |||
| #include "backend/optimizer/common/optimizer.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h" | |||
| namespace mindspore { | |||
| namespace opt { | |||
| using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder; | |||
| class TestHWInsertMemcpyAsyncForGetNext : public BackendCommon { | |||
| class TestHWInsertTensorMoveForGetNext : public BackendCommon { | |||
| public: | |||
| TestHWInsertMemcpyAsyncForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_getnext", true) {} | |||
| ~TestHWInsertMemcpyAsyncForGetNext() override = default; | |||
| TestHWInsertTensorMoveForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_getnext", true) {} | |||
| ~TestHWInsertTensorMoveForGetNext() override = default; | |||
| public: | |||
| UT::PyFuncGraphFetcher get_py_fun_; | |||
| }; | |||
| TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_multi_output) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_before"); | |||
| TEST_F(TestHWInsertTensorMoveForGetNext, test_insert_tensor_move_for_getnext_multi_output) { | |||
| FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_before"); | |||
| AbstractBasePtrList args_spec_list{}; | |||
| auto kernel_graph = GetKernelGraph(g_before, args_spec_list); | |||
| @@ -57,11 +57,11 @@ TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_m | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| pm->AddPass(std::make_shared<opt::InsertMemcpyAsyncForGetNext>()); | |||
| pm->AddPass(std::make_shared<opt::InsertTensorMoveForGetNext>()); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kernel_graph); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| } // namespace opt | |||
| @@ -25,24 +25,24 @@ | |||
| #include "ir/param_info.h" | |||
| #define private public | |||
| #define protected public | |||
| #include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h" | |||
| #include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h" | |||
| #undef private | |||
| #undef protected | |||
| namespace mindspore { | |||
| namespace opt { | |||
| class TestHWInsertMemcpyForHccl : public BackendCommon { | |||
| class TestHWInsertTensorMoveForHccl : public BackendCommon { | |||
| public: | |||
| TestHWInsertMemcpyForHccl() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_hccl_op", true) {} | |||
| ~TestHWInsertMemcpyForHccl() override = default; | |||
| TestHWInsertTensorMoveForHccl() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_hccl_op", true) {} | |||
| ~TestHWInsertTensorMoveForHccl() override = default; | |||
| public: | |||
| UT::PyFuncGraphFetcher get_py_fun_; | |||
| }; | |||
| class MockInsertMemcpyForHcclKernelQuery : public KernelQuery { | |||
| class MockInsertTensorMoveForHcclKernelQuery : public KernelQuery { | |||
| public: | |||
| MockInsertMemcpyForHcclKernelQuery() = default; | |||
| ~MockInsertMemcpyForHcclKernelQuery() override = default; | |||
| MockInsertTensorMoveForHcclKernelQuery() = default; | |||
| ~MockInsertTensorMoveForHcclKernelQuery() override = default; | |||
| bool IsTbeRef(const AnfNodePtr &node) override { | |||
| MS_EXCEPTION_IF_NULL(node); | |||
| if (!node->isa<CNode>()) { | |||
| @@ -53,9 +53,9 @@ class MockInsertMemcpyForHcclKernelQuery : public KernelQuery { | |||
| } | |||
| }; | |||
| TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) { | |||
| TEST_F(TestHWInsertTensorMoveForHccl, test_cond1_no_insert) { | |||
| get_py_fun_.SetDoResolve(true); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond1", "before2"); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond1", "before2"); | |||
| ASSERT_TRUE(g != nullptr); | |||
| std::vector<int64_t> shp_x{1, 64, 112, 112}; | |||
| auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x); | |||
| @@ -66,7 +66,7 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>(); | |||
| auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kg); | |||
| @@ -74,9 +74,9 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) { | |||
| EXPECT_TRUE(CheckEqualGraph(origin_graph, new_graph)); | |||
| } | |||
| TEST_F(TestHWInsertMemcpyForHccl, test_cond2) { | |||
| TEST_F(TestHWInsertTensorMoveForHccl, test_cond2) { | |||
| get_py_fun_.SetDoResolve(true); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "before"); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "before"); | |||
| ASSERT_TRUE(g != nullptr); | |||
| std::vector<int64_t> shp_x{1, 64, 112, 112}; | |||
| auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x); | |||
| @@ -90,19 +90,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond2) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>(); | |||
| auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kg); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| TEST_F(TestHWInsertMemcpyForHccl, test_cond3) { | |||
| TEST_F(TestHWInsertTensorMoveForHccl, test_cond3) { | |||
| get_py_fun_.SetDoResolve(true); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "before"); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "before"); | |||
| ASSERT_TRUE(g != nullptr); | |||
| std::vector<int64_t> shp_x{3, 2}; | |||
| auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x); | |||
| @@ -112,19 +112,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond3) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>(); | |||
| auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kg); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| TEST_F(TestHWInsertMemcpyForHccl, test_cond4) { | |||
| TEST_F(TestHWInsertTensorMoveForHccl, test_cond4) { | |||
| get_py_fun_.SetDoResolve(true); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "before"); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "before"); | |||
| ASSERT_TRUE(g != nullptr); | |||
| std::vector<int64_t> shp_x{1, 64, 112, 112}; | |||
| auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x); | |||
| @@ -139,19 +139,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond4) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>(); | |||
| auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kg); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| TEST_F(TestHWInsertMemcpyForHccl, test_cond5) { | |||
| TEST_F(TestHWInsertTensorMoveForHccl, test_cond5) { | |||
| get_py_fun_.SetDoResolve(true); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "before"); | |||
| FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "before"); | |||
| ASSERT_TRUE(g != nullptr); | |||
| std::vector<int64_t> shp_x{1, 64, 112, 112}; | |||
| auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x); | |||
| @@ -166,14 +166,14 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond5) { | |||
| auto optimizer = std::make_shared<opt::GraphOptimizer>(); | |||
| auto pm = std::make_shared<opt::PassManager>(); | |||
| auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>(); | |||
| auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>(); | |||
| pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>(); | |||
| pm->AddPass(pass); | |||
| optimizer->AddPassManager(pm); | |||
| auto new_graph = optimizer->Optimize(kg); | |||
| kg->SetExecOrderByDefault(); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "after"); | |||
| FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "after"); | |||
| EXPECT_TRUE(CheckEqualGraph(g_after, new_graph)); | |||
| } | |||
| } // namespace opt | |||
| @@ -18,9 +18,9 @@ from mindspore.ops import Primitive | |||
| from mindspore.ops import operations as P | |||
| get_next = P.GetNext([ms.float32], [[1, 64, 112, 112]], 1, "") | |||
| memcpy_async_attr = Primitive('memcpy_async') | |||
| memcpy_async_attr.add_prim_attr("label_for_insert_stream_active", True) | |||
| memcpy_async = Primitive('memcpy_async') | |||
| tensor_move_attr = Primitive('TensorMove') | |||
| tensor_move_attr.add_prim_attr("label_for_insert_stream_active", True) | |||
| tensor_move = Primitive('tensor_move') | |||
| cast = P.Cast() | |||
| add = P.Add() | |||
| @@ -36,13 +36,13 @@ class FnDict: | |||
| return self.fnDict[name] | |||
| def test_getnext_memcpy_elimination(tag): | |||
| def test_getnext_tensor_move_elimination(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| def before(): | |||
| res = get_next() | |||
| res = memcpy_async_attr(res) | |||
| res = tensor_move_attr(res) | |||
| res = cast(res) | |||
| res = add(res) | |||
| return res | |||
| @@ -57,63 +57,63 @@ def test_getnext_memcpy_elimination(tag): | |||
| return fns[tag] | |||
| def test_getnext_memcpy_elimination_no_attr(tag): | |||
| def test_getnext_tensor_move_elimination_no_attr(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| def before(): | |||
| res = get_next() | |||
| res = memcpy_async(res) | |||
| res = tensor_move(res) | |||
| res = cast(res) | |||
| return res | |||
| @fns | |||
| def after(): | |||
| res = get_next() | |||
| res = memcpy_async(res) | |||
| res = tensor_move(res) | |||
| res = cast(res) | |||
| return res | |||
| return fns[tag] | |||
| def test_getnext_memcpy_elimination_memcpy_multi_users(tag): | |||
| def test_getnext_tensor_move_elimination_tensor_move_multi_users(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| def before(): | |||
| res = get_next() | |||
| memcpy_out = memcpy_async_attr(res) | |||
| res = cast(memcpy_out) | |||
| res = add(memcpy_out, res) | |||
| tensor_move_out = tensor_move_attr(res) | |||
| res = cast(tensor_move_out) | |||
| res = add(tensor_move_out, res) | |||
| return res | |||
| @fns | |||
| def after(): | |||
| res = get_next() | |||
| memcpy_out = memcpy_async_attr(res) | |||
| res = cast(memcpy_out) | |||
| res = add(memcpy_out, res) | |||
| tensor_move_out = tensor_move_attr(res) | |||
| res = cast(tensor_move_out) | |||
| res = add(tensor_move_out, res) | |||
| return res | |||
| return fns[tag] | |||
| def test_getnext_memcpy_elimination_next_multi_inputs(tag): | |||
| def test_getnext_tensor_move_elimination_next_multi_inputs(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| def before(): | |||
| res = get_next() | |||
| memcpy_out = memcpy_async_attr(res) | |||
| res = add(memcpy_out, res) | |||
| tensormove_out = tensor_move_attr(res) | |||
| res = add(tensormove_out, res) | |||
| return res | |||
| @fns | |||
| def after(): | |||
| res = get_next() | |||
| memcpy_out = memcpy_async_attr(res) | |||
| res = add(memcpy_out, res) | |||
| tensormove_out = tensor_move_attr(res) | |||
| res = add(tensormove_out, res) | |||
| return res | |||
| return fns[tag] | |||
| @@ -19,7 +19,7 @@ from mindspore.ops import _constants as Constants | |||
| from mindspore.ops import operations as P | |||
| get_next = P.GetNext([ms.float32, ms.int32], [[32, 64], [32]], 2, "") | |||
| memcpy_async = Primitive('memcpy_async') | |||
| tensor_move = Primitive('TensorMove') | |||
| make_tuple = Primitive('MakeTuple') | |||
| tuple_getitem = Primitive(Constants.kTupleGetItem) | |||
| @@ -35,7 +35,7 @@ class FnDict: | |||
| return self.fnDict[name] | |||
| def test_insert_memcpy_async_for_getnext(tag): | |||
| def test_insert_tensor_move_for_getnext(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -48,9 +48,9 @@ def test_insert_memcpy_async_for_getnext(tag): | |||
| res = get_next() | |||
| data = tuple_getitem(res, 0) | |||
| label = tuple_getitem(res, 1) | |||
| memcpy_async_data = memcpy_async(data) | |||
| memcpy_async_label = memcpy_async(label) | |||
| bind_tuple = make_tuple(memcpy_async_data, memcpy_async_label) | |||
| tensor_move_data = tensor_move(data) | |||
| tensor_move_label = tensor_move(label) | |||
| bind_tuple = make_tuple(tensor_move_data, tensor_move_label) | |||
| get_item0 = tuple_getitem(bind_tuple, 0) | |||
| get_item1 = tuple_getitem(bind_tuple, 1) | |||
| bind_tuple = make_tuple(make_tuple(get_item0, get_item1)) | |||
| @@ -20,7 +20,7 @@ from mindspore.ops import _constants as Constants | |||
| depend = P.Depend() | |||
| all_reduce = P.AllReduce() | |||
| broadcast = P.Broadcast(1) | |||
| memcpy_async = Primitive('memcpy_async') | |||
| tensor_move = Primitive('TensorMove') | |||
| make_tuple = Primitive('MakeTuple') | |||
| tuple_getitem = Primitive(Constants.kTupleGetItem) | |||
| assign_add = P.AssignAdd() | |||
| @@ -39,7 +39,7 @@ class FnDict: | |||
| return self.fnDict[name] | |||
| def test_insert_memcpy_async_for_hccl_op_cond1(tag): | |||
| def test_insert_tensor_move_for_hccl_op_cond1(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -57,14 +57,14 @@ def test_insert_memcpy_async_for_hccl_op_cond1(tag): | |||
| @fns | |||
| def after(x): | |||
| res1 = relu(x) | |||
| res2 = memcpy_async(res1) | |||
| res2 = tensor_move(res1) | |||
| res2 = all_reduce(res2) | |||
| return make_tuple(make_tuple(res1, res2)) | |||
| return fns[tag] | |||
| def test_insert_memcpy_async_for_hccl_op_cond2(tag): | |||
| def test_insert_tensor_move_for_hccl_op_cond2(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -74,14 +74,14 @@ def test_insert_memcpy_async_for_hccl_op_cond2(tag): | |||
| @fns | |||
| def after(x): | |||
| res = memcpy_async(x) | |||
| res = tensor_move(x) | |||
| res = all_reduce(res) | |||
| return make_tuple(res) | |||
| return fns[tag] | |||
| def test_insert_memcpy_async_for_hccl_op_cond3(tag): | |||
| def test_insert_tensor_move_for_hccl_op_cond3(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -93,14 +93,14 @@ def test_insert_memcpy_async_for_hccl_op_cond3(tag): | |||
| @fns | |||
| def after(a, b): | |||
| res = assign_add(a, b) | |||
| res = memcpy_async(res) | |||
| res = tensor_move(res) | |||
| res = all_reduce(res) | |||
| return make_tuple(res) | |||
| return fns[tag] | |||
| def test_insert_memcpy_async_for_hccl_op_cond4(tag): | |||
| def test_insert_tensor_move_for_hccl_op_cond4(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -113,7 +113,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag): | |||
| @fns | |||
| def after(a, b): | |||
| x = relu(a) | |||
| y1 = memcpy_async(b) | |||
| y1 = tensor_move(b) | |||
| y2 = all_reduce(y1) | |||
| res = depend(x, y2) | |||
| return make_tuple(res) | |||
| @@ -121,7 +121,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag): | |||
| return fns[tag] | |||
| def test_insert_memcpy_async_for_hccl_op_cond5(tag): | |||
| def test_insert_tensor_move_for_hccl_op_cond5(tag): | |||
| fns = FnDict() | |||
| @fns | |||
| @@ -134,8 +134,8 @@ def test_insert_memcpy_async_for_hccl_op_cond5(tag): | |||
| @fns | |||
| def after(a, b, c): | |||
| x = relu(a) | |||
| m1 = memcpy_async(b) | |||
| m2 = memcpy_async(c) | |||
| m1 = tensor_move(b) | |||
| m2 = tensor_move(c) | |||
| y = broadcast(m1, m2) | |||
| y0 = tuple_getitem(y, 0) | |||
| y1 = tuple_getitem(y, 1) | |||