Browse Source

replace memcpy_async with tensor move

pull/15204/head
laiyongqiang 5 years ago
parent
commit
1533435015
17 changed files with 199 additions and 199 deletions
  1. +8
    -8
      mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc
  2. +2
    -2
      mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc
  3. +1
    -1
      mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h
  4. +17
    -17
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.cc
  5. +7
    -7
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h
  6. +15
    -15
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.cc
  7. +8
    -8
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h
  8. +9
    -9
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.cc
  9. +7
    -7
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h
  10. +22
    -22
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.cc
  11. +9
    -9
      mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h
  12. +19
    -19
      tests/ut/cpp/pre_activate/ascend/enhancer/getnext_tensor_move_elimination_test.cc
  13. +8
    -8
      tests/ut/cpp/pre_activate/ascend/enhancer/insert_tensor_move_for_getnext_test.cc
  14. +30
    -30
      tests/ut/cpp/pre_activate/ascend/enhancer/insert_tensor_move_for_hccl_op_test.cc
  15. +20
    -20
      tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_tensor_move_elimination_test.py
  16. +5
    -5
      tests/ut/cpp/python_input/gtest_input/pre_activate/insert_tensor_move_for_getnext.py
  17. +12
    -12
      tests/ut/cpp/python_input/gtest_input/pre_activate/insert_tensor_move_for_hccl_op.py

+ 8
- 8
mindspore/ccsrc/backend/optimizer/ascend/ascend_backend_optimization.cc View File

@@ -100,13 +100,13 @@
#include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h"
#include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h"
#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
#include "backend/optimizer/ascend/ir_fission/addn_fission.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
#include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h"
#include "backend/optimizer/ascend/ir_fission/split_fission.h"
#include "backend/optimizer/ascend/ir_fission/splitv_fission.h"
@@ -292,11 +292,11 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap

if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
ConfigManager::GetInstance().iter_num() > 1) {
ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForGetNext>());
ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>());
ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
}
ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForHcclOp>());
ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>());
ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
@@ -370,7 +370,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
other_pm->AddPass(std::make_shared<ReduceScatterFusion>());
other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
other_pm->AddPass(std::make_shared<BroadcastFusion>());
other_pm->AddPass(std::make_shared<InsertMemcpyAsyncForCascade>());
other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
other_pm->AddPass(std::make_shared<SplitOpOptimizer>());
@@ -387,7 +387,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
ConfigManager::GetInstance().iter_num() > 1) {
other2_pm->AddPass(std::make_shared<GetnextMemcpyElimination>());
other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>());
}
other2_pm->AddPass(std::make_shared<CheckConsistency>());
optimizer2->AddPassManager(other2_pm);


+ 2
- 2
mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.cc View File

@@ -383,10 +383,10 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod
return new_node;
}

AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
auto prim = std::make_shared<Primitive>(kMemCpyAsyncOpName);
auto prim = std::make_shared<Primitive>(kTensorMoveOpName);
std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim), node};
auto new_node = graph->NewCNode(new_node_inputs);
MS_EXCEPTION_IF_NULL(new_node);


+ 1
- 1
mindspore/ccsrc/backend/optimizer/ascend/ascend_helper.h View File

@@ -108,7 +108,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP

CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnode);

AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node);
AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node);

AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const KernelSelectPtr &kernel_select, size_t insert_index, bool is_insert_input);


mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.cc → mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.cc View File

@@ -14,49 +14,49 @@
* limitations under the License.
*/

#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"
#include "frontend/optimizer/opt.h"

namespace mindspore::opt {

const BaseRef GetnextMemcpyElimination::DefinePattern() const {
auto prim_memcpy = std::make_shared<Primitive>(kMemCpyAsyncOpName);
const BaseRef GetnextTensorMoveElimination::DefinePattern() const {
auto prim_tensor_move = std::make_shared<Primitive>(kTensorMoveOpName);
VarPtr x = std::make_shared<SeqVar>();
VectorRef memcpy_async({prim_memcpy, x});
return memcpy_async;
VectorRef tensor_move({prim_tensor_move, x});
return tensor_move;
}

const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
const AnfNodePtr GetnextTensorMoveElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
if (graph == nullptr || node == nullptr || equiv == nullptr) {
return nullptr;
}
auto memcpy_cnode = node->cast<CNodePtr>();
if (memcpy_cnode == nullptr) {
auto tensor_move_node = node->cast<CNodePtr>();
if (tensor_move_node == nullptr) {
return nullptr;
}

// 1. memcpy has attr kAttrLabelForInsertStreamActive
if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, memcpy_cnode)) {
// 1. tensor move has attr kAttrLabelForInsertStreamActive
if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, tensor_move_node)) {
MS_LOG(DEBUG) << "node has no label_for_insert_stream_active attr";
return nullptr;
}

// 2. memcpy's output has only one user next_node
// 2. tensor move's output has only one user next_node
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
if (manager->node_users().find(memcpy_cnode) == manager->node_users().end()) {
MS_LOG(EXCEPTION) << "memcpy has no output in manager";
if (manager->node_users().find(tensor_move_node) == manager->node_users().end()) {
MS_LOG(EXCEPTION) << "tensor move has no output in manager";
}
auto next_nodes = manager->node_users()[memcpy_cnode];
auto next_nodes = manager->node_users()[tensor_move_node];
if (next_nodes.size() > 1) {
MS_LOG(DEBUG) << "node's output has more than one users";
return nullptr;
}

// 3. next_node is not nop node, not graph output and it has only one input which is memcpy's output
// 3. next_node is not nop node, not graph output and it has only one input which is tensor move's output
for (auto &item : next_nodes) {
auto next_node = item.first->cast<CNodePtr>();
if (opt::IsNopNode(next_node)) {
@@ -77,6 +77,6 @@ const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, co
AnfAlgo::SetNodeAttr(kAttrLabelForInsertStreamActive, MakeValue(true), next_node);
}

return memcpy_cnode->input(1);
return tensor_move_node->input(1);
}
} // namespace mindspore::opt

mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h → mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h View File

@@ -13,21 +13,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H

#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
class GetnextMemcpyElimination : public PatternProcessPass {
class GetnextTensorMoveElimination : public PatternProcessPass {
public:
explicit GetnextMemcpyElimination(bool multigraph = true)
: PatternProcessPass("getnext_memcpy_elimination", multigraph) {}
~GetnextMemcpyElimination() override = default;
explicit GetnextTensorMoveElimination(bool multigraph = true)
: PatternProcessPass("getnext_tensormove_elimination", multigraph) {}
~GetnextTensorMoveElimination() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.cc → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.cc View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
#include <vector>
#include "utils/utils.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -69,36 +69,36 @@ bool IsPartOutputsOfHcclOp(const AnfNodePtr &node, const CNodePtr &cur_hccl, con
}
} // namespace

AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
AnfNodePtr InsertTensorMoveForCascade::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(hccl_node);
std::vector<AnfNodePtr> memcpy_async_list;
std::vector<AnfNodePtr> tensor_move_list;
std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
for (size_t i = 1; i < hccl_node->size(); ++i) {
auto input = hccl_node->input(i);
MS_EXCEPTION_IF_NULL(input);
// when input is also a hccl op and just part outputs of it linking with cur_hccl_op
if (IsPartOutputsOfHcclOp(input, hccl_node, graph)) {
auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
if (memcpy_async == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed."
auto tensor_move = CreateTensorMoveOp(graph, input);
if (tensor_move == nullptr) {
MS_LOG(EXCEPTION) << "Create tensor_move op failed."
<< " trace: " << trace::DumpSourceLines(hccl_node);
}
if (AnfAlgo::IsNodeDynamicShape(input)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
}
auto kernel_info = std::make_shared<device::KernelInfo>();
memcpy_async->set_kernel_info(kernel_info);
tensor_move->set_kernel_info(kernel_info);
MS_EXCEPTION_IF_NULL(kernel_select_);
kernel_select_->SelectKernel(memcpy_async->cast<CNodePtr>());
new_inputs.push_back(memcpy_async);
memcpy_async_list.push_back(memcpy_async);
kernel_select_->SelectKernel(tensor_move->cast<CNodePtr>());
new_inputs.push_back(tensor_move);
tensor_move_list.push_back(tensor_move);
} else {
new_inputs.push_back(input);
}
}

if (!memcpy_async_list.empty()) {
if (!tensor_move_list.empty()) {
CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
new_hccl_node->set_inputs(new_inputs);
return new_hccl_node;
@@ -106,8 +106,8 @@ AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &gr
return nullptr;
}

const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
return nullptr;
}
@@ -115,7 +115,7 @@ const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_g
if (!AnfAlgo::IsCommunicationOp(node)) {
return nullptr;
}
return InsertMemcpyAsync(func_graph, cnode);
return InsertTensorMove(func_graph, cnode);
}
} // namespace opt
} // namespace mindspore

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_

#include <memory>
#include "backend/optimizer/common/optimizer.h"
@@ -22,18 +22,18 @@

namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForCascade : public PatternProcessPass {
class InsertTensorMoveForCascade : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForCascade(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_cascade", multigraph),
explicit InsertTensorMoveForCascade(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_cascade", multigraph),
kernel_select_(std::make_shared<KernelSelect>()) {}
~InsertMemcpyAsyncForCascade() override = default;
~InsertTensorMoveForCascade() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

private:
AnfNodePtr InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
AnfNodePtr InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
KernelSelectPtr kernel_select_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_OP_CASCADE_H_
#endif  // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.cc → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.cc View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
#include <vector>
#include <memory>
#include "backend/optimizer/ascend/ascend_helper.h"
@@ -22,14 +22,14 @@

namespace mindspore {
namespace opt {
AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
AnfNodePtr InsertTensorMoveForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
if (func_graph == nullptr || node == nullptr) {
return nullptr;
}

size_t output_num = AnfAlgo::GetOutputTensorNum(node);
if (output_num == 0) {
MS_LOG(DEBUG) << "Output number is zero, no need to insert memcpy_async!";
MS_LOG(DEBUG) << "Output number is zero, no need to insert tensor_move!";
return node;
}

@@ -39,9 +39,9 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co

for (size_t output_index = 0; output_index < output_num; ++output_index) {
auto tuple_get_item = CreatTupleGetItemNode(func_graph, node, output_index);
auto new_node = CreateMemcpyAsyncOp(func_graph, tuple_get_item);
auto new_node = CreateTensorMoveOp(func_graph, tuple_get_item);
if (new_node == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed!";
MS_LOG(EXCEPTION) << "Create tensor move op failed!";
}
if (AnfAlgo::IsNodeDynamicShape(tuple_get_item)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), new_node);
@@ -53,15 +53,15 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co
return make_tuple;
}

const BaseRef InsertMemcpyAsyncForGetNext::DefinePattern() const {
const BaseRef InsertTensorMoveForGetNext::DefinePattern() const {
std::shared_ptr<Var> Xs = std::make_shared<SeqVar>();
auto prim = std::make_shared<Primitive>(kGetNextOpName);

return VectorRef({prim, Xs});
}

const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !AnfAlgo::IsRealKernel(node)) {
return nullptr;
}
@@ -73,7 +73,7 @@ const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_g
}
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cnode);

return InsertMemcpyAsyncForGetNextOutputs(func_graph, cnode);
return InsertTensorMoveForGetNextOutputs(func_graph, cnode);
}
} // namespace opt
} // namespace mindspore

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h View File

@@ -14,22 +14,22 @@
* limitations under the License.
*/

#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_

#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForGetNext : public PatternProcessPass {
class InsertTensorMoveForGetNext : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForGetNext(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_getnext", multigraph) {}
~InsertMemcpyAsyncForGetNext() override = default;
explicit InsertTensorMoveForGetNext(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_getnext", multigraph) {}
~InsertTensorMoveForGetNext() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
} // namespace opt
} // namespace mindspore

#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.cc → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.cc View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#include <vector>
#include <set>
#include <string>
@@ -25,9 +25,9 @@
namespace mindspore {
namespace opt {
namespace {
// insert memcpy for some cnode even if not a Ref cnode
const std::set<std::string> kNeedInsertMemcpyOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
kLambUpdateWithLROpName};
// insert tensormove for some cnode even if not a Ref cnode
const std::set<std::string> kNeedInsertTensorMoveOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
kLambUpdateWithLROpName};

bool IsParameterOrValueNode(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
@@ -43,7 +43,7 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) {
// NodeUsersMap, for node B input i use node A, it will be one item in map with key: A, and value: (B, i)
bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
if (node_users.size() == 1) {
MS_LOG(INFO) << "This node only used once, no need to insert memcpy node.";
MS_LOG(INFO) << "This node only used once, no need to insert tensor move node.";
return false;
}
for (const auto &node_pair : node_users) {
@@ -53,13 +53,13 @@ bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
return true;
}
}
MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert memcpy node.";
MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert tensor move node.";
return false;
}
} // namespace

bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input,
const CNodePtr &cur_node) const {
bool InsertTensorMoveForHcclOp::NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input,
const CNodePtr &cur_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(input);
MS_EXCEPTION_IF_NULL(cur_node);
@@ -79,7 +79,7 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
}

// when input is some special cnodes
if (kNeedInsertMemcpyOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertMemcpyOpSet.end()) {
if (kNeedInsertTensorMoveOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertTensorMoveOpSet.end()) {
return true;
}

@@ -96,29 +96,29 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
return false;
}

void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
void InsertTensorMoveForHcclOp::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(hccl_node);
bool need_memcpy_async = false;
bool need_tensor_move_async = false;
std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
for (size_t i = 1; i < hccl_node->size(); ++i) {
auto input = hccl_node->input(i);
if (NeedInsertMemcpy(graph, input, hccl_node)) {
auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
if (memcpy_async == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed.";
if (NeedInsertTensorMove(graph, input, hccl_node)) {
auto tensor_move = CreateTensorMoveOp(graph, input);
if (tensor_move == nullptr) {
MS_LOG(EXCEPTION) << "Create tensor_move op failed.";
}
if (input->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(input)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
}
new_inputs.push_back(memcpy_async);
need_memcpy_async = true;
new_inputs.push_back(tensor_move);
need_tensor_move_async = true;
} else {
new_inputs.push_back(input);
}
}

if (need_memcpy_async) {
if (need_tensor_move_async) {
CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
new_hccl_node->set_inputs(new_inputs);
auto manager = graph->manager();
@@ -129,15 +129,15 @@ void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, co
}
}

const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
return nullptr;
}
if (!AnfAlgo::IsCommunicationOp(node)) {
return nullptr;
}
InsertMemcpyAsync(func_graph, node->cast<CNodePtr>());
InsertTensorMove(func_graph, node->cast<CNodePtr>());
return nullptr;
}
} // namespace opt

mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h → mindspore/ccsrc/backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_

#include <memory>
#include "backend/optimizer/common/optimizer.h"
@@ -22,19 +22,19 @@

namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForHcclOp : public PatternProcessPass {
class InsertTensorMoveForHcclOp : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForHcclOp(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_hccl_op", multigraph),
explicit InsertTensorMoveForHcclOp(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_hccl_op", multigraph),
kernel_query_(std::make_shared<KernelQuery>()) {}
~InsertMemcpyAsyncForHcclOp() override = default;
~InsertTensorMoveForHcclOp() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

private:
void InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
bool NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
void InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
bool NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
KernelQueryPtr kernel_query_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_

tests/ut/cpp/pre_activate/ascend/enhancer/getnext_memcpy_elimination.cc → tests/ut/cpp/pre_activate/ascend/enhancer/getnext_tensor_move_elimination_test.cc View File

@@ -22,75 +22,75 @@
#include "utils/utils.h"
#include "backend/kernel_compiler/kernel_build_info.h"
#include "backend/optimizer/common/optimizer.h"
#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"

namespace mindspore {
namespace opt {
class TestGetNextMemcpyElimination : public BackendCommon {
class TestGetNextTensorMoveElimination : public BackendCommon {
public:
TestGetNextMemcpyElimination() : get_py_fun_("gtest_input.pre_activate.getnext_memcpy_elimination_test", true) {}
TestGetNextTensorMoveElimination() : get_py_fun_("gtest_input.pre_activate.getnext_tensor_move_elimination_test", true) {}

public:
UT::PyFuncGraphFetcher get_py_fun_;
};

TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensormove_elimination) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "before");
ASSERT_TRUE(g_before != nullptr);

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_no_attr) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_no_attr) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "before");
ASSERT_TRUE(g_before != nullptr);

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_memcpy_multi_users) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_tensor_move_multi_users) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "before");
ASSERT_TRUE(g_before != nullptr);

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_next_multi_inputs) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_next_multi_inputs) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "before");
ASSERT_TRUE(g_before != nullptr);

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}


tests/ut/cpp/pre_activate/ascend/enhancer/insert_memcpy_async_for_getnext.cc → tests/ut/cpp/pre_activate/ascend/enhancer/insert_tensor_move_for_getnext_test.cc View File

@@ -24,23 +24,23 @@
#include "utils/utils.h"
#include "backend/kernel_compiler/kernel_build_info.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"

namespace mindspore {
namespace opt {
using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder;

class TestHWInsertMemcpyAsyncForGetNext : public BackendCommon {
class TestHWInsertTensorMoveForGetNext : public BackendCommon {
public:
TestHWInsertMemcpyAsyncForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_getnext", true) {}
~TestHWInsertMemcpyAsyncForGetNext() override = default;
TestHWInsertTensorMoveForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_getnext", true) {}
~TestHWInsertTensorMoveForGetNext() override = default;

public:
UT::PyFuncGraphFetcher get_py_fun_;
};

TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_multi_output) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_before");
TEST_F(TestHWInsertTensorMoveForGetNext, test_insert_tensor_move_for_getnext_multi_output) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_before");

AbstractBasePtrList args_spec_list{};
auto kernel_graph = GetKernelGraph(g_before, args_spec_list);
@@ -57,11 +57,11 @@ TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_m

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::InsertMemcpyAsyncForGetNext>());
pm->AddPass(std::make_shared<opt::InsertTensorMoveForGetNext>());
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kernel_graph);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
} // namespace opt

tests/ut/cpp/pre_activate/ascend/enhancer/insert_memcpy_async_for_hccl_op_test.cc → tests/ut/cpp/pre_activate/ascend/enhancer/insert_tensor_move_for_hccl_op_test.cc View File

@@ -25,24 +25,24 @@
#include "ir/param_info.h"
#define private public
#define protected public
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#undef private
#undef protected
namespace mindspore {
namespace opt {
class TestHWInsertMemcpyForHccl : public BackendCommon {
class TestHWInsertTensorMoveForHccl : public BackendCommon {
public:
TestHWInsertMemcpyForHccl() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_hccl_op", true) {}
~TestHWInsertMemcpyForHccl() override = default;
TestHWInsertTensorMoveForHccl() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_hccl_op", true) {}
~TestHWInsertTensorMoveForHccl() override = default;

public:
UT::PyFuncGraphFetcher get_py_fun_;
};

class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
class MockInsertTensorMoveForHcclKernelQuery : public KernelQuery {
public:
MockInsertMemcpyForHcclKernelQuery() = default;
~MockInsertMemcpyForHcclKernelQuery() override = default;
MockInsertTensorMoveForHcclKernelQuery() = default;
~MockInsertTensorMoveForHcclKernelQuery() override = default;
bool IsTbeRef(const AnfNodePtr &node) override {
MS_EXCEPTION_IF_NULL(node);
if (!node->isa<CNode>()) {
@@ -53,9 +53,9 @@ class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
}
};

TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond1_no_insert) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond1", "before2");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond1", "before2");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -66,7 +66,7 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
@@ -74,9 +74,9 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
EXPECT_TRUE(CheckEqualGraph(origin_graph, new_graph));
}

TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond2) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -90,19 +90,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond3) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{3, 2};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -112,19 +112,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond4) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -139,19 +139,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond5) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -166,14 +166,14 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {

auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
kg->SetExecOrderByDefault();

FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
} // namespace opt

tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_memcpy_elimination_test.py → tests/ut/cpp/python_input/gtest_input/pre_activate/getnext_tensor_move_elimination_test.py View File

@@ -18,9 +18,9 @@ from mindspore.ops import Primitive
from mindspore.ops import operations as P

get_next = P.GetNext([ms.float32], [[1, 64, 112, 112]], 1, "")
memcpy_async_attr = Primitive('memcpy_async')
memcpy_async_attr.add_prim_attr("label_for_insert_stream_active", True)
memcpy_async = Primitive('memcpy_async')
tensor_move_attr = Primitive('TensorMove')
tensor_move_attr.add_prim_attr("label_for_insert_stream_active", True)
tensor_move = Primitive('tensor_move')
cast = P.Cast()
add = P.Add()

@@ -36,13 +36,13 @@ class FnDict:
return self.fnDict[name]


def test_getnext_memcpy_elimination(tag):
def test_getnext_tensor_move_elimination(tag):
fns = FnDict()

@fns
def before():
res = get_next()
res = memcpy_async_attr(res)
res = tensor_move_attr(res)
res = cast(res)
res = add(res)
return res
@@ -57,63 +57,63 @@ def test_getnext_memcpy_elimination(tag):
return fns[tag]


def test_getnext_memcpy_elimination_no_attr(tag):
def test_getnext_tensor_move_elimination_no_attr(tag):
fns = FnDict()

@fns
def before():
res = get_next()
res = memcpy_async(res)
res = tensor_move(res)
res = cast(res)
return res

@fns
def after():
res = get_next()
res = memcpy_async(res)
res = tensor_move(res)
res = cast(res)
return res

return fns[tag]


def test_getnext_memcpy_elimination_memcpy_multi_users(tag):
def test_getnext_tensor_move_elimination_tensor_move_multi_users(tag):
fns = FnDict()

@fns
def before():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = cast(memcpy_out)
res = add(memcpy_out, res)
tensor_move_out = tensor_move_attr(res)
res = cast(tensor_move_out)
res = add(tensor_move_out, res)
return res

@fns
def after():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = cast(memcpy_out)
res = add(memcpy_out, res)
tensor_move_out = tensor_move_attr(res)
res = cast(tensor_move_out)
res = add(tensor_move_out, res)
return res

return fns[tag]


def test_getnext_memcpy_elimination_next_multi_inputs(tag):
def test_getnext_tensor_move_elimination_next_multi_inputs(tag):
fns = FnDict()

@fns
def before():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = add(memcpy_out, res)
tensormove_out = tensor_move_attr(res)
res = add(tensormove_out, res)
return res

@fns
def after():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = add(memcpy_out, res)
tensormove_out = tensor_move_attr(res)
res = add(tensormove_out, res)
return res

return fns[tag]

tests/ut/cpp/python_input/gtest_input/pre_activate/insert_memcpy_async_for_getnext.py → tests/ut/cpp/python_input/gtest_input/pre_activate/insert_tensor_move_for_getnext.py View File

@@ -19,7 +19,7 @@ from mindspore.ops import _constants as Constants
from mindspore.ops import operations as P

get_next = P.GetNext([ms.float32, ms.int32], [[32, 64], [32]], 2, "")
memcpy_async = Primitive('memcpy_async')
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)

@@ -35,7 +35,7 @@ class FnDict:
return self.fnDict[name]


def test_insert_memcpy_async_for_getnext(tag):
def test_insert_tensor_move_for_getnext(tag):
fns = FnDict()

@fns
@@ -48,9 +48,9 @@ def test_insert_memcpy_async_for_getnext(tag):
res = get_next()
data = tuple_getitem(res, 0)
label = tuple_getitem(res, 1)
memcpy_async_data = memcpy_async(data)
memcpy_async_label = memcpy_async(label)
bind_tuple = make_tuple(memcpy_async_data, memcpy_async_label)
tensor_move_data = tensor_move(data)
tensor_move_label = tensor_move(label)
bind_tuple = make_tuple(tensor_move_data, tensor_move_label)
get_item0 = tuple_getitem(bind_tuple, 0)
get_item1 = tuple_getitem(bind_tuple, 1)
bind_tuple = make_tuple(make_tuple(get_item0, get_item1))

tests/ut/cpp/python_input/gtest_input/pre_activate/insert_memcpy_async_for_hccl_op.py → tests/ut/cpp/python_input/gtest_input/pre_activate/insert_tensor_move_for_hccl_op.py View File

@@ -20,7 +20,7 @@ from mindspore.ops import _constants as Constants
depend = P.Depend()
all_reduce = P.AllReduce()
broadcast = P.Broadcast(1)
memcpy_async = Primitive('memcpy_async')
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)
assign_add = P.AssignAdd()
@@ -39,7 +39,7 @@ class FnDict:
return self.fnDict[name]


def test_insert_memcpy_async_for_hccl_op_cond1(tag):
def test_insert_tensor_move_for_hccl_op_cond1(tag):
fns = FnDict()

@fns
@@ -57,14 +57,14 @@ def test_insert_memcpy_async_for_hccl_op_cond1(tag):
@fns
def after(x):
res1 = relu(x)
res2 = memcpy_async(res1)
res2 = tensor_move(res1)
res2 = all_reduce(res2)
return make_tuple(make_tuple(res1, res2))

return fns[tag]


def test_insert_memcpy_async_for_hccl_op_cond2(tag):
def test_insert_tensor_move_for_hccl_op_cond2(tag):
fns = FnDict()

@fns
@@ -74,14 +74,14 @@ def test_insert_memcpy_async_for_hccl_op_cond2(tag):

@fns
def after(x):
res = memcpy_async(x)
res = tensor_move(x)
res = all_reduce(res)
return make_tuple(res)

return fns[tag]


def test_insert_memcpy_async_for_hccl_op_cond3(tag):
def test_insert_tensor_move_for_hccl_op_cond3(tag):
fns = FnDict()

@fns
@@ -93,14 +93,14 @@ def test_insert_memcpy_async_for_hccl_op_cond3(tag):
@fns
def after(a, b):
res = assign_add(a, b)
res = memcpy_async(res)
res = tensor_move(res)
res = all_reduce(res)
return make_tuple(res)

return fns[tag]


def test_insert_memcpy_async_for_hccl_op_cond4(tag):
def test_insert_tensor_move_for_hccl_op_cond4(tag):
fns = FnDict()

@fns
@@ -113,7 +113,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
@fns
def after(a, b):
x = relu(a)
y1 = memcpy_async(b)
y1 = tensor_move(b)
y2 = all_reduce(y1)
res = depend(x, y2)
return make_tuple(res)
@@ -121,7 +121,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
return fns[tag]


def test_insert_memcpy_async_for_hccl_op_cond5(tag):
def test_insert_tensor_move_for_hccl_op_cond5(tag):
fns = FnDict()

@fns
@@ -134,8 +134,8 @@ def test_insert_memcpy_async_for_hccl_op_cond5(tag):
@fns
def after(a, b, c):
x = relu(a)
m1 = memcpy_async(b)
m2 = memcpy_async(c)
m1 = tensor_move(b)
m2 = tensor_move(c)
y = broadcast(m1, m2)
y0 = tuple_getitem(y, 0)
y1 = tuple_getitem(y, 1)

Loading…
Cancel
Save