Browse Source

[GraphKernel] fix code check.

r1.7
chenlei_autodiff 4 years ago
parent
commit
e5af40d31c
21 changed files with 143 additions and 139 deletions
  1. +1
    -0
      mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.h
  2. +15
    -7
      mindspore/ccsrc/common/graph_kernel/adapter/fake_abstract_shape.cc
  3. +7
    -1
      mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_expander_with_py.h
  4. +47
    -45
      mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_optimization.cc
  5. +1
    -1
      mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.cc
  6. +1
    -1
      mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.h
  7. +9
    -8
      mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_splitter_with_py.cc
  8. +4
    -10
      mindspore/ccsrc/common/graph_kernel/add_atomic_clean.cc
  9. +2
    -2
      mindspore/ccsrc/common/graph_kernel/add_atomic_clean.h
  10. +6
    -13
      mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.cc
  11. +7
    -5
      mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.h
  12. +2
    -2
      mindspore/ccsrc/common/graph_kernel/cast_matmul_fusion.cc
  13. +1
    -1
      mindspore/ccsrc/common/graph_kernel/core/graph_kernel_callback.h
  14. +2
    -1
      mindspore/ccsrc/common/graph_kernel/decrease_compute_precision.cc
  15. +8
    -14
      mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.cc
  16. +3
    -2
      mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.h
  17. +2
    -2
      mindspore/ccsrc/common/graph_kernel/insert_pad.cc
  18. +11
    -11
      mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_optimization.cc
  19. +8
    -8
      mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.cc
  20. +4
    -4
      mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.h
  21. +2
    -1
      mindspore/ccsrc/common/graph_kernel/optimize_assign.cc

+ 1
- 0
mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.h View File

@@ -23,6 +23,7 @@
namespace mindspore::graphkernel {
class CallbackImpl : public Callback {
public:
virtual ~CallbackImpl() = default;
ShapeVector GetInputShape(const AnfNodePtr &node, size_t i) override;
ShapeVector GetOutputShape(const AnfNodePtr &node, size_t i) override;
ShapeVector GetInputInferShape(const AnfNodePtr &node, size_t i) override;


+ 15
- 7
mindspore/ccsrc/common/graph_kernel/adapter/fake_abstract_shape.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@

namespace mindspore::graphkernel {
namespace {
class AbstractShapeCreator {
class FakeAbstractShape {
public:
using AbstractShapeTransferFunc = std::function<ShapeVector(const ShapeVector &)>;
/**
@@ -49,10 +49,14 @@ class AbstractShapeCreator {
static ShapeVector NchwAbstractShape(const ShapeVector &device_shape) { return device_shape; }
static ShapeVector NhwcAbstractShape(const ShapeVector &device_shape) {
const size_t nhwc_size = 4;
const size_t index_n = 0;
const size_t index_h = 1;
const size_t index_w = 2;
const size_t index_c = 3;
if (device_shape.size() != nhwc_size) {
MS_LOG(EXCEPTION) << "Shape size of NHWC should be 4, but got " << device_shape.size();
}
return {device_shape[0], device_shape[3], device_shape[1], device_shape[2]};
return {device_shape[index_n], device_shape[index_c], device_shape[index_h], device_shape[index_w]};
}
static ShapeVector FractalNzAbstractShape(const ShapeVector &device_shape) {
if (device_shape.size() == 1 && (device_shape[0] == 1 || static_cast<size_t>(device_shape[0]) % kCubeSize == 0)) {
@@ -64,12 +68,16 @@ class AbstractShapeCreator {
}
ShapeVector shape;
size_t dims = device_shape.size();
size_t batch = dims - 4;
size_t batch = dims - nz_size;
for (size_t i = 0; i < batch; ++i) {
shape.push_back(device_shape[i]);
}
int64_t m = device_shape[dims - 3] * device_shape[dims - 2];
int64_t n = device_shape[dims - 4] * device_shape[dims - 1];
const size_t index_m1 = 3;
const size_t index_m2 = 2;
const size_t index_n1 = 4;
const size_t index_n2 = 1;
int64_t m = device_shape[dims - index_m1] * device_shape[dims - index_m2];
int64_t n = device_shape[dims - index_n1] * device_shape[dims - index_n2];
shape.push_back(m);
shape.push_back(n);

@@ -79,6 +87,6 @@ class AbstractShapeCreator {
} // namespace

ShapeVector GetFakeAbstractShape(const ShapeVector &device_shape, const std::string &format) {
return AbstractShapeCreator::GetFakeAbstractShape(device_shape, format);
return FakeAbstractShape::GetFakeAbstractShape(device_shape, format);
}
} // namespace mindspore::graphkernel

+ 7
- 1
mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_expander_with_py.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -24,12 +24,18 @@

namespace mindspore::graphkernel {
// Expander built on top of DefaultExpander.
// NOTE(review): the class name and header name suggest the expansion is delegated to Python code -- confirm
// against the implementation file.
class PyExpander : public DefaultExpander {
public:
virtual ~PyExpander() = default;

protected:
// Virtual hook that derived expanders can replace; ComplexOpExpander in this header redeclares it.
// NOTE(review): presumably fills *kernel_json with the description of `node` and returns whether that
// succeeded -- confirm against the definition.
virtual bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json);
// Overrides the base-class virtual function of the same name.
FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node) override;
};

class ComplexOpExpander : public PyExpander {
public:
virtual ~ComplexOpExpander() = default;

protected:
bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json);
};


+ 47
- 45
mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_optimization.cc View File

@@ -67,21 +67,21 @@ inline unsigned int GetPassLevelByFlag(bool flag) { return flag ? OptLevel_1 : O
PassManagerPtr GraphKernelOptimizer::PreProcess() const {
auto pm = std::make_shared<GraphKernelPassManager>(0, "preprocess");
// Do DependElimination before all passes of graphkernel
pm->AddPass(std::make_shared<DependElimination>(), OptLevel_1);
pm->Add(std::make_shared<DependElimination>(), OptLevel_1);

// Do cse before all passes of graphkernel
pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);
pm->Add(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);

// Save the original output info
pm->AddPass(std::make_shared<SaveOutputShape>(), OptLevel_1);
pm->Add(std::make_shared<SaveOutputShape>(), OptLevel_1);

// Change Assign(p, a, U) to Assign(Depend(p, U), a)
pm->AddPass(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu);
pm->Add(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu);

// Spread the MakeTuple input of UpdateState
pm->AddPass(std::make_shared<SpreadUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<SpreadUpdateState>(), OptLevel_1);
// Eliminate the common nodes that are generated in SpreadUpdateState
pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1);
pm->Add(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1);
return pm;
}

@@ -89,16 +89,16 @@ PassManagerPtr GraphKernelOptimizer::Cluster() const {
auto pm = std::make_shared<GraphKernelPassManager>(1, "cluster");

// Expand complex op to composite kernels
pm->AddPass(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu);
pm->Add(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu);

// Expand complex basic kernels to composite kernels
pm->AddPass(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1);

// Cluster basic kernels and composite kernels
pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1);

// Eliminate the outputs without external user
pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
return pm;
}

@@ -106,29 +106,29 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
auto pm = std::make_shared<GraphKernelPassManager>(2, "highlevelopt1");

// Remove redundant Cast(bias, fp16) for Matmul input
pm->AddPass(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);
pm->Add(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);

// Reorder Cast and Type-insensitive node
pm->AddPass(std::make_shared<ReorderOps>(), OptLevel_2);
pm->Add(std::make_shared<ReorderOps>(), OptLevel_2);

// normalize the Reduce axis
pm->AddPass(std::make_shared<AxisNormalizer>(), OptLevel_1);
pm->Add(std::make_shared<AxisNormalizer>(), OptLevel_1);

// Cast the input of ReduceSum from float16 to float32 for higher precision
pm->AddPass(std::make_shared<RaiseReductionPrecision>(), OptLevel_2);
pm->Add(std::make_shared<RaiseReductionPrecision>(), OptLevel_2);

// Insert PadAkg and UnPadAkg Ops for MatMul
pm->AddPass(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);
pm->Add(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);

// Universal arithmetic simplify
pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);
pm->Add(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);

// Common subexpression elimination
pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_2);
pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_2);

// Eliminate unnecessary transform ops
auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_trans_op_optimize);
pm->AddPass(std::make_shared<TransformOpOptimizer>(), level, is_gpu);
pm->Add(std::make_shared<TransformOpOptimizer>(), level, is_gpu);
return pm;
}

@@ -137,21 +137,21 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
// Make certain nodes redundant so that they are used by only one user,
// which can avoid unnecessary input-output and get better performance.
// preprocess for ShapeOpsSplitter
pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);

// Split kernel according to costmodel
pm->AddPass(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1);

// After Simplify and Splitter, a lot of redundant getitem/maketuple
// will be exposed, use GetitemTuple Pass to delete them.
pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

// Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_1);
pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);
pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
return pm;
}

@@ -161,30 +161,30 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
auto &flags = GraphKernelFlags::GetInstance();
// Auto recompute according to local memory burst.
auto recompute_lv = GetPassLevelByFlag(flags.recompute_increment_threshold > 0 || flags.recompute_peak_threshold > 0);
pm->AddPass(std::make_shared<GraphKernelRecompute>(), recompute_lv);
pm->Add(std::make_shared<GraphKernelRecompute>(), recompute_lv);

// Replace Assign with InplaceAssign, and replace original output with overridden parameters
pm->AddPass(std::make_shared<OptimizeAssign>(), OptLevel_2);
pm->Add(std::make_shared<OptimizeAssign>(), OptLevel_2);

pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
pm->AddPass(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));
pm->Add(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
pm->Add(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
pm->Add(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));

// Enable atomic add
pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);
pm->Add(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);

// Enable atomic add for stitch nodes.
auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_stitch_fusion);
pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu);
pm->Add(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu);

// Enable low precision
auto level_low_precision = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_low_precision);
pm->AddPass(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
pm->AddPass(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);
pm->Add(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
pm->Add(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);

// Enable tsa and uss
pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);
pm->Add(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
pm->Add(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);

return pm;
}
@@ -197,9 +197,8 @@ PassManagerPtr GraphKernelOptimizer::Combine() const {
auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_fusion);
// Atomic-add GraphKernel node may be linked directly to UpdateState, it should be spread before parallel fusion!
pm->AddPass(std::make_shared<SpreadUpdateState>(), level);
pm->AddPass(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level,
is_gpu || is_ascend);
pm->Add(std::make_shared<SpreadUpdateState>(), level);
pm->Add(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, is_gpu || is_ascend);

return pm;
}
@@ -207,17 +206,17 @@ PassManagerPtr GraphKernelOptimizer::Combine() const {
PassManagerPtr GraphKernelOptimizer::PostProcess() const {
auto pm = std::make_shared<GraphKernelPassManager>(6, "postprocess");
// Make Tuple for the inputs of UpdateState. (the reverse of SpreadUpdateState)
pm->AddPass(std::make_shared<ShrinkUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<ShrinkUpdateState>(), OptLevel_1);

// Recover the original output info
pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
pm->AddPass(std::make_shared<RewriteOutputShape>(), OptLevel_1);
pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);
pm->Add(std::make_shared<RewriteOutputShape>(), OptLevel_1);

// Reduce fake output memory.
pm->AddPass(std::make_shared<ReduceFakeOutMem>(), OptLevel_1);
pm->Add(std::make_shared<ReduceFakeOutMem>(), OptLevel_1);

// Add the new tensors to the kernel_graph
pm->AddPass(std::make_shared<BindValueToGraph>(), OptLevel_1);
pm->Add(std::make_shared<BindValueToGraph>(), OptLevel_1);
return pm;
}

@@ -242,5 +241,8 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
(void)optimizer->Optimize(kernel_graph);
}

void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); }
void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) {
GraphKernelOptimizer graph_kernel_optimizer;
graph_kernel_optimizer.Run(kernel_graph);
}
} // namespace mindspore::graphkernel

+ 1
- 1
mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.cc View File

@@ -24,7 +24,7 @@
#include "debug/anf_ir_dump.h"
namespace mindspore::graphkernel {
void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
MS_EXCEPTION_IF_NULL(pass);
auto pass_id = passes_.size();
auto pass_name = pass->name();


+ 1
- 1
mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.h View File

@@ -33,7 +33,7 @@ class GraphKernelPassManager : public PassManager {
~GraphKernelPassManager() = default;
// Add a graph pass; the pass object will be freed when the pass manager is freed.
virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
// Run passes on the func_graph
bool Run(const FuncGraphPtr &func_graph) const override;


+ 9
- 8
mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_splitter_with_py.cc View File

@@ -26,6 +26,7 @@
#include "include/common/utils/python_adapter.h"
#include "kernel/akg/akg_kernel_json_generator.h"
#include "kernel/common_utils.h"
#include "common/graph_kernel/core/graph_kernel_utils.h"
#include "common/graph_kernel/graph_kernel_helper.h"
#include "include/common/utils/context/graph_kernel_flags.h"

@@ -107,13 +108,13 @@ class CostModelSplitSchemer : public SplitSchemer {
MS_LOG(ERROR) << "Failed decode sub graph, " << graph_desc;
return false;
}
split_plan_.emplace_back(std::move(res_graph));
(void)split_plan_.emplace_back(std::move(res_graph));
}

// ops to be inlined.
need_inline_.clear();
std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_),
[](const std::string &mode) { return mode == "basic" ? 1 : 0; });
(void)std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_),
[](const std::string &mode) { return mode == "basic" ? 1 : 0; });
return true;
}

@@ -151,8 +152,8 @@ class CostModelSplitSchemer : public SplitSchemer {
virtual void GetValidKernelNodes() {
topo_all_nodes_ = TopoSort(func_graph_->get_return());
topo_valid_nodes_.clear();
std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_),
[this](const AnfNodePtr &node) { return IsValidKernelNode(node); });
(void)std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_),
[this](const AnfNodePtr &node) { return IsValidKernelNode(node); });
}

void MapNodeGroup() {
@@ -175,14 +176,14 @@ class CostModelSplitSchemer : public SplitSchemer {
if (IsValidKernelNode(output)) {
auto group_id = node_group_[output];
node_group_[ret_node] = group_id;
split_plan_[group_id].emplace_back(ret_node);
(void)split_plan_[group_id].emplace_back(ret_node);
return;
}
// assign the make_tuple node to a new group.
if (common::AnfAlgo::CheckPrimitiveType(output, prim::kPrimMakeTuple)) {
auto group_id = split_plan_.size();
split_plan_.emplace_back(AnfNodePtrList{output, ret_node});
need_inline_.emplace_back(1);
(void)split_plan_.emplace_back(AnfNodePtrList{output, ret_node});
(void)need_inline_.emplace_back(1);
node_group_[output] = group_id;
node_group_[ret_node] = group_id;
return;


+ 4
- 10
mindspore/ccsrc/common/graph_kernel/add_atomic_clean.cc View File

@@ -52,8 +52,9 @@ std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = fal
}
auto axis_vec = GetReduceAxis(node);
if (axis_vec.empty()) {
axis_vec.resize(src_shape_vec.size());
for (size_t i = 0; i < src_shape_vec.size(); ++i) {
(void)axis_vec.emplace_back(i);
axis_vec[i] = i;
}
} else {
(void)std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), [&src_shape_vec](int64_t axis) -> int64_t {
@@ -281,15 +282,8 @@ void AtomicCleanInsertter::CorrectKernelBuildInfo(
new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second));
}

kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder;
new_info_builder.SetInputsFormat(new_inputs_format);
new_info_builder.SetInputsDeviceType(new_inputs_type);
new_info_builder.SetOutputsFormat(new_outputs_format);
new_info_builder.SetOutputsDeviceType(new_outputs_type);
new_info_builder.SetProcessor(origin_processor);
new_info_builder.SetKernelType(KernelType::AKG_KERNEL);
new_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
auto new_selected_info = new_info_builder.Build();
auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format,
new_outputs_type, origin_processor);
AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get());
}



+ 2
- 2
mindspore/ccsrc/common/graph_kernel/add_atomic_clean.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ class AtomicAddChecker {
std::vector<AtomicAddInfo> GetAtomicAddInfo() { return atomic_add_infos_; }

protected:
virtual bool SuitableForAtomicAdd(const AnfNodePtr &node) { return false; }
virtual bool SuitableForAtomicAdd(const AnfNodePtr &) { return false; }
virtual bool FindCandidate(const AnfNodePtr &anf_node);
virtual bool CanActivateAtomicAdd(const AnfNodePtr &anf_node);
std::vector<AtomicAddInfo> atomic_add_infos_;


+ 6
- 13
mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.cc View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -52,15 +52,8 @@ void StitchAtomicCleanInsertter::CorrectKernelBuildInfo(
new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second));
new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second));

kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder;
new_info_builder.SetInputsFormat(new_inputs_format);
new_info_builder.SetInputsDeviceType(new_inputs_type);
new_info_builder.SetOutputsFormat(new_outputs_format);
new_info_builder.SetOutputsDeviceType(new_outputs_type);
new_info_builder.SetProcessor(origin_processor);
new_info_builder.SetKernelType(KernelType::AKG_KERNEL);
new_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
auto new_selected_info = new_info_builder.Build();
auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format,
new_outputs_type, origin_processor);
AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get());
}

@@ -124,7 +117,7 @@ void StitchAtomicCleanInsertter::ProcessOriginCNode(
for (const auto &[user_node, index] : reduce_user_nodes) {
auto user_cnode = user_node->cast<CNodePtr>();
MS_EXCEPTION_IF_NULL(user_cnode);
user_cnode->set_input(static_cast<size_t>(index), parameter);
user_cnode->set_input(IntToSize(index), parameter);
if (!connected) {
std::vector<std::pair<AnfNodePtr, int>> user_user = FindInnerCNodeUsers(stitch_node_, user_cnode);
if (!user_user.empty()) {
@@ -154,8 +147,8 @@ std::vector<std::pair<AnfNodePtr, int>> StitchAtomicCleanInsertter::FindInnerCNo
}
std::vector<std::pair<AnfNodePtr, int>> inner_user_nodes;
auto users = mng_sub->node_users()[target];
std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes),
[](const std::pair<AnfNodePtr, int> &pair) { return pair; });
(void)std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes),
[](const std::pair<AnfNodePtr, int> &pair) { return pair; });
return inner_user_nodes;
}



+ 7
- 5
mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.h View File

@@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -32,16 +32,18 @@ class StitchAtomicCleanInsertter : public AtomicCleanInsertter {
~StitchAtomicCleanInsertter() override = default;
bool Run(const FuncGraphPtr &func_graph) override;

private:
protected:
void CorrectKernelBuildInfo(const AnfNodePtr &composite_node,
const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &clean_infos) override;
void ProcessOriginCNode(
const AnfNodePtr &composite_node,
const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override;

private:
CNodePtr CreateInplaceAssignNode(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter,
const AtomicAddInfo &info) const;
std::vector<std::pair<AnfNodePtr, int>> FindInnerCNodeUsers(const AnfNodePtr &inner_node,
const CNodePtr &target) const;
void ProcessOriginCNode(
const AnfNodePtr &composite_node,
const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override;
std::pair<bool, AtomicAddInfo> IsStitchWithAtomic(const AnfNodePtr &anf_node);

void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node,


+ 2
- 2
mindspore/ccsrc/common/graph_kernel/cast_matmul_fusion.cc View File

@@ -33,8 +33,8 @@ void UpdateBuildInfo(const AnfNodePtr &matmul_node, const AnfNodePtr &cast_node)
input_types.push_back(cast_types.front());
std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node);
std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node);
auto graph_sel_info =
BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node);
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
AnfAlgo::GetProcessor(matmul_node));
AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get());
}


+ 1
- 1
mindspore/ccsrc/common/graph_kernel/core/graph_kernel_callback.h View File

@@ -169,6 +169,6 @@ class CallbackImplRegister {
};

#define GRAPH_KERNEL_CALLBACK_REGISTER(cls) \
static const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); })
const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); })
} // namespace mindspore::graphkernel
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CORE_GRAPH_KERNEL_CALLBACK_H_

+ 2
- 1
mindspore/ccsrc/common/graph_kernel/decrease_compute_precision.cc View File

@@ -74,7 +74,8 @@ void UpdateOutputInfo(const AnfNodePtr &cnode) {
}
std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(cnode);
std::vector<TypeId> output_types = {TypeId::kNumberTypeFloat16};
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode);
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
AnfAlgo::GetProcessor(cnode));
AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, cnode.get());
}
}


+ 8
- 14
mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.cc View File

@@ -91,33 +91,27 @@ AbstractBasePtr GetOutputAbstract(const AnfNodePtr &node, size_t output_idx) {
return out_spec;
}

// Rebuild as node inputs or outputs have changed, processor comes from node itself
// Build for new node, processor comes from context
kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
const std::vector<TypeId> &inputs_type,
const std::vector<std::string> &output_formats,
const std::vector<TypeId> &output_types, const AnfNodePtr &node) {
kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder;
graph_info_builder.SetInputsFormat(inputs_format);
graph_info_builder.SetInputsDeviceType(inputs_type);
graph_info_builder.SetOutputsFormat(output_formats);
graph_info_builder.SetOutputsDeviceType(output_types);
graph_info_builder.SetProcessor(AnfAlgo::GetProcessor(node));
graph_info_builder.SetKernelType(KernelType::AKG_KERNEL);
graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
return graph_info_builder.Build();
const std::vector<TypeId> &output_types) {
return BuildSelectKernelBuildInfo(inputs_format, inputs_type, output_formats, output_types,
kernel::GetProcessorFromContext());
}

// Build for new node, processor comes from context
// Build for new node with given processor
kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
const std::vector<TypeId> &inputs_type,
const std::vector<std::string> &output_formats,
const std::vector<TypeId> &output_types) {
const std::vector<TypeId> &output_types,
const kernel::Processor &processor) {
kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder;
graph_info_builder.SetInputsFormat(inputs_format);
graph_info_builder.SetInputsDeviceType(inputs_type);
graph_info_builder.SetOutputsFormat(output_formats);
graph_info_builder.SetOutputsDeviceType(output_types);
graph_info_builder.SetProcessor(kernel::GetProcessorFromContext());
graph_info_builder.SetProcessor(processor);
graph_info_builder.SetKernelType(KernelType::AKG_KERNEL);
graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
return graph_info_builder.Build();


+ 3
- 2
mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.h View File

@@ -57,11 +57,12 @@ void SetNewKernelInfo(const AnfNodePtr &new_node, const FuncGraphPtr &fg, const
kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
const std::vector<TypeId> &inputs_type,
const std::vector<std::string> &output_formats,
const std::vector<TypeId> &output_types, const AnfNodePtr &node);
const std::vector<TypeId> &output_types);
kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
const std::vector<TypeId> &inputs_type,
const std::vector<std::string> &output_formats,
const std::vector<TypeId> &output_types);
const std::vector<TypeId> &output_types,
const kernel::Processor &processor);
bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc);
bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc,
std::map<std::string, AnfNodePtr> *address_node_map);


+ 2
- 2
mindspore/ccsrc/common/graph_kernel/insert_pad.cc View File

@@ -229,8 +229,8 @@ void UpdateMatmulInfo(const AnfNodePtr &matmul_node, const vec &unpad_shape, con
std::vector<TypeId> input_types = AnfAlgo::GetAllInputDeviceTypes(matmul_node);
std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node);
std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node);
auto graph_sel_info =
BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node);
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
AnfAlgo::GetProcessor(matmul_node));
AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get());
}


+ 11
- 11
mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_optimization.cc View File

@@ -39,14 +39,14 @@ using opt::GraphOptimizer;
// Builds and returns the pass manager for stage 0, named "cluster".
// Passes are registered in this order, all at OptLevel_1: GraphKernelExpanderLite,
// GraphKernelCluster, ConvertConstInputToAttr, EliminateRedundantOutput.
// NOTE(review): this span is a rendered diff hunk. Each group of `pm->AddPass(...)` lines is
// the pre-change form of the `pm->Add(...)` lines that immediately follow it, so every pass
// appears twice here.
PassManagerPtr GraphKernelOptimizer::Cluster() const {
auto pm = std::make_shared<GraphKernelPassManager>(0, "cluster");
// Expand complex basic kernels to composite kernels
pm->AddPass(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1);

// Cluster basic kernels and composite kernels
pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1);
pm->AddPass(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1);
pm->Add(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1);

// Eliminate the outputs without external user
pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
return pm;
}

@@ -55,27 +55,27 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
// Make certain nodes redundant so that they are used by only one user,
// which can avoid unnecessary input-output and get better performance.
// preprocess for ShapeOpsSplitter
pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);

// Split kernel according to costmodel
pm->AddPass(std::make_shared<GraphKernelSplitter>(), OptLevel_1);
pm->Add(std::make_shared<GraphKernelSplitter>(), OptLevel_1);

// After Simplify and Splitter, a lot of redundant getitem/maketuple
// will be exposed, use GetitemTuple Pass to delete them.
pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

// Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
return pm;
}

// Builds and returns the pass manager for stage 2, named "postprocess".
// It registers a single pass, KernelBuilder, at OptLevel_1.
// NOTE(review): this span is a rendered diff hunk; the `pm->AddPass(...)` line is the
// pre-change form of the `pm->Add(...)` line that follows it.
PassManagerPtr GraphKernelOptimizer::PostProcess() const {
auto pm = std::make_shared<GraphKernelPassManager>(2, "postprocess");
// build akg and replace graph kernel nodes
pm->AddPass(std::make_shared<KernelBuilder>(), OptLevel_1);
pm->Add(std::make_shared<KernelBuilder>(), OptLevel_1);
return pm;
}



+ 8
- 8
mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.cc View File

@@ -23,26 +23,26 @@
#include "debug/anf_ir_dump.h"
namespace mindspore::graphkernel {
// Registers a pass with this manager and records whether it is enabled.
//   pass             - the pass to register; checked with MS_EXCEPTION_IF_NULL.
//   pass_level       - the pass is a candidate for enabling when flags_.opt_level >= pass_level.
//   supported_device - must also be true for the pass to be a candidate for enabling.
// The pass is always appended to passes_; the computed flag is appended to enabled_.
// NOTE(review): this span is a rendered diff hunk. It shows both the old signature (AddPass)
// and the new one (Add), and several statements appear twice because the change moved them
// (`auto pass_id = ...`, `passes_.push_back(pass)`) or reworded their comments.
// NOTE(review): the header in this diff names the third parameter `default_enable`, while
// this definition names it `supported_device` -- confirm which name is intended.
void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
MS_EXCEPTION_IF_NULL(pass);
// The id of the new pass is its index in passes_, i.e. the current size before insertion.
auto pass_id = passes_.size();
auto pass_name = pass->name();
auto pass_id = passes_.size();
// Returns true when pass_list contains this pass, identified either by numeric ids or by names.
auto pass_in_list = [this, pass_id, &pass_name](const std::vector<std::string> &pass_list) {
// the config format can be "stage_id.pass_id" or "stage_name.pass_name"
// config format can be "stage_id.pass_id" or "stage_name.pass_name"
return std::find(pass_list.begin(), pass_list.end(),
std::to_string(this->stage_) + "." + std::to_string(pass_id)) != pass_list.end() ||
std::find(pass_list.begin(), pass_list.end(), this->name_ + "." + pass_name) != pass_list.end();
};
// Default decision comes from the device support flag and the optimization level.
bool enable = supported_device && flags_.opt_level >= pass_level;
if (enable) {
// if it meets the condition to enable, check whether it's in the disabled list.
// if it meets the condition to enable, check whether it's in the disabled pass list.
enable = !pass_in_list(flags_.disable_pass);
} else {
// if it doesn't meet the condition to enable, check whether it's in the enabled list.
// if it doesn't meet the condition to enable, check whether it's in the enabled pass list.
enable = pass_in_list(flags_.enable_pass);
}
// passes_ and enabled_ are appended together so the same index refers to the same pass in both.
passes_.push_back(pass);
enabled_.push_back(enable);
passes_.push_back(pass);
}
bool GraphKernelPassManager::RunPass(const FuncGraphPtr &func_graph, size_t pass_id, const opt::PassPtr &pass) const {
@@ -59,10 +59,10 @@ bool GraphKernelPassManager::Run(const FuncGraphPtr &func_graph) const {
for (size_t i = 0; i < passes_.size(); i++) {
if (enabled_[i]) {
changed = RunPass(func_graph, i, passes_[i]) || changed;
// dump ir to a graph_kernel subdir, and set a global id in front of the filename
// dump ir to a graph_kernel subdir, and set a global id in front of the filenames
std::ostringstream oss;
static int g_id = 0;
constexpr int id_length = 4;
static int g_id = 0;
oss << "graph_kernel/" << std::setfill('0') << std::setw(id_length) << g_id++ << "_"
<< GetPassFullname(i, passes_[i]);
DumpPassIR(func_graph, oss.str());


+ 4
- 4
mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.h View File

@@ -21,8 +21,8 @@
#include <string>
#include <memory>
#include "include/common/utils/context/graph_kernel_flags.h"
#include "backend/common/optimizer/pass_manager.h"
#include "include/common/utils/context/graph_kernel_flags.h"
namespace mindspore::graphkernel {
using opt::PassManager;
@@ -32,10 +32,10 @@ class GraphKernelPassManager : public PassManager {
: PassManager(name, true), stage_(stage), flags_(GraphKernelFlags::GetInstance()) {}
~GraphKernelPassManager() = default;
// Add graph pass, the pass object will be freed when pass manager freed.
virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
// Add graph pass for lite, the pass object will be freed when pass manager freed.
void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
// Run passes on the func_graph
// Run passes for lite on the func_graph
bool Run(const FuncGraphPtr &func_graph) const override;
protected:


+ 2
- 1
mindspore/ccsrc/common/graph_kernel/optimize_assign.cc View File

@@ -178,7 +178,8 @@ bool ReplaceAssignByInplaceAssignInGraphkernel(const FuncGraphPtr &func_graph) {
input_types.push_back(input_types.back());
std::vector<std::string> output_formats = {input_formats.back()};
std::vector<TypeId> output_types = {input_types.back()};
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode);
auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
AnfAlgo::GetProcessor(cnode));
AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, new_cnode.get());
mng->Replace(cnode, new_cnode);
}


Loading…
Cancel
Save