Browse Source

[GraphKernel] Add a stable-ops-only flag for graph kernel fusion, and enable it by default on the Ascend backend

tags/v1.5.0-rc1
r1chardf1d0 4 years ago
parent
commit
7ec4649805
7 changed files with 164 additions and 124 deletions
  1. +73
    -69
      mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc
  2. +3
    -2
      mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.h
  3. +49
    -48
      mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc
  4. +16
    -0
      mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc
  5. +4
    -0
      mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h
  6. +5
    -0
      mindspore/ccsrc/utils/context/graph_kernel_flags.cc
  7. +14
    -5
      mindspore/ccsrc/utils/context/graph_kernel_flags.h

+ 73
- 69
mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.cc View File

@@ -20,6 +20,7 @@
#include <unordered_map>
#include <set>
#include <vector>
#include <tuple>
#include <memory>
#include <utility>
#include <fstream>
@@ -36,78 +37,81 @@

namespace mindspore {
namespace opt {
namespace {
std::vector<PrimitivePtr> GetClusterableOpList() {
std::vector<PrimitivePtr> clusterable_ops = {
prim::kPrimAbs,
prim::kPrimAdd,
prim::kPrimCast,
prim::kPrimEqual,
prim::kPrimExp,
prim::kPrimInplaceAssign,
prim::kPrimLog,
prim::kPrimMaximum,
prim::kPrimMinimum,
prim::kPrimMul,
prim::kPrimNeg,
prim::kPrimPow,
prim::kPrimRealDiv,
prim::kPrimReciprocal,
prim::kPrimReduceSum,
prim::kPrimReshape,
prim::kPrimRound,
prim::kPrimRsqrt,
prim::kPrimSqrt,
prim::kPrimSub,
prim::kPrimTanh,
prim::kPrimTranspose,
#if ENABLE_D
prim::kPrimMatMul,
prim::kPrimTransData,
prim::kPrimBatchMatMul,
#elif ENABLE_GPU
prim::kPrimACos,
prim::kPrimAcosh,
prim::kPrimArgMax,
prim::kPrimArgMin,
prim::kPrimAsin,
prim::kPrimAsinh,
prim::kPrimAssign,
prim::kPrimAtan,
prim::kPrimAtan2,
prim::kPrimCos,
prim::kPrimDiv,
prim::kPrimErf,
prim::kPrimExpm1,
prim::kPrimFloor,
prim::kPrimFloorDiv,
prim::kPrimFloorMod,
prim::kPrimGreater,
prim::kPrimGreaterEqual,
prim::kPrimIsFinite,
prim::kPrimIsInf,
prim::kPrimIsNan,
prim::kPrimLess,
prim::kPrimLessEqual,
prim::kPrimLogicalAnd,
prim::kPrimLogicalOr,
prim::kPrimLogicalNot,
prim::kPrimMod,
prim::kPrimNotEqual,
prim::kPrimReduceMax,
prim::kPrimReduceMin,
prim::kPrimSelect,
prim::kPrimSign,
prim::kPrimSin,
prim::kPrimStridedSlice,
prim::kPrimUserDefined,
#endif
using context::OpLevel_0;
using context::OpLevel_1;
// Builds the list of primitives that GraphKernelCluster is allowed to fuse on the
// current device target. Each table entry is (target, fusion level, primitive):
// an entry is kept when its target matches the current device (or is kAllTarget)
// and its level does not exceed the user-configured fusion_ops_level flag
// (see GetValidOps). The result is then filtered by the user's
// enable/disable cluster-op lists.
std::vector<PrimitivePtr> GraphKernelCluster::GetClusterableOpList() {
std::vector<std::tuple<std::string, unsigned int, PrimitivePtr>> clusterable_ops_with_level = {
// all target
{kAllTarget, OpLevel_0, prim::kPrimAbs},
{kAllTarget, OpLevel_0, prim::kPrimAdd},
{kAllTarget, OpLevel_0, prim::kPrimCast},
{kAllTarget, OpLevel_0, prim::kPrimEqual},
{kAllTarget, OpLevel_0, prim::kPrimExp},
{kAllTarget, OpLevel_0, prim::kPrimInplaceAssign},
{kAllTarget, OpLevel_0, prim::kPrimLog},
{kAllTarget, OpLevel_0, prim::kPrimMaximum},
{kAllTarget, OpLevel_0, prim::kPrimMinimum},
{kAllTarget, OpLevel_0, prim::kPrimMul},
{kAllTarget, OpLevel_0, prim::kPrimNeg},
{kAllTarget, OpLevel_0, prim::kPrimPow},
{kAllTarget, OpLevel_0, prim::kPrimRealDiv},
{kAllTarget, OpLevel_0, prim::kPrimReciprocal},
{kAllTarget, OpLevel_1, prim::kPrimReduceSum},
{kAllTarget, OpLevel_1, prim::kPrimReshape},
{kAllTarget, OpLevel_0, prim::kPrimRound},
{kAllTarget, OpLevel_0, prim::kPrimRsqrt},
{kAllTarget, OpLevel_0, prim::kPrimSqrt},
{kAllTarget, OpLevel_0, prim::kPrimSub},
{kAllTarget, OpLevel_0, prim::kPrimTanh},
{kAllTarget, OpLevel_1, prim::kPrimTranspose},
// ascend
{kAscendDevice, OpLevel_1, prim::kPrimMatMul},
{kAscendDevice, OpLevel_1, prim::kPrimTransData},
{kAscendDevice, OpLevel_1, prim::kPrimBatchMatMul},
// gpu
{kGPUDevice, OpLevel_0, prim::kPrimACos},
{kGPUDevice, OpLevel_0, prim::kPrimAcosh},
{kGPUDevice, OpLevel_1, prim::kPrimArgMax},
{kGPUDevice, OpLevel_1, prim::kPrimArgMin},
{kGPUDevice, OpLevel_0, prim::kPrimAsin},
{kGPUDevice, OpLevel_0, prim::kPrimAsinh},
{kGPUDevice, OpLevel_0, prim::kPrimAssign},
{kGPUDevice, OpLevel_0, prim::kPrimAtan},
{kGPUDevice, OpLevel_0, prim::kPrimAtan2},
{kGPUDevice, OpLevel_0, prim::kPrimCos},
{kGPUDevice, OpLevel_0, prim::kPrimDiv},
{kGPUDevice, OpLevel_0, prim::kPrimErf},
{kGPUDevice, OpLevel_0, prim::kPrimExpm1},
{kGPUDevice, OpLevel_0, prim::kPrimFloor},
{kGPUDevice, OpLevel_0, prim::kPrimFloorDiv},
{kGPUDevice, OpLevel_0, prim::kPrimFloorMod},
{kGPUDevice, OpLevel_0, prim::kPrimGreater},
{kGPUDevice, OpLevel_0, prim::kPrimGreaterEqual},
{kGPUDevice, OpLevel_0, prim::kPrimIsFinite},
{kGPUDevice, OpLevel_0, prim::kPrimIsInf},
{kGPUDevice, OpLevel_0, prim::kPrimIsNan},
{kGPUDevice, OpLevel_0, prim::kPrimLess},
{kGPUDevice, OpLevel_0, prim::kPrimLessEqual},
{kGPUDevice, OpLevel_0, prim::kPrimLogicalAnd},
{kGPUDevice, OpLevel_0, prim::kPrimLogicalOr},
{kGPUDevice, OpLevel_0, prim::kPrimLogicalNot},
{kGPUDevice, OpLevel_0, prim::kPrimMod},
{kGPUDevice, OpLevel_0, prim::kPrimNotEqual},
{kGPUDevice, OpLevel_1, prim::kPrimReduceMax},
{kGPUDevice, OpLevel_1, prim::kPrimReduceMin},
{kGPUDevice, OpLevel_0, prim::kPrimSelect},
{kGPUDevice, OpLevel_0, prim::kPrimSign},
{kGPUDevice, OpLevel_0, prim::kPrimSin},
{kGPUDevice, OpLevel_0, prim::kPrimStridedSlice},
{kGPUDevice, OpLevel_0, prim::kPrimUserDefined},
};
// Keep only the entries whose target and level are allowed by the flags.
const auto &flags = context::GraphKernelFlags::GetInstance();
std::vector<PrimitivePtr> clusterable_ops = GetValidOps(clusterable_ops_with_level, flags.fusion_ops_level);
// Apply the user-specified enable/disable cluster-op lists on top of the level filter.
OpListFilter(&clusterable_ops, flags.enable_cluster_ops_only, flags.enable_cluster_ops, flags.disable_cluster_ops);
return clusterable_ops;
}

namespace {
size_t CountGraphKernelInnerNodes(const AnfNodePtr &node) {
AnfNodePtrList node_list;
kernel::GetValidKernelNodes(AnfAlgo::GetCNodeFuncGraphPtr(node), &node_list);
@@ -115,15 +119,14 @@ size_t CountGraphKernelInnerNodes(const AnfNodePtr &node) {
}
} // namespace

bool IsClusterableOp(const AnfNodePtr &node) {
bool GraphKernelCluster::IsClusterableOp(const AnfNodePtr &node) {
if (AnfAlgo::IsGraphKernel(node)) {
return true;
}
if (IsKeepBasicNode(node)) {
return false;
}
auto op_list = GetClusterableOpList();
bool node_in_oplist = std::any_of(op_list.begin(), op_list.end(),
bool node_in_oplist = std::any_of(op_list_.begin(), op_list_.end(),
[&node](const PrimitivePtr &prim) { return IsPrimitiveCNode(node, prim); });
if (!node_in_oplist) {
return false;
@@ -496,6 +499,7 @@ void GraphKernelCluster::RemoveWildGetitem(std::vector<size_t> *candidates) {
}

void GraphKernelCluster::Init(const FuncGraphPtr &func_graph) {
op_list_ = GetClusterableOpList();
// process cnode only
nodes_ = TopoSort(func_graph->get_return(), SuccIncoming,
[](const AnfNodePtr &node) { return node->isa<CNode>() ? FOLLOW : EXCLUDE; });


+ 3
- 2
mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cluster.h View File

@@ -40,6 +40,8 @@ class GraphKernelCluster : public Pass {
bool Run(const FuncGraphPtr &func_graph) override;

private:
std::vector<PrimitivePtr> GetClusterableOpList();
bool IsClusterableOp(const AnfNodePtr &node);
void Init(const FuncGraphPtr &func_graph);
bool Process(const FuncGraphPtr &func_graph);
std::vector<size_t> FindCandidates(size_t basenode_id);
@@ -57,9 +59,8 @@ class GraphKernelCluster : public Pass {
std::vector<AnfNodePtr> nodes_;
std::unordered_map<AnfNodePtr, size_t> node_idx_map_;
std::stringstream dump_buf_;
std::vector<PrimitivePtr> op_list_;
};

bool IsClusterableOp(const AnfNodePtr &node);
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_CLUSTER_H_

+ 49
- 48
mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_expander.cc View File

@@ -20,6 +20,7 @@
#include <set>
#include <utility>
#include <vector>
#include <tuple>
#include <algorithm>

#include "utils/context/graph_kernel_flags.h"
@@ -40,61 +41,61 @@
namespace mindspore {
namespace opt {
namespace {
using context::OpLevel_0;
using context::OpLevel_1;
constexpr size_t kAssignInputIdx = 1;
constexpr size_t kLambOptimizerInputIdx = 12;
constexpr size_t kLambWeightInputIdx = 4;

std::vector<PrimitivePtr> GetExpandOps() {
std::vector<PrimitivePtr> expand_ops = {
prim::kPrimAddN,
prim::kPrimAssignAdd,
prim::kPrimErfc,
prim::kPrimExpandDims,
prim::kPrimGeLU,
prim::kPrimGeLUGrad,
prim::kPrimSquare,
prim::kPrimTile,
#if ENABLE_D
prim::kLambApplyOptimizerAssign,
prim::kLambApplyWeightAssign,
prim::kPrimClipByNormNoDivSum,
prim::kPrimSqrtGrad,
prim::kSoftmaxGradExt,
prim::kFusedMulAdd,
#elif ENABLE_GPU
prim::kPrimBatchMatMul,
prim::kPrimBiasAdd,
prim::kPrimBiasAddGrad,
prim::kPrimDropout,
prim::kPrimDropoutGrad,
prim::kPrimFusedAdam,
prim::kPrimFusedAdamWeightDecay,
prim::kPrimMaximumGrad,
prim::kPrimMinimumGrad,
prim::kPrimLayerNorm,
prim::kPrimLayerNormGrad,
prim::kPrimLogSoftmax,
prim::kPrimLogSoftmaxGrad,
prim::kPrimMatMul,
prim::kPrimReduceMean,
prim::kPrimRelu,
prim::kPrimReluGrad,
prim::kPrimSigmoid,
prim::kPrimSigmoidGrad,
prim::kPrimSigmoidCrossEntropyWithLogits,
prim::kPrimSigmoidCrossEntropyWithLogitsGrad,
prim::kPrimSlice,
prim::kPrimSoftmax,
prim::kPrimSoftmaxCrossEntropyWithLogits,
prim::kPrimSquaredDifference,
prim::kPrimSqueeze,
prim::kPrimEqualCount,
prim::kPrimSquareSumAll,
prim::kPrimIdentityMath,
prim::kPrimOnesLike,
#endif
std::vector<std::tuple<std::string, unsigned int, PrimitivePtr>> expand_ops_with_level = {
{kAllTarget, OpLevel_0, prim::kPrimAddN},
{kAllTarget, OpLevel_0, prim::kPrimAssignAdd},
{kAllTarget, OpLevel_0, prim::kPrimErfc},
{kAllTarget, OpLevel_1, prim::kPrimExpandDims},
{kAllTarget, OpLevel_0, prim::kPrimGeLU},
{kAllTarget, OpLevel_0, prim::kPrimGeLUGrad},
{kAllTarget, OpLevel_0, prim::kPrimSquare},
{kAllTarget, OpLevel_0, prim::kPrimTile},
{kAscendDevice, OpLevel_0, prim::kLambApplyOptimizerAssign},
{kAscendDevice, OpLevel_0, prim::kLambApplyWeightAssign},
{kAscendDevice, OpLevel_0, prim::kPrimClipByNormNoDivSum},
{kAscendDevice, OpLevel_0, prim::kPrimSqrtGrad},
{kAscendDevice, OpLevel_1, prim::kSoftmaxGradExt},
{kAscendDevice, OpLevel_0, prim::kFusedMulAdd},
{kGPUDevice, OpLevel_1, prim::kPrimBatchMatMul},
{kGPUDevice, OpLevel_0, prim::kPrimBiasAdd},
{kGPUDevice, OpLevel_1, prim::kPrimBiasAddGrad},
{kGPUDevice, OpLevel_0, prim::kPrimDropout},
{kGPUDevice, OpLevel_0, prim::kPrimDropoutGrad},
{kGPUDevice, OpLevel_0, prim::kPrimFusedAdam},
{kGPUDevice, OpLevel_0, prim::kPrimFusedAdamWeightDecay},
{kGPUDevice, OpLevel_1, prim::kPrimMaximumGrad},
{kGPUDevice, OpLevel_1, prim::kPrimMinimumGrad},
{kGPUDevice, OpLevel_1, prim::kPrimLayerNorm},
{kGPUDevice, OpLevel_1, prim::kPrimLayerNormGrad},
{kGPUDevice, OpLevel_0, prim::kPrimLogSoftmax},
{kGPUDevice, OpLevel_0, prim::kPrimLogSoftmaxGrad},
{kGPUDevice, OpLevel_1, prim::kPrimMatMul},
{kGPUDevice, OpLevel_1, prim::kPrimReduceMean},
{kGPUDevice, OpLevel_0, prim::kPrimRelu},
{kGPUDevice, OpLevel_0, prim::kPrimReluGrad},
{kGPUDevice, OpLevel_0, prim::kPrimSigmoid},
{kGPUDevice, OpLevel_0, prim::kPrimSigmoidGrad},
{kGPUDevice, OpLevel_0, prim::kPrimSigmoidCrossEntropyWithLogits},
{kGPUDevice, OpLevel_0, prim::kPrimSigmoidCrossEntropyWithLogitsGrad},
{kGPUDevice, OpLevel_0, prim::kPrimSlice},
{kGPUDevice, OpLevel_1, prim::kPrimSoftmax},
{kGPUDevice, OpLevel_1, prim::kPrimSoftmaxCrossEntropyWithLogits},
{kGPUDevice, OpLevel_0, prim::kPrimSquaredDifference},
{kGPUDevice, OpLevel_0, prim::kPrimSqueeze},
{kGPUDevice, OpLevel_0, prim::kPrimEqualCount},
{kGPUDevice, OpLevel_0, prim::kPrimSquareSumAll},
{kGPUDevice, OpLevel_0, prim::kPrimIdentityMath},
{kGPUDevice, OpLevel_0, prim::kPrimOnesLike},
};
const auto &flags = context::GraphKernelFlags::GetInstance();
std::vector<PrimitivePtr> expand_ops = GetValidOps(expand_ops_with_level, flags.fusion_ops_level);
OpListFilter(&expand_ops, flags.enable_expand_ops_only, flags.enable_expand_ops, flags.disable_expand_ops);
return expand_ops;
}


+ 16
- 0
mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc View File

@@ -951,5 +951,21 @@ void EliminateRedundantParameters(const FuncGraphPtr &func_graph, AnfNodePtrList
func_graph->set_parameters(new_parameter);
*inputs = std::move(new_inputs);
}

std::vector<PrimitivePtr> GetValidOps(
const std::vector<std::tuple<std::string, unsigned int, PrimitivePtr>> &ops_with_level, unsigned int level) {
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
std::string target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
std::vector<PrimitivePtr> valid_ops;
for (const auto &[op_target, op_level, op] : ops_with_level) {
if (op_target == kAllTarget || op_target == target) {
if (level >= op_level) {
valid_ops.emplace_back(op);
}
}
}
return valid_ops;
}
} // namespace opt
} // namespace mindspore

+ 4
- 0
mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h View File

@@ -47,6 +47,7 @@ constexpr auto kGetGraphKernelOpExpander = "get_op_expander";
constexpr auto kJsonKeyMultiGraph = "multi_graph";
constexpr auto kJsonKeyGraphDesc = "graph_desc";
constexpr auto kJsonKeyGraphMode = "graph_mode";
constexpr auto kAllTarget = "ALL";

constexpr auto kGraphKernelDumpPath = "graph_kernel_dump";
inline const PrimitivePtr kPrimUnPadAkg = std::make_shared<Primitive>("UnPadAkg");
@@ -141,6 +142,9 @@ FuncGraphPtr LiteGraph2AnfGraph(const graphkernel::LiteGraphPtr &lite_graph, Anf

// remove parameter which is not used
void EliminateRedundantParameters(const FuncGraphPtr &func_graph, AnfNodePtrList *inputs);

std::vector<PrimitivePtr> GetValidOps(
const std::vector<std::tuple<std::string, unsigned int, PrimitivePtr>> &ops_with_level, unsigned int level);
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_

+ 5
- 0
mindspore/ccsrc/utils/context/graph_kernel_flags.cc View File

@@ -170,6 +170,9 @@ void GraphKernelFlags::Refresh() {

void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_map) {
FlagRegister reg(flag_map);
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
bool is_gpu = (context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);

// Set opt_level first, some flags' default value depends on it.
// Default optimization level is level 2 when enable graphkernel
@@ -185,6 +188,7 @@ void GraphKernelFlags::RegisterFlags(std::map<std::string, std::string> *flag_ma
reg.AddFlag("enable_recompute_fusion", &enable_recompute_fusion, opt_level >= OptLevel_2);
reg.AddFlag("enable_parallel_fusion", &enable_parallel_fusion, opt_level == OptLevel_3);
reg.AddFlag("enable_low_precision", &enable_low_precision);
reg.AddFlag("fusion_ops_level", &fusion_ops_level, is_gpu ? OpLevel_MAX : OpLevel_0);

// Integer flags
reg.AddFlag("online_tuning", &online_tuning);
@@ -215,6 +219,7 @@ std::string GraphKernelFlags::DumpAllFlags() const {
json["enable_low_precision"] = enable_low_precision;

json["opt_level"] = opt_level;
json["fusion_ops_level"] = fusion_ops_level;
json["online_tuning"] = online_tuning;

json["repository_path"] = repository_path;


+ 14
- 5
mindspore/ccsrc/utils/context/graph_kernel_flags.h View File

@@ -32,6 +32,10 @@ constexpr unsigned int OptLevel_2 = 2; // Default functions
constexpr unsigned int OptLevel_3 = 3; // Experimental functions
constexpr unsigned int OptLevel_MAX = 4;

constexpr unsigned int OpLevel_0 = 0;
constexpr unsigned int OpLevel_1 = 1;
constexpr unsigned int OpLevel_MAX = 2;

class GraphKernelFlags {
public:
static const GraphKernelFlags &GetInstance() {
@@ -65,26 +69,31 @@ class GraphKernelFlags {
*
* Experimental feature, enabled by default when opt_level=3
*/
bool enable_stitch_fusion;
bool enable_stitch_fusion{false};

/**
* Enable recompute fusion in graph kernel fusion strategy, enabled when op_level>=2.
*/
bool enable_recompute_fusion;
bool enable_recompute_fusion{false};

/**
* Enable parallel fusion in graph kernel fusion strategy.
*
* Experimental feature, enabled by default when opt_level=3
*/
bool enable_parallel_fusion;
bool enable_parallel_fusion{false};

/**
* Enable low precision in data transfer between graph kernels and in computation
* within graph kernels.
* Experimental feature, enabled by the enable_low_precision flag
*/
bool enable_low_precision;
bool enable_low_precision{false};

/**
* Expand and cluster AKG's operators by level.
*/
unsigned int fusion_ops_level{OpLevel_0};

/**
* Optimization level, value from 0 to 3.
@@ -95,7 +104,7 @@ class GraphKernelFlags {
* The default value is OptLevel_2 when the context "enable_graph_kernel" is set,
* but if it's also changed in "graph_kernel_flags", then the "graph_kernel_flags" will prevail.
*/
unsigned int opt_level; // defaults 0 or 2
unsigned int opt_level{0}; // defaults 0 or 2

/**
* Online tuning level, value from 0 to 3.


Loading…
Cancel
Save