[GraphKernel] fix code check.

4 years ago · e5af40d31c
--- a/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.h
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/callback_impl.h
@@ -23,6 +23,7 @@
 namespace mindspore::graphkernel {
 class CallbackImpl : public Callback {
 public:
  virtual ~CallbackImpl() = default;
  ShapeVector GetInputShape(const AnfNodePtr &node, size_t i) override;
  ShapeVector GetOutputShape(const AnfNodePtr &node, size_t i) override;
  ShapeVector GetInputInferShape(const AnfNodePtr &node, size_t i) override;
--- a/mindspore/ccsrc/common/graph_kernel/adapter/fake_abstract_shape.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/fake_abstract_shape.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@

 namespace mindspore::graphkernel {
 namespace {
 class AbstractShapeCreator {
 class FakeAbstractShape {
 public:
  using AbstractShapeTransferFunc = std::function<ShapeVector(const ShapeVector &)>;
  /**
@@ -49,10 +49,14 @@ class AbstractShapeCreator {
  static ShapeVector NchwAbstractShape(const ShapeVector &device_shape) { return device_shape; }
  static ShapeVector NhwcAbstractShape(const ShapeVector &device_shape) {
    const size_t nhwc_size = 4;
    const size_t index_n = 0;
    const size_t index_h = 1;
    const size_t index_w = 2;
    const size_t index_c = 3;
    if (device_shape.size() != nhwc_size) {
      MS_LOG(EXCEPTION) << "Shape size of NHWC should be 4, but got " << device_shape.size();
    }
    return {device_shape[0], device_shape[3], device_shape[1], device_shape[2]};
    return {device_shape[index_n], device_shape[index_c], device_shape[index_h], device_shape[index_w]};
  }
  static ShapeVector FractalNzAbstractShape(const ShapeVector &device_shape) {
    if (device_shape.size() == 1 && (device_shape[0] == 1 || static_cast<size_t>(device_shape[0]) % kCubeSize == 0)) {
@@ -64,12 +68,16 @@ class AbstractShapeCreator {
    }
    ShapeVector shape;
    size_t dims = device_shape.size();
    size_t batch = dims - 4;
    size_t batch = dims - nz_size;
    for (size_t i = 0; i < batch; ++i) {
      shape.push_back(device_shape[i]);
    }
    int64_t m = device_shape[dims - 3] * device_shape[dims - 2];
    int64_t n = device_shape[dims - 4] * device_shape[dims - 1];
    const size_t index_m1 = 3;
    const size_t index_m2 = 2;
    const size_t index_n1 = 4;
    const size_t index_n2 = 1;
    int64_t m = device_shape[dims - index_m1] * device_shape[dims - index_m2];
    int64_t n = device_shape[dims - index_n1] * device_shape[dims - index_n2];
    shape.push_back(m);
    shape.push_back(n);

@@ -79,6 +87,6 @@ class AbstractShapeCreator {
 }  // namespace

 ShapeVector GetFakeAbstractShape(const ShapeVector &device_shape, const std::string &format) {
  return AbstractShapeCreator::GetFakeAbstractShape(device_shape, format);
  return FakeAbstractShape::GetFakeAbstractShape(device_shape, format);
 }
 }  // namespace mindspore::graphkernel
--- a/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_expander_with_py.h
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_expander_with_py.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -24,12 +24,18 @@

 namespace mindspore::graphkernel {
 // Expander derived from DefaultExpander. It overrides CreateExpandFuncGraph and exposes
 // ExpandJsonInfo as a virtual hook that derived expanders can replace.
 class PyExpander : public DefaultExpander {
  public:
   virtual ~PyExpander() = default;

  protected:
   // Virtual hook receiving the node and an output json pointer; derived classes
   // (e.g. ComplexOpExpander) provide their own version.
   // NOTE(review): presumably fills *kernel_json and returns whether that succeeded -- confirm in the .cc file.
   virtual bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json);
   // Overrides the base-class creation of the expanded func graph for the given CNode.
   FuncGraphPtr CreateExpandFuncGraph(const CNodePtr &node) override;
 };

 class ComplexOpExpander : public PyExpander {
 public:
  virtual ~ComplexOpExpander() = default;

 protected:
  bool ExpandJsonInfo(const AnfNodePtr &node, nlohmann::json *kernel_json);
 };
--- a/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_optimization.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_optimization.cc
@@ -67,21 +67,21 @@ inline unsigned int GetPassLevelByFlag(bool flag) { return flag ? OptLevel_1 : O
 PassManagerPtr GraphKernelOptimizer::PreProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(0, "preprocess");
  // Do DependElimination before all passes of graphkernel
  pm->AddPass(std::make_shared<DependElimination>(), OptLevel_1);
  pm->Add(std::make_shared<DependElimination>(), OptLevel_1);

  // Do cse before all passes of graphkernel
  pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);
  pm->Add(std::make_shared<CommonSubexpressionElimination>("cse1"), OptLevel_1);

  // Save the original output info
  pm->AddPass(std::make_shared<SaveOutputShape>(), OptLevel_1);
  pm->Add(std::make_shared<SaveOutputShape>(), OptLevel_1);

  // Change Assign(p, a, U) to Assign(Depend(p, U), a)
  pm->AddPass(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<SplitAssign>(), OptLevel_1, is_gpu);

  // Spread the MakeTuple input of UpdateState
  pm->AddPass(std::make_shared<SpreadUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<SpreadUpdateState>(), OptLevel_1);
  // Eliminate the common nodes that are generated in SpreadUpdateState
  pm->AddPass(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1);
  pm->Add(std::make_shared<CommonSubexpressionElimination>("cse2"), OptLevel_1);
  return pm;
 }

@@ -89,16 +89,16 @@ PassManagerPtr GraphKernelOptimizer::Cluster() const {
  auto pm = std::make_shared<GraphKernelPassManager>(1, "cluster");

  // Expand complex op to composite kernels
  pm->AddPass(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<GraphKernelComplexExpander>(), OptLevel_1, is_gpu);

  // Expand complex basic kernels to composite kernels
  pm->AddPass(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelExpanderWithPy>(), OptLevel_1);

  // Cluster basic kernels and composite kernels
  pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1);

  // Eliminate the outputs without external user
  pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
 }

@@ -106,29 +106,29 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt1() const {
  auto pm = std::make_shared<GraphKernelPassManager>(2, "highlevelopt1");

  // Remove redundant Cast(bias, fp16) for Matmul input
  pm->AddPass(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);
  pm->Add(std::make_shared<CastMatmulFusion>(), OptLevel_2, is_ascend);

  // Reorder Cast and Type-insensitive node
  pm->AddPass(std::make_shared<ReorderOps>(), OptLevel_2);
  pm->Add(std::make_shared<ReorderOps>(), OptLevel_2);

  // normalize the Reduce axis
  pm->AddPass(std::make_shared<AxisNormalizer>(), OptLevel_1);
  pm->Add(std::make_shared<AxisNormalizer>(), OptLevel_1);

  // Cast the input of ReduceSum from float16 to float32 for higher precision
  pm->AddPass(std::make_shared<RaiseReductionPrecision>(), OptLevel_2);
  pm->Add(std::make_shared<RaiseReductionPrecision>(), OptLevel_2);

  // Insert PadAkg and UnPadAkg Ops for MatMul
  pm->AddPass(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<InsertPadOps>(), OptLevel_1, is_gpu);

  // Universal arithmetic simplify
  pm->AddPass(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);
  pm->Add(std::make_shared<ArithmeticSimplify>(), OptLevel_2, is_gpu || is_cpu);

  // Common subexpression elimination
  pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_2);
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_2);

  // Eliminate unnecessary transform ops
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_trans_op_optimize);
  pm->AddPass(std::make_shared<TransformOpOptimizer>(), level, is_gpu);
  pm->Add(std::make_shared<TransformOpOptimizer>(), level, is_gpu);
  return pm;
 }

@@ -137,21 +137,21 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
  // Make certain nodes redundant so that they are used by only one user,
  // which can avoid unnecessary input-output and get better performance.
  // preprocess for ShapeOpsSplitter
  pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
  pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
  pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);

  // Split kernel according to costmodel
  pm->AddPass(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelSplitterWithPy>(), OptLevel_1);

  // After Simplify and Splitter, a lot of redundant getitem/maketuple
  // will be exposed, use GetitemTuple Pass to delete them.
  pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
  pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->AddPass(std::make_shared<GraphKernelCSE>(), OptLevel_1);
  pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelCSE>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
 }

@@ -161,30 +161,30 @@ PassManagerPtr GraphKernelOptimizer::HighLevelOpt2() const {
  auto &flags = GraphKernelFlags::GetInstance();
  // Auto recompute according to local memory burst.
  auto recompute_lv = GetPassLevelByFlag(flags.recompute_increment_threshold > 0 || flags.recompute_peak_threshold > 0);
  pm->AddPass(std::make_shared<GraphKernelRecompute>(), recompute_lv);
  pm->Add(std::make_shared<GraphKernelRecompute>(), recompute_lv);

  // Replace Assign with InplaceAssign, and replace original output with overridden parameters
  pm->AddPass(std::make_shared<OptimizeAssign>(), OptLevel_2);
  pm->Add(std::make_shared<OptimizeAssign>(), OptLevel_2);

  pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->AddPass(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), std::min(recompute_lv, OptLevel_2));
  pm->Add(std::make_shared<EliminateRedundantOutput>(), std::min(recompute_lv, OptLevel_2));

  // Enable atomic add
  pm->AddPass(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);
  pm->Add(std::make_shared<AtomicCleanInsertter>(), OptLevel_2, is_gpu || is_ascend);

  // Enable atomic add for stitch nodes.
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_stitch_fusion);
  pm->AddPass(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu);
  pm->Add(std::make_shared<StitchAtomicCleanInsertter>(), level, is_gpu);

  // Enable low precision
  auto level_low_precision = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_low_precision);
  pm->AddPass(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
  pm->AddPass(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);
  pm->Add(std::make_shared<DecreaseTransferPrecision>(), level_low_precision);
  pm->Add(std::make_shared<DecreaseComputePrecision>(), level_low_precision, is_ascend);

  // Enable tsa and uss
  pm->AddPass(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
  pm->AddPass(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<TsaAtomicAddToFirstTensor>(), OptLevel_1, is_gpu);
  pm->Add(std::make_shared<UssAtomicAdd>(), OptLevel_1, is_gpu);

  return pm;
 }
@@ -197,9 +197,8 @@ PassManagerPtr GraphKernelOptimizer::Combine() const {
  auto target = context_ptr->get_param<std::string>(MS_CTX_DEVICE_TARGET);
  auto level = GetPassLevelByFlag(GraphKernelFlags::GetInstance().enable_parallel_fusion);
  // Atomic-add GraphKernel node may be linked directly to UpdateState, it should be spread before parallel fusion!
  pm->AddPass(std::make_shared<SpreadUpdateState>(), level);
  pm->AddPass(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level,
              is_gpu || is_ascend);
  pm->Add(std::make_shared<SpreadUpdateState>(), level);
  pm->Add(std::make_shared<ParallelOpFusion>(target, ParallelConfig(PARALLEL_OPS_LIMIT)), level, is_gpu || is_ascend);

  return pm;
 }
@@ -207,17 +206,17 @@ PassManagerPtr GraphKernelOptimizer::Combine() const {
 PassManagerPtr GraphKernelOptimizer::PostProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(6, "postprocess");
  // Make Tuple for the inputs of UpdateState. (the reverse of SpreadUpdateState)
  pm->AddPass(std::make_shared<ShrinkUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<ShrinkUpdateState>(), OptLevel_1);

  // Recover the original output info
  pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->AddPass(std::make_shared<RewriteOutputShape>(), OptLevel_1);
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->Add(std::make_shared<RewriteOutputShape>(), OptLevel_1);

  // Reduce fake output memory.
  pm->AddPass(std::make_shared<ReduceFakeOutMem>(), OptLevel_1);
  pm->Add(std::make_shared<ReduceFakeOutMem>(), OptLevel_1);

  // Add the new tensors to the kernel_graph
  pm->AddPass(std::make_shared<BindValueToGraph>(), OptLevel_1);
  pm->Add(std::make_shared<BindValueToGraph>(), OptLevel_1);
  return pm;
 }

@@ -242,5 +241,8 @@ void GraphKernelOptimizer::Run(const KernelGraphPtr &kernel_graph) {
  (void)optimizer->Optimize(kernel_graph);
 }

 void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) { GraphKernelOptimizer().Run(kernel_graph); }
 void GraphKernelOptimize(const KernelGraphPtr &kernel_graph) {
  GraphKernelOptimizer graph_kernel_optimizer;
  graph_kernel_optimizer.Run(kernel_graph);
 }
 }  // namespace mindspore::graphkernel
--- a/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.cc
@@ -24,7 +24,7 @@
 #include "debug/anf_ir_dump.h"

 namespace mindspore::graphkernel {
 void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
 void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
  MS_EXCEPTION_IF_NULL(pass);
  auto pass_id = passes_.size();
  auto pass_name = pass->name();
--- a/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.h
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_pass_manager.h
@@ -33,7 +33,7 @@ class GraphKernelPassManager : public PassManager {
  ~GraphKernelPassManager() = default;

  // Add graph pass, the pass object will be freed when the pass manager is freed.
  virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
  void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);

  // Run passes on the func_graph
  bool Run(const FuncGraphPtr &func_graph) const override;
--- a/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_splitter_with_py.cc
+++ b/mindspore/ccsrc/common/graph_kernel/adapter/graph_kernel_splitter_with_py.cc
@@ -26,6 +26,7 @@
 #include "include/common/utils/python_adapter.h"
 #include "kernel/akg/akg_kernel_json_generator.h"
 #include "kernel/common_utils.h"
 #include "common/graph_kernel/core/graph_kernel_utils.h"
 #include "common/graph_kernel/graph_kernel_helper.h"
 #include "include/common/utils/context/graph_kernel_flags.h"

@@ -107,13 +108,13 @@ class CostModelSplitSchemer : public SplitSchemer {
        MS_LOG(ERROR) << "Failed decode sub graph, " << graph_desc;
        return false;
      }
      split_plan_.emplace_back(std::move(res_graph));
      (void)split_plan_.emplace_back(std::move(res_graph));
    }

    // ops to be inlined.
    need_inline_.clear();
    std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_),
                   [](const std::string &mode) { return mode == "basic" ? 1 : 0; });
    (void)std::transform(graph_modes.begin(), graph_modes.end(), std::back_inserter(need_inline_),
                         [](const std::string &mode) { return mode == "basic" ? 1 : 0; });
    return true;
  }

@@ -151,8 +152,8 @@ class CostModelSplitSchemer : public SplitSchemer {
  virtual void GetValidKernelNodes() {
    topo_all_nodes_ = TopoSort(func_graph_->get_return());
    topo_valid_nodes_.clear();
    std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_),
                 [this](const AnfNodePtr &node) { return IsValidKernelNode(node); });
    (void)std::copy_if(topo_all_nodes_.begin(), topo_all_nodes_.end(), std::back_inserter(topo_valid_nodes_),
                       [this](const AnfNodePtr &node) { return IsValidKernelNode(node); });
  }

  void MapNodeGroup() {
@@ -175,14 +176,14 @@ class CostModelSplitSchemer : public SplitSchemer {
    if (IsValidKernelNode(output)) {
      auto group_id = node_group_[output];
      node_group_[ret_node] = group_id;
      split_plan_[group_id].emplace_back(ret_node);
      (void)split_plan_[group_id].emplace_back(ret_node);
      return;
    }
    // assign the make_tuple node to a new group.
    if (common::AnfAlgo::CheckPrimitiveType(output, prim::kPrimMakeTuple)) {
      auto group_id = split_plan_.size();
      split_plan_.emplace_back(AnfNodePtrList{output, ret_node});
      need_inline_.emplace_back(1);
      (void)split_plan_.emplace_back(AnfNodePtrList{output, ret_node});
      (void)need_inline_.emplace_back(1);
      node_group_[output] = group_id;
      node_group_[ret_node] = group_id;
      return;
--- a/mindspore/ccsrc/common/graph_kernel/add_atomic_clean.cc
+++ b/mindspore/ccsrc/common/graph_kernel/add_atomic_clean.cc
@@ -52,8 +52,9 @@ std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = fal
  }
  auto axis_vec = GetReduceAxis(node);
  if (axis_vec.empty()) {
    axis_vec.resize(src_shape_vec.size());
    for (size_t i = 0; i < src_shape_vec.size(); ++i) {
      (void)axis_vec.emplace_back(i);
      axis_vec[i] = i;
    }
  } else {
    (void)std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), [&src_shape_vec](int64_t axis) -> int64_t {
@@ -281,15 +282,8 @@ void AtomicCleanInsertter::CorrectKernelBuildInfo(
    new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second));
  }

  kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder;
  new_info_builder.SetInputsFormat(new_inputs_format);
  new_info_builder.SetInputsDeviceType(new_inputs_type);
  new_info_builder.SetOutputsFormat(new_outputs_format);
  new_info_builder.SetOutputsDeviceType(new_outputs_type);
  new_info_builder.SetProcessor(origin_processor);
  new_info_builder.SetKernelType(KernelType::AKG_KERNEL);
  new_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
  auto new_selected_info = new_info_builder.Build();
  auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format,
                                                      new_outputs_type, origin_processor);
  AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get());
 }

--- a/mindspore/ccsrc/common/graph_kernel/add_atomic_clean.h
+++ b/mindspore/ccsrc/common/graph_kernel/add_atomic_clean.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2020-2021 Huawei Technologies Co., Ltd
 * Copyright 2020-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ class AtomicAddChecker {
  std::vector<AtomicAddInfo> GetAtomicAddInfo() { return atomic_add_infos_; }

 protected:
  virtual bool SuitableForAtomicAdd(const AnfNodePtr &node) { return false; }
  virtual bool SuitableForAtomicAdd(const AnfNodePtr &) { return false; }
  virtual bool FindCandidate(const AnfNodePtr &anf_node);
  virtual bool CanActivateAtomicAdd(const AnfNodePtr &anf_node);
  std::vector<AtomicAddInfo> atomic_add_infos_;
--- a/mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.cc
+++ b/mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.cc
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -52,15 +52,8 @@ void StitchAtomicCleanInsertter::CorrectKernelBuildInfo(
  new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second));
  new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second));

  kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder;
  new_info_builder.SetInputsFormat(new_inputs_format);
  new_info_builder.SetInputsDeviceType(new_inputs_type);
  new_info_builder.SetOutputsFormat(new_outputs_format);
  new_info_builder.SetOutputsDeviceType(new_outputs_type);
  new_info_builder.SetProcessor(origin_processor);
  new_info_builder.SetKernelType(KernelType::AKG_KERNEL);
  new_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
  auto new_selected_info = new_info_builder.Build();
  auto new_selected_info = BuildSelectKernelBuildInfo(new_inputs_format, new_inputs_type, new_outputs_format,
                                                      new_outputs_type, origin_processor);
  AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get());
 }

@@ -124,7 +117,7 @@ void StitchAtomicCleanInsertter::ProcessOriginCNode(
  for (const auto &[user_node, index] : reduce_user_nodes) {
    auto user_cnode = user_node->cast<CNodePtr>();
    MS_EXCEPTION_IF_NULL(user_cnode);
    user_cnode->set_input(static_cast<size_t>(index), parameter);
    user_cnode->set_input(IntToSize(index), parameter);
    if (!connected) {
      std::vector<std::pair<AnfNodePtr, int>> user_user = FindInnerCNodeUsers(stitch_node_, user_cnode);
      if (!user_user.empty()) {
@@ -154,8 +147,8 @@ std::vector<std::pair<AnfNodePtr, int>> StitchAtomicCleanInsertter::FindInnerCNo
  }
  std::vector<std::pair<AnfNodePtr, int>> inner_user_nodes;
  auto users = mng_sub->node_users()[target];
  std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes),
                 [](const std::pair<AnfNodePtr, int> &pair) { return pair; });
  (void)std::transform(users.cbegin(), users.cend(), std::back_inserter(inner_user_nodes),
                       [](const std::pair<AnfNodePtr, int> &pair) { return pair; });
  return inner_user_nodes;
 }

--- a/mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.h
+++ b/mindspore/ccsrc/common/graph_kernel/add_stitch_atomic_clean_gpu.h
@@ -1,5 +1,5 @@
 /**
 * Copyright 2021 Huawei Technologies Co., Ltd
 * Copyright 2021-2022 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -32,16 +32,18 @@ class StitchAtomicCleanInsertter : public AtomicCleanInsertter {
  ~StitchAtomicCleanInsertter() override = default;
  bool Run(const FuncGraphPtr &func_graph) override;

 private:
 protected:
  void CorrectKernelBuildInfo(const AnfNodePtr &composite_node,
                              const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &clean_infos) override;
  void ProcessOriginCNode(
    const AnfNodePtr &composite_node,
    const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override;

 private:
  CNodePtr CreateInplaceAssignNode(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter,
                                   const AtomicAddInfo &info) const;
  std::vector<std::pair<AnfNodePtr, int>> FindInnerCNodeUsers(const AnfNodePtr &inner_node,
                                                              const CNodePtr &target) const;
  void ProcessOriginCNode(
    const AnfNodePtr &composite_node,
    const std::vector<std::pair<AtomicAddInfo, AnfNodePtr>> &info_and_broadcast_to_nodes) override;
  std::pair<bool, AtomicAddInfo> IsStitchWithAtomic(const AnfNodePtr &anf_node);

  void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node,
--- a/mindspore/ccsrc/common/graph_kernel/cast_matmul_fusion.cc
+++ b/mindspore/ccsrc/common/graph_kernel/cast_matmul_fusion.cc
@@ -33,8 +33,8 @@ void UpdateBuildInfo(const AnfNodePtr &matmul_node, const AnfNodePtr &cast_node)
  input_types.push_back(cast_types.front());
  std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node);
  std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node);
  auto graph_sel_info =
    BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node);
  auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
                                                   AnfAlgo::GetProcessor(matmul_node));
  AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get());
 }

--- a/mindspore/ccsrc/common/graph_kernel/core/graph_kernel_callback.h
+++ b/mindspore/ccsrc/common/graph_kernel/core/graph_kernel_callback.h
@@ -169,6 +169,6 @@ class CallbackImplRegister {
 };

 #define GRAPH_KERNEL_CALLBACK_REGISTER(cls) \
  static const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); })
  const CallbackImplRegister g_graphkernel_callback([]() { return static_cast<Callback *>(new cls()); })
 }  // namespace mindspore::graphkernel
 #endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CORE_GRAPH_KERNEL_CALLBACK_H_
--- a/mindspore/ccsrc/common/graph_kernel/decrease_compute_precision.cc
+++ b/mindspore/ccsrc/common/graph_kernel/decrease_compute_precision.cc
@@ -74,7 +74,8 @@ void UpdateOutputInfo(const AnfNodePtr &cnode) {
    }
    std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(cnode);
    std::vector<TypeId> output_types = {TypeId::kNumberTypeFloat16};
    auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode);
    auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
                                                     AnfAlgo::GetProcessor(cnode));
    AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, cnode.get());
  }
 }
--- a/mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.cc
+++ b/mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.cc
@@ -91,33 +91,27 @@ AbstractBasePtr GetOutputAbstract(const AnfNodePtr &node, size_t output_idx) {
  return out_spec;
 }

 // Rebuild as node inputs or outputs have changed, processor comes from node itself
 // Build for new node, processor comes from context
 kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
                                                      const std::vector<TypeId> &inputs_type,
                                                      const std::vector<std::string> &output_formats,
                                                      const std::vector<TypeId> &output_types, const AnfNodePtr &node) {
  kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder;
  graph_info_builder.SetInputsFormat(inputs_format);
  graph_info_builder.SetInputsDeviceType(inputs_type);
  graph_info_builder.SetOutputsFormat(output_formats);
  graph_info_builder.SetOutputsDeviceType(output_types);
  graph_info_builder.SetProcessor(AnfAlgo::GetProcessor(node));
  graph_info_builder.SetKernelType(KernelType::AKG_KERNEL);
  graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
  return graph_info_builder.Build();
                                                      const std::vector<TypeId> &output_types) {
  return BuildSelectKernelBuildInfo(inputs_format, inputs_type, output_formats, output_types,
                                    kernel::GetProcessorFromContext());
 }

 // Build for new node, processor comes from context
 // Build for new node with given processor
 kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
                                                      const std::vector<TypeId> &inputs_type,
                                                      const std::vector<std::string> &output_formats,
                                                      const std::vector<TypeId> &output_types) {
                                                      const std::vector<TypeId> &output_types,
                                                      const kernel::Processor &processor) {
  kernel::KernelBuildInfo::KernelBuildInfoBuilder graph_info_builder;
  graph_info_builder.SetInputsFormat(inputs_format);
  graph_info_builder.SetInputsDeviceType(inputs_type);
  graph_info_builder.SetOutputsFormat(output_formats);
  graph_info_builder.SetOutputsDeviceType(output_types);
  graph_info_builder.SetProcessor(kernel::GetProcessorFromContext());
  graph_info_builder.SetProcessor(processor);
  graph_info_builder.SetKernelType(KernelType::AKG_KERNEL);
  graph_info_builder.SetFusionType(kernel::FusionType::OPAQUE);
  return graph_info_builder.Build();
--- a/mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.h
+++ b/mindspore/ccsrc/common/graph_kernel/graph_kernel_helper.h
@@ -57,11 +57,12 @@ void SetNewKernelInfo(const AnfNodePtr &new_node, const FuncGraphPtr &fg, const
 kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
                                                      const std::vector<TypeId> &inputs_type,
                                                      const std::vector<std::string> &output_formats,
                                                      const std::vector<TypeId> &output_types, const AnfNodePtr &node);
                                                      const std::vector<TypeId> &output_types);
 kernel::KernelBuildInfoPtr BuildSelectKernelBuildInfo(const std::vector<std::string> &inputs_format,
                                                      const std::vector<TypeId> &inputs_type,
                                                      const std::vector<std::string> &output_formats,
                                                      const std::vector<TypeId> &output_types);
                                                      const std::vector<TypeId> &output_types,
                                                      const kernel::Processor &processor);
 bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc);
 bool AnfToJsonDesc(const AnfNodePtrList &nodes, const DumpOption &dump_option, nlohmann::json *op_desc,
                   std::map<std::string, AnfNodePtr> *address_node_map);
--- a/mindspore/ccsrc/common/graph_kernel/insert_pad.cc
+++ b/mindspore/ccsrc/common/graph_kernel/insert_pad.cc
@@ -229,8 +229,8 @@ void UpdateMatmulInfo(const AnfNodePtr &matmul_node, const vec &unpad_shape, con
  std::vector<TypeId> input_types = AnfAlgo::GetAllInputDeviceTypes(matmul_node);
  std::vector<std::string> output_formats = AnfAlgo::GetAllOutputFormats(matmul_node);
  std::vector<TypeId> output_types = AnfAlgo::GetAllOutputDeviceTypes(matmul_node);
  auto graph_sel_info =
    BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, matmul_node);
  auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
                                                   AnfAlgo::GetProcessor(matmul_node));
  AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, matmul_node.get());
 }

--- a/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_optimization.cc
+++ b/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_optimization.cc
@@ -39,14 +39,14 @@ using opt::GraphOptimizer;
 // Builds and returns the pass manager for the "cluster" stage (stage id 0).
 // Every pass registered here uses OptLevel_1 as its pass level.
 // NOTE(review): this is a diff listing whose +/- markers were dropped, so each registration shows up
 // twice: once through AddPass (presumably the pre-change line) and once through Add (presumably the
 // post-change line). Only one of the two forms exists in either version of the real file.
 PassManagerPtr GraphKernelOptimizer::Cluster() const {
  auto pm = std::make_shared<GraphKernelPassManager>(0, "cluster");
  // Expand complex basic kernels to composite kernels
  pm->AddPass(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelExpanderLite>(), OptLevel_1);

  // Cluster basic kernels and composite kernels
  pm->AddPass(std::make_shared<GraphKernelCluster>(), OptLevel_1);
  pm->AddPass(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelCluster>(), OptLevel_1);
  pm->Add(std::make_shared<ConvertConstInputToAttr>(), OptLevel_1);

  // Eliminate the outputs without external user
  pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
 }

@@ -55,27 +55,27 @@ PassManagerPtr GraphKernelOptimizer::Split() const {
  // Make certain nodes redundant so that they are used by only one user,
  // which can avoid unnecessary input-output and get better performance.
  // preprocess for ShapeOpsSplitter
  pm->AddPass(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<ExtendOutputForUpdateState>(), OptLevel_1);
  std::vector<PrimitivePtr> duplicated_ops = {prim::kPrimReshape};
  pm->AddPass(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);
  pm->Add(std::make_shared<ShapeOpsSplitter>(duplicated_ops), OptLevel_1);

  // Split kernel according to costmodel
  pm->AddPass(std::make_shared<GraphKernelSplitter>(), OptLevel_1);
  pm->Add(std::make_shared<GraphKernelSplitter>(), OptLevel_1);

  // After Simplify and Splitter, a lot of redundant getitem/maketuple
  // will be exposed, use GetitemTuple Pass to delete them.
  pm->AddPass(std::make_shared<GetitemTuple>(), OptLevel_1);
  pm->Add(std::make_shared<GetitemTuple>(), OptLevel_1);

  // Eliminate the redundant node that is copied above but not handled by GraphKernelSplitter
  pm->AddPass(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->AddPass(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  pm->Add(std::make_shared<MergeOutputForUpdateState>(), OptLevel_1);
  pm->Add(std::make_shared<EliminateRedundantOutput>(), OptLevel_1);
  return pm;
 }

 // Builds and returns the pass manager for the "postprocess" stage (stage id 2).
 // It registers a single KernelBuilder pass at OptLevel_1.
 // NOTE(review): diff listing without +/- markers — the AddPass line is presumably the pre-change form
 // and the Add line the post-change form of the same registration.
 PassManagerPtr GraphKernelOptimizer::PostProcess() const {
  auto pm = std::make_shared<GraphKernelPassManager>(2, "postprocess");
  // build akg and replace graph kernel nodes
  pm->AddPass(std::make_shared<KernelBuilder>(), OptLevel_1);
  pm->Add(std::make_shared<KernelBuilder>(), OptLevel_1);
  return pm;
 }

--- a/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.cc
+++ b/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.cc
@@ -23,26 +23,26 @@
 #include "debug/anf_ir_dump.h"

 namespace mindspore::graphkernel {
 // Registers `pass` with this manager and records, in enabled_, whether it is switched on.
 //   pass             - the pass to register; a null pointer is rejected by MS_EXCEPTION_IF_NULL.
 //   pass_level       - the pass is enabled by default only if flags_.opt_level >= pass_level.
 //   supported_device - when false the pass is disabled by default.
 // The default decision can be overridden by flags_.disable_pass / flags_.enable_pass, whose entries
 // are matched against "<stage_>.<pass index>" or "<name_>.<pass name>".
 // NOTE(review): diff listing without +/- markers. Both the AddPass and the Add signature are shown, and
 // the `pass_id` definition, two comments and the `passes_.push_back` call each appear twice (presumably
 // the pre- and post-change positions of the same statement).
 // NOTE(review): the header declaration names the third parameter `default_enable`, not `supported_device`.
 void GraphKernelPassManager::AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
 void GraphKernelPassManager::Add(const opt::PassPtr &pass, unsigned int pass_level, bool supported_device) {
  MS_EXCEPTION_IF_NULL(pass);
  // pass_id is read before the push_back below, so it is the index this pass will occupy in passes_.
  auto pass_id = passes_.size();
  auto pass_name = pass->name();
  auto pass_id = passes_.size();
  // Returns true when pass_list names this pass, either by numeric ids or by stage/pass names.
  auto pass_in_list = [this, pass_id, &pass_name](const std::vector<std::string> &pass_list) {
    // the config format can be "stage_id.pass_id" or "stage_name.pass_name"
    // config format can be "stage_id.pass_id" or "stage_name.pass_name"
    return std::find(pass_list.begin(), pass_list.end(),
                     std::to_string(this->stage_) + "." + std::to_string(pass_id)) != pass_list.end() ||
           std::find(pass_list.begin(), pass_list.end(), this->name_ + "." + pass_name) != pass_list.end();
  };
  // Default decision: the device must be supported and the configured opt level must reach pass_level.
  bool enable = supported_device && flags_.opt_level >= pass_level;
  if (enable) {
    // if it meets the condition to enable, check whether it's in the disabled list.
    // if it meets the condition to enable, check whether it's in the disabled pass list.
    enable = !pass_in_list(flags_.disable_pass);
  } else {
    // if it doesn't meet the condition to enable, check whether it's in the enabled list.
    // if it doesn't meet the condition to enable, check whether it's in the enabled pass list.
    enable = pass_in_list(flags_.enable_pass);
  }
  // passes_ and enabled_ are appended together; Run() reads them with the same index.
  passes_.push_back(pass);
  enabled_.push_back(enable);
  passes_.push_back(pass);
 }

 bool GraphKernelPassManager::RunPass(const FuncGraphPtr &func_graph, size_t pass_id, const opt::PassPtr &pass) const {
@@ -59,10 +59,10 @@ bool GraphKernelPassManager::Run(const FuncGraphPtr &func_graph) const {
  for (size_t i = 0; i < passes_.size(); i++) {
    if (enabled_[i]) {
      changed = RunPass(func_graph, i, passes_[i]) || changed;
      // dump ir to a graph_kernel subdir, and set a global id in front of the filename
      // dump ir to a graph_kernel subdir, and set a global id in front of the filenames
      std::ostringstream oss;
      static int g_id = 0;
      constexpr int id_length = 4;
      static int g_id = 0;
      oss << "graph_kernel/" << std::setfill('0') << std::setw(id_length) << g_id++ << "_"
          << GetPassFullname(i, passes_[i]);
      DumpPassIR(func_graph, oss.str());
--- a/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.h
+++ b/mindspore/ccsrc/common/graph_kernel/lite_adapter/graph_kernel_pass_manager.h
@@ -21,8 +21,8 @@
 #include <string>
 #include <memory>

 #include "include/common/utils/context/graph_kernel_flags.h"
 #include "backend/common/optimizer/pass_manager.h"
 #include "include/common/utils/context/graph_kernel_flags.h"

 namespace mindspore::graphkernel {
 using opt::PassManager;
@@ -32,10 +32,10 @@ class GraphKernelPassManager : public PassManager {
      : PassManager(name, true), stage_(stage), flags_(GraphKernelFlags::GetInstance()) {}
  ~GraphKernelPassManager() = default;

  // Add graph pass, the pass object will be freed when pass manager freed.
  virtual void AddPass(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);
  // Add graph pass for lite, the pass object will be freed when pass manager freed.
  void Add(const opt::PassPtr &pass, unsigned int pass_level, bool default_enable = true);

  // Run passes on the func_graph
  // Run passes for lite on the func_graph
  bool Run(const FuncGraphPtr &func_graph) const override;

 protected:
--- a/mindspore/ccsrc/common/graph_kernel/optimize_assign.cc
+++ b/mindspore/ccsrc/common/graph_kernel/optimize_assign.cc
@@ -178,7 +178,8 @@ bool ReplaceAssignByInplaceAssignInGraphkernel(const FuncGraphPtr &func_graph) {
    input_types.push_back(input_types.back());
    std::vector<std::string> output_formats = {input_formats.back()};
    std::vector<TypeId> output_types = {input_types.back()};
    auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types, cnode);
    auto graph_sel_info = BuildSelectKernelBuildInfo(input_formats, input_types, output_formats, output_types,
                                                     AnfAlgo::GetProcessor(cnode));
    AnfAlgo::SetSelectKernelBuildInfo(graph_sel_info, new_cnode.get());
    mng->Replace(cnode, new_cnode);
  }