| @@ -37,13 +37,18 @@ | |||
| namespace mindspore { | |||
| namespace opt { | |||
| namespace { | |||
| std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node) { | |||
| std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = false) { | |||
| if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { | |||
| MS_LOG(EXCEPTION) << "Only process for reduce sum!"; | |||
| } | |||
| auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex); | |||
| auto src_shape_vec = GetShape(input); | |||
| ShapeVector src_shape_vec; | |||
| if (is_ascend) { | |||
| src_shape_vec = GetDeviceShape(input); | |||
| } else { | |||
| src_shape_vec = GetShape(input); | |||
| } | |||
| auto axis_vec = GetReduceAxis(node); | |||
| if (axis_vec.empty()) { | |||
| for (size_t i = 0; i < src_shape_vec.size(); ++i) { | |||
| @@ -140,7 +145,8 @@ bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { | |||
| // which means it should be in the output list. | |||
| // 2. The reduce axis and reduce number should meet condition: | |||
| // (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y. | |||
| // (Ascend) all-reduce or non-reduce axes with dimension 1 | |||
| // (Ascend) the first valid (non-unit) axis of the input data is a reduce axis, or the | |||
| // leading non-reduce axes are too small to make full use of the multi-core processor. | |||
| // 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation). | |||
| // Rule 1. | |||
| @@ -180,11 +186,46 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) { | |||
| } | |||
| bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) { | |||
| auto dst_shape_vec = AnfAlgo::GetOutputDeviceShape(node, 0); | |||
| auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex); | |||
| // all reduce | |||
| // non-reduce axes with dimension 1 | |||
| return std::all_of(dst_shape_vec.cbegin(), dst_shape_vec.cend(), [](const size_t &dim) { return dim == 1; }); | |||
| // Atomic addition is enabled only when the data type is fp32 | |||
| auto type = AnfAlgo::GetOutputDeviceDataType(input, 0); | |||
| if (type != kNumberTypeFloat32) { | |||
| return false; | |||
| } | |||
| // If the first valid axis of the input data is the reduce axis, enable atomic addition | |||
| auto src_shape_vec = GetDeviceShape(input); | |||
| std::set<int64_t> reduce_axis_set = GetUniqReduceAxes(node, true); | |||
| auto start_with_reduce = false; | |||
| for (size_t i = 0; i < src_shape_vec.size(); ++i) { | |||
| auto dim = src_shape_vec[i]; | |||
| if (dim != 1) { | |||
| if (reduce_axis_set.count(i)) { | |||
| start_with_reduce = true; | |||
| } | |||
| break; | |||
| } | |||
| } | |||
| if (start_with_reduce) { | |||
| return true; | |||
| } | |||
| // If the non-reduce axis cannot make full use of multi-core, enable atomic addition | |||
| auto processor_core_num = 32; | |||
| auto start_non_reduce_dim = 1; | |||
| for (size_t i = 0; i < src_shape_vec.size(); ++i) { | |||
| auto dim = src_shape_vec[i]; | |||
| if (reduce_axis_set.count(i)) { | |||
| break; | |||
| } | |||
| start_non_reduce_dim = start_non_reduce_dim * dim; | |||
| } | |||
| if (start_non_reduce_dim < processor_core_num) { | |||
| return true; | |||
| } | |||
| return false; | |||
| } | |||
| void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { | |||