|
|
|
@@ -37,13 +37,18 @@ |
|
|
|
namespace mindspore { |
|
|
|
namespace opt { |
|
|
|
namespace { |
|
|
|
std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node) { |
|
|
|
std::set<int64_t> GetUniqReduceAxes(const AnfNodePtr &node, bool is_ascend = false) { |
|
|
|
if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { |
|
|
|
MS_LOG(EXCEPTION) << "Only process for reduce sum!"; |
|
|
|
} |
|
|
|
|
|
|
|
auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex); |
|
|
|
auto src_shape_vec = GetShape(input); |
|
|
|
ShapeVector src_shape_vec; |
|
|
|
if (is_ascend) { |
|
|
|
src_shape_vec = GetDeviceShape(input); |
|
|
|
} else { |
|
|
|
src_shape_vec = GetShape(input); |
|
|
|
} |
|
|
|
auto axis_vec = GetReduceAxis(node); |
|
|
|
if (axis_vec.empty()) { |
|
|
|
for (size_t i = 0; i < src_shape_vec.size(); ++i) { |
|
|
|
@@ -140,7 +145,8 @@ bool AtomicAddChecker::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { |
|
|
|
// which means it should be in the output list. |
|
|
|
// 2. The reduce axis and reduce number should meet condition: |
|
|
|
// (GPU) all-reduce or reduce-x when fuse number is greater than or equal to 1024, or reduce-y. |
|
|
|
// (Ascend) all-reduce or non-reduce axes with dimension 1 |
|
|
|
// (Ascend) The first valid (dimension != 1) axis of the input data is a reduce axis, or the |
|
|
|
// non-reduce axes in front of the first reduce axis cannot make full use of multi-core. |
|
|
|
// 3. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation). |
|
|
|
|
|
|
|
// Rule 1. |
|
|
|
@@ -180,11 +186,46 @@ bool AtomicAddCheckerGPU::SuitableForAtomicAdd(const AnfNodePtr &node) { |
|
|
|
} |
|
|
|
|
|
|
|
bool AtomicAddCheckerAscend::SuitableForAtomicAdd(const AnfNodePtr &node) { |
|
|
|
auto dst_shape_vec = AnfAlgo::GetOutputDeviceShape(node, 0); |
|
|
|
auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex); |
|
|
|
|
|
|
|
// all reduce |
|
|
|
// non-reduce axes with dimension 1 |
|
|
|
return std::all_of(dst_shape_vec.cbegin(), dst_shape_vec.cend(), [](const size_t &dim) { return dim == 1; }); |
|
|
|
// Atomic addition is enabled only when the data type is fp32 |
|
|
|
auto type = AnfAlgo::GetOutputDeviceDataType(input, 0); |
|
|
|
if (type != kNumberTypeFloat32) { |
|
|
|
return false; |
|
|
|
} |
|
|
|
|
|
|
|
// If the first valid axis of the input data is the reduce axis, enable atomic addition |
|
|
|
auto src_shape_vec = GetDeviceShape(input); |
|
|
|
std::set<int64_t> reduce_axis_set = GetUniqReduceAxes(node, true); |
|
|
|
auto start_with_reduce = false; |
|
|
|
for (size_t i = 0; i < src_shape_vec.size(); ++i) { |
|
|
|
auto dim = src_shape_vec[i]; |
|
|
|
if (dim != 1) { |
|
|
|
if (reduce_axis_set.count(i)) { |
|
|
|
start_with_reduce = true; |
|
|
|
} |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
if (start_with_reduce) { |
|
|
|
return true; |
|
|
|
} |
|
|
|
|
|
|
|
// If the non-reduce axis cannot make full use of multi-core, enable atomic addition |
|
|
|
auto processor_core_num = 32; |
|
|
|
auto start_non_reduce_dim = 1; |
|
|
|
for (size_t i = 0; i < src_shape_vec.size(); ++i) { |
|
|
|
auto dim = src_shape_vec[i]; |
|
|
|
if (reduce_axis_set.count(i)) { |
|
|
|
break; |
|
|
|
} |
|
|
|
start_non_reduce_dim = start_non_reduce_dim * dim; |
|
|
|
} |
|
|
|
if (start_non_reduce_dim < processor_core_num) { |
|
|
|
return true; |
|
|
|
} |
|
|
|
|
|
|
|
return false; |
|
|
|
} |
|
|
|
|
|
|
|
void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { |
|
|
|
|