diff --git a/akg b/akg index 6ffe9c2431..0a0338fecd 160000 --- a/akg +++ b/akg @@ -1 +1 @@ -Subproject commit 6ffe9c24319d7297d0feeb10ee2bd8135e24c5c8 +Subproject commit 0a0338fecd54c654c1992af156d41e036569343c diff --git a/mindspore/_extends/graph_kernel/expanders/gkdropout.py b/mindspore/_extends/graph_kernel/expanders/gkdropout.py index f340087131..d761681c9a 100644 --- a/mindspore/_extends/graph_kernel/expanders/gkdropout.py +++ b/mindspore/_extends/graph_kernel/expanders/gkdropout.py @@ -37,7 +37,9 @@ def expand_gkdropout(expand_info): keep_prob_v = graph_builder.value(input_x.dtype, keep_prob, "DefaultFormat") r_keep_prob = graph_builder.value(input_x.dtype, 1.0 / keep_prob, "DefaultFormat") - mask = graph_builder.emit('LessEqual', [input_mask, keep_prob_v]) + if input_mask.dtype != input_x.dtype: + input_mask = graph_builder.emit('Cast', [input_mask], attrs={'dst_type': input_x.dtype}) + mask = graph_builder.emit('LessEqual', [input_mask, keep_prob_v]) # output is bool type mask = graph_builder.emit('Cast', [mask], attrs={'dst_type': input_x.dtype}) # compute result diff --git a/mindspore/_extends/graph_kernel/model/graph_split.py b/mindspore/_extends/graph_kernel/model/graph_split.py index 48408859a9..b92f5f930f 100644 --- a/mindspore/_extends/graph_kernel/model/graph_split.py +++ b/mindspore/_extends/graph_kernel/model/graph_split.py @@ -16,7 +16,7 @@ from .model import PrimLib, Graph, Tensor -use_poly_reduce = False +use_poly_reduce = True class GraphSplitByPattern: """Graph splitter""" diff --git a/mindspore/_extends/graph_kernel/model/model_builder.py b/mindspore/_extends/graph_kernel/model/model_builder.py index 6c7f77fd84..21722e9439 100644 --- a/mindspore/_extends/graph_kernel/model/model_builder.py +++ b/mindspore/_extends/graph_kernel/model/model_builder.py @@ -204,6 +204,16 @@ class CompositeGraph: def load(self, desc): """Load Graph from json""" def _attr_of(op, inputs, output): + def _get_axis_while_none(input_shape, output_shape): + red_axis = [] + if len(output_shape) == len(input_shape): + for s, i in enumerate(output_shape): + if s == 1 and input_shape[i] > 1: + red_axis.append(i) + else: + red_axis = list(range(len(output_shape))) + return red_axis + attr = {} if op['name'] not in ('ReduceSum', 'ReduceMax', 'ReduceMin'): return attr @@ -211,10 +221,7 @@ class CompositeGraph: if a['name'] == 'axis': red_axis, dim_size = [], len(inputs[0].shape) if not a['value']: - assert len(output.shape) == len(inputs[0].shape) - for i in range(len(output.shape)): - if output.shape[i] == 1 and inputs[0].shape[i] > 1: - red_axis.append(i) + red_axis = _get_axis_while_none(inputs[0].shape, output.shape) else: if isinstance(a['value'], int): a['value'] = [a['value']] diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc index 17cfedc989..2e7c594e09 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/akg_kernel_json_generator.cc @@ -244,7 +244,11 @@ bool AkgKernelJsonGenerator::CreateOutputDescJson(const AnfNodePtr &anf_node, co output_json[kJsonKeyFormat] = this->GetOutputFormat(anf_node, i); output_json[kJsonKeyName] = output_name; output_json[kJsonKeyTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); - output_json[kJsonKeyShape] = this->GetOutputShape(anf_node, i); + auto output_shape = this->GetOutputShape(anf_node, i); + if (output_shape.empty()) { + 
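+      // A rank-0 (scalar) output comes back with an empty shape vector; describe it as [1] so the
+      // kernel json handed to akg always carries a non-empty shape.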
output_shape.push_back(1); + } + output_json[kJsonKeyShape] = output_shape; outputs_json->push_back(output_json); } return true; @@ -680,7 +684,11 @@ nlohmann::json AkgKernelJsonGenerator::CreateInputsJson(const std::vectorGetInputFormat(tmp_input.first, tmp_input.second.first); - input_desc_json[kJsonKeyShape] = this->GetInputShape(tmp_input.first, tmp_input.second.first); + auto input_shape = this->GetInputShape(tmp_input.first, tmp_input.second.first); + if (input_shape.empty()) { + input_shape.push_back(1); + } + input_desc_json[kJsonKeyShape] = input_shape; inputs_json.emplace_back(std::vector{input_desc_json}); } return inputs_json; diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc new file mode 100644 index 0000000000..ecc590f557 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.cc @@ -0,0 +1,505 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "base/core_ops.h" +#include "ir/tensor.h" +#include "utils/utils.h" +#include "utils/log_adapter.h" +#include "backend/kernel_compiler/kernel.h" +#include "backend/optimizer/graph_kernel/graph_kernel_helper.h" +#include "backend/session/anf_runtime_algorithm.h" +#include "backend/session/kernel_graph.h" +#include "debug/anf_ir_dump.h" + +namespace mindspore { +namespace opt { +namespace { +bool SuitableForAtomicAdd(const AnfNodePtr &node) { + if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { + MS_LOG(EXCEPTION) << "Only process for reduce sum!"; + } + + auto input = node->cast()->input(kFirstDataInputIndex); + auto src_shape_vec = GetShape(input); + auto axis_vec = GetReduceAxis(node); + if (axis_vec.empty()) { + for (size_t i = 0; i < src_shape_vec.size(); ++i) { + axis_vec.push_back(i); + } + } else { + std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), + [&src_shape_vec](int64_t axis) -> int64_t { return axis < 0 ? axis + src_shape_vec.size() : axis; }); + } + + std::set axis_set(axis_vec.begin(), axis_vec.end()); + + // For reduce whose last dim is reduced (including all-reduce), + // it is suitable for atomic add only the reduce num is greater than or equal to 1024. + if (axis_set.count(src_shape_vec.size() - 1) != 0) { + size_t reduce_size = + std::accumulate(axis_set.begin(), axis_set.end(), 1, + [&src_shape_vec](size_t size, int64_t axis) { return size * src_shape_vec[axis]; }); + return reduce_size >= 1024; + } + + // For reduce whose last dim is not reduced, always true. 
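+  // e.g. a (2, 3, 4, 3) input reduced over axis 0 keeps its last dim and is always accepted here,
+  // while a (16, 128, 8) input reduced over axes (0, 2) touches the last dim with a reduce size of
+  // 16 * 8 = 128 < 1024 and is rejected by the branch above.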
+ return true; +} + +bool HaveReduceInPredecessors(const AnfNodePtr &node) { + std::stack st; + st.push(node); + while (!st.empty()) { + auto n = st.top(); + st.pop(); + + if (n != node) { + if (!n->isa()) { + continue; + } + if (IsPrimitiveCNode(n, prim::kPrimReduceSum)) { + return true; + } + } + + auto n_inputs = n->cast()->inputs(); + std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); }); + } + + return false; +} + +inline int64_t CalNewIndex(int64_t old_index, int64_t reduce_index) { + return old_index - (old_index > reduce_index ? 1 : 0); +} +} // namespace + +bool AtomicCleanInsertter::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { + auto node = anf_node->cast(); + MS_EXCEPTION_IF_NULL(node); + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); + auto mng_sub = sub_graph->manager(); + if (mng_sub == nullptr) { + mng_sub = Manage(sub_graph, false); + sub_graph->set_manager(mng_sub); + } + + // Rules to activate atomic add: + // 1. ReduceSum should not fuse any other ops in out direction, which mean it should be in output list. + // 2. only one ReduceSum in output list. + // 3. The reduce axis and reduce number should meet condition (all-reduce or reduce-x when fuse number is greater than + // or equal to 1024, or reduce-y). + // 4. No other ReduceSum as output ReduceSum's predecessors (reduce compile limitation). + + // Rule 2. + auto real_return_node = sub_graph->get_return()->input(kFirstDataInputIndex); + if (IsPrimitiveCNode(real_return_node, prim::kPrimMakeTuple)) { + AnfNodePtrList reduce_ops; + size_t reduce_cnt = 0; + const auto &inputs = real_return_node->cast()->inputs(); + for (size_t i = 1; i < inputs.size(); ++i) { + if (IsPrimitiveCNode(inputs[i], prim::kPrimReduceSum)) { + atomic_add_node_ = inputs[i]->cast(); + reduce_real_output_index_ = i - 1; + reduce_cnt++; + } + } + + if (reduce_cnt != 1) { + return false; + } + } else if (IsPrimitiveCNode(real_return_node, prim::kPrimReduceSum)) { + atomic_add_node_ = real_return_node->cast(); + } else { + return false; + } + + // Rule 1. + if (mng_sub->node_users()[atomic_add_node_].size() > 1) { + return false; + } + + // Rule 3 and 4. + if (!SuitableForAtomicAdd(atomic_add_node_) || HaveReduceInPredecessors(atomic_add_node_)) { + return false; + } + + return true; +} + +void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { + // Change kernel build info. 
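+  // The composite kernel gains one extra input (the zero-filled clean buffer) and, when it had several
+  // outputs, drops the ReduceSum output, so rebuild the input/output formats and device types to match.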
+ auto kernel_info = static_cast(composite_node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + const auto &origin_kernel_build_info = kernel_info->GetMutableSelectKernelBuildInfo(); + auto origin_inputs_format = origin_kernel_build_info->GetAllInputFormats(); + auto origin_outputs_format = origin_kernel_build_info->GetAllOutputFormats(); + auto origin_inputs_type = origin_kernel_build_info->GetAllInputDeviceTypes(); + auto origin_outputs_type = origin_kernel_build_info->GetAllOutputDeviceTypes(); + auto origin_processor = origin_kernel_build_info->processor(); + + std::vector &new_inputs_format = origin_inputs_format; + std::vector &new_inputs_type = origin_inputs_type; + std::vector new_outputs_format; + std::vector new_outputs_type; + for (size_t i = 0; i < origin_outputs_format.size(); ++i) { + if (real_output_num_ > 1 && i == reduce_real_output_index_) { + continue; + } + new_outputs_format.push_back(origin_outputs_format[i]); + new_outputs_type.push_back(origin_outputs_type[i]); + } + + auto kernel_with_index = AnfAlgo::VisitKernel(new_input, 0); + new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second)); + new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); + + kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; + new_info_builder.SetInputsFormat(new_inputs_format); + new_info_builder.SetInputsDeviceType(new_inputs_type); + new_info_builder.SetOutputsFormat(new_outputs_format); + new_info_builder.SetOutputsDeviceType(new_outputs_type); + new_info_builder.SetProcessor(origin_processor); + new_info_builder.SetKernelType(KernelType::AKG_KERNEL); + new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto new_selected_info = new_info_builder.Build(); + AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); +} + +void AtomicCleanInsertter::CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, + const AnfNodePtr &new_parameter) { + // add inplaceassign + AnfNodePtr out_node; + bool fake_out = false; + size_t replace_index = 0; + auto retrun_node = sub_graph->get_return()->input(kFirstDataInputIndex); + if (IsPrimitiveCNode(retrun_node, prim::kPrimMakeTuple)) { + const auto &outs = retrun_node->cast()->inputs(); + real_output_num_ = outs.size() - 1; + for (size_t i = 1; i < outs.size(); ++i) { + if (i != reduce_real_output_index_ + 1) { + out_node = outs[i]; + replace_index = i; + break; + } + } + } else { + real_output_num_ = 1; + out_node = atomic_add_node_; // Use result data itself, and set attr "fake_out" true. 
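+    // With a single output there is nothing else to return: reuse the reduce result as a placeholder
+    // output of InplaceAssign and mark it fake; the real result lives in the atomically updated buffer.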
+ fake_out = true; + } + + auto inplace_assign_node = + CreateCNode({NewValueNode(std::make_shared("InplaceAssign")), new_parameter, atomic_add_node_, out_node}, + sub_graph, {.format = GetFormat(out_node), .shape = GetShape(out_node), .type = GetType(out_node)}); + AnfAlgo::SetNodeAttr("fake_output", MakeValue(fake_out), inplace_assign_node); + + CNodePtr new_out_node; + if (real_output_num_ > 2) { + std::vector output_args = {NewValueNode(prim::kPrimMakeTuple)}; + const auto &outs = retrun_node->cast()->inputs(); + for (size_t i = 1; i < outs.size(); ++i) { + if (i == reduce_real_output_index_ + 1) { + continue; + } else if (i == replace_index) { + output_args.push_back(inplace_assign_node); + } else { + output_args.push_back(outs[i]); + } + } + // Set output for AnfGraph + new_out_node = sub_graph->NewCNode(output_args); + } else { + new_out_node = inplace_assign_node; + } + sub_graph->set_output(new_out_node); +} + +void AtomicCleanInsertter::CorrectAbstract(const AnfNodePtr &composite_node) { + // If there is only one output(ReduceSum), it should be a fake output with the same abstract with origin output. + if (real_output_num_ <= 1) { + return; + } + + // Change abstract. + auto origin_out_spec = composite_node->abstract()->cast(); + MS_EXCEPTION_IF_NULL(origin_out_spec); + const auto &origin_out_specs = origin_out_spec->elements(); + AbstractBasePtrList new_out_specs; + for (size_t i = 0; i < origin_out_specs.size(); ++i) { + if (i != reduce_real_output_index_) { + new_out_specs.push_back(origin_out_specs[i]); + } + } + composite_node->set_abstract(std::make_shared(new_out_specs)); +} + +void AtomicCleanInsertter::ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, + const FuncGraphManagerPtr &mng) { + auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(composite_node); + auto mng_sub = sub_graph->manager(); + if (mng_sub == nullptr) { + mng_sub = Manage(sub_graph, false); + sub_graph->set_manager(mng_sub); + } + + // Add atomic attribute to reducesum node. + AnfAlgo::SetNodeAttr("enable_atomic_add", MakeValue(true), atomic_add_node_); + + // add input + auto inputs = composite_node->cast()->inputs(); + inputs.push_back(new_input); + composite_node->cast()->set_inputs(inputs); + + // add parameter + auto parameter = sub_graph->add_parameter(); + parameter->set_abstract(new_input->abstract()); + parameter->set_kernel_info(new_input->kernel_info_ptr()); + + CreateInplaceAssignNodeAndCorrectReturn(sub_graph, parameter); + + CorrectAbstract(composite_node); + CorrectKernelBuildInfo(composite_node, new_input); + + auto old_graph_name = GetValue(sub_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); + auto new_graph_name = ExtractGraphKernelName(TopoSort(sub_graph->get_return()), "", "atomic_add"); + sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(new_graph_name)); + MS_LOG(INFO) << "Convert " << old_graph_name << " to atomic add graph " << new_graph_name; +} + +void AtomicCleanInsertter::AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, + const AnfNodePtr &composite_node, const AnfNodePtr &user_node, int index) { + // Create depend node to hold new control depend node. 
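+  // Depend(clean_node, composite_node) keeps the user reading the clean buffer while still forcing it
+  // to be scheduled after the composite (atomic add) kernel.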
+ AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), clean_node, composite_node}; + auto depend_cnode = main_graph->NewCNode(d_inputs); + depend_cnode->set_abstract(clean_node->abstract()); + main_graph->AddNode(depend_cnode); + + auto user_cnode = user_node->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, depend_cnode); +} + +void AtomicCleanInsertter::AddControlDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &pre_node, + const AnfNodePtr &post_node, const FuncGraphManagerPtr &mng) { + // Collect use dependencies firstly. + auto post_users = mng->node_users()[post_node]; + + // Create control depend, first input is composite op, second is user + AnfNodePtrList cd_inputs = {NewValueNode(prim::kPrimControlDepend), pre_node, post_node}; + auto control_depend_cnode = main_graph->NewCNode(cd_inputs); + main_graph->AddNode(control_depend_cnode); + + // Create depend node to hold new control depend node. + AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), post_node, control_depend_cnode}; + auto depend_cnode = main_graph->NewCNode(d_inputs); + depend_cnode->set_abstract(post_node->abstract()); + main_graph->AddNode(depend_cnode); + + for (const auto &[user_node, index] : post_users) { + auto user_cnode = user_node->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, depend_cnode); + } +} + +CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type) { + std::set data_support = {kNumberTypeFloat16, kNumberTypeFloat32, kNumberTypeFloat64}; + + if (!std::any_of(data_support.cbegin(), data_support.cend(), [&dst_type](TypeId type) { return dst_type == type; })) { + MS_LOG(EXCEPTION) << "Atomic add not support data type " << dst_type; + } + + // Create zero value which will be broadcast to target shape. + auto format = GetFormat(atomic_add_node_); + auto dtype = (dst_type == kNumberTypeFloat16) ? kNumberTypeFloat32 : dst_type; + ValueNodePtr value_node; + if (dtype == kNumberTypeFloat32) { + value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, + static_cast(0), sizeof(float)); + } else { + value_node = CreateScalarTensorValueNode({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, + static_cast(0), sizeof(double)); + } + + // Create composite op's sub-graph. + auto new_sub_graph = std::make_shared(); + auto parameter = new_sub_graph->add_parameter(); + parameter->set_abstract(value_node->abstract()); + parameter->set_kernel_info(value_node->kernel_info_ptr()); + + AnfNodePtr broadcast_input_node = parameter; + if (dst_type == kNumberTypeFloat16) { + AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), parameter}; + auto cast_node_inner = + CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)}); + AnfAlgo::SetNodeAttr("dst_type", MakeValue("float32"), cast_node_inner); + broadcast_input_node = cast_node_inner; + } + + // Create broadcast basic op. + auto dst_shape_vec = GetShape(atomic_add_node_); + if (dst_shape_vec.empty()) { + dst_shape_vec.push_back(1); + } + AnfNodePtrList atomic_clean_inputs = {NewValueNode(std::make_shared(kBroadcastToOpName)), + broadcast_input_node}; + auto broadcast_to_node_inner = CreateCNode( + atomic_clean_inputs, new_sub_graph, {.format = format, .shape = dst_shape_vec, .type = GetType(atomic_add_node_)}); + AnfAlgo::SetNodeAttr("shape", MakeValue(dst_shape_vec), broadcast_to_node_inner); + + // Makeup sub-graph. 
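+  // The sub-graph broadcasts a zero scalar (cast to the target type when needed) to the reduce output's
+  // shape, zero-filling the buffer that the atomic adds later accumulate onto.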
+ new_sub_graph->set_output(broadcast_to_node_inner); + auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph), value_node}); + broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract()); + SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {value_node}, {broadcast_to_node_inner}, + kernel::Processor::CUDA); + auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean"); + new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr)); + // mng->AddFuncGraph(new_sub_graph); + + return broadcast_to_composite_node; +} + +void AtomicCleanInsertter::ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, + const AnfNodePtr &broadcast_to_node, const FuncGraphManagerPtr &mng) { + // 1. find users, change getitem index if needed. + std::vector > reduce_user_nodes; + if (real_output_num_ <= 1) { + auto users = mng->node_users()[composite_node]; + std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes), + [](const std::pair &pair) { return pair; }); + } else { + std::vector > getitem_user_nodes; + auto users = mng->node_users()[composite_node]; + for (const auto &node_index : users) { + const auto &user_node = node_index.first; + if (!IsPrimitiveCNode(user_node, prim::kPrimTupleGetItem)) { + continue; + } + auto get_item_cnode = user_node->cast(); + auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); + MS_EXCEPTION_IF_NULL(value_input); + auto value_node = value_input->cast(); + MS_EXCEPTION_IF_NULL(value_node); + auto item_idx = GetValue(value_node->value()); + if (item_idx == static_cast(reduce_real_output_index_)) { + getitem_user_nodes.push_back(node_index); + } else { + if (real_output_num_ > 2) { + // Recorrect other getitem index. + int64_t new_item_idx = CalNewIndex(item_idx, reduce_real_output_index_); + AnfNodePtrList new_inputs = {NewValueNode(prim::kPrimTupleGetItem), composite_node, + NewValueNode(new_item_idx)}; + auto new_out = main_graph->NewCNode(new_inputs); + new_out->set_abstract(get_item_cnode->abstract()); + for (const auto &[user, index] : mng->node_users()[get_item_cnode]) { + auto user_cnode = user->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, new_out); + } + } else { + for (const auto &[user, index] : mng->node_users()[node_index.first]) { + auto user_cnode = user->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, composite_node); + } + } + } + } + + for (auto &pair : getitem_user_nodes) { + // dirctory to find real user. + auto real_users = mng->node_users()[pair.first]; + reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end()); + } + } + + for (const auto &[user_node, index] : reduce_user_nodes) { + // 2. set ac output as user's input. + auto user_cnode = user_node->cast(); + MS_EXCEPTION_IF_NULL(user_cnode); + user_cnode->set_input(index, broadcast_to_node); + // mng->SetEdge(user_node, index, broadcast_to_node); + // 3. Make sure modified composite node running first. + // * To not change the origin node's dependency relation, add ControlDepend and Depend node. + // * For Return node and output node, ControlDepend node will change the order of these two node, which will may + // main graph running failed. So only add Depend node to meet the need of execute order. 
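+    // i.e. for the Return node / graph output, a ControlDepend could reorder the output and Return and
+    // break the main graph, so only a Depend is inserted on that path.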
+ if (IsPrimitiveCNode(user_node, prim::kPrimReturn) || user_node == main_graph->output()) { + AddDepend(main_graph, broadcast_to_node, composite_node, user_node, index); + } else { + AddControlDepend(main_graph, composite_node, user_node, mng); + } + } +} + +void AtomicCleanInsertter::InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, + const FuncGraphManagerPtr &mng) { + auto origin_composite_node = anf_node->cast(); + MS_EXCEPTION_IF_NULL(origin_composite_node); + + // Create broadcst node. + auto out_type = GetType(atomic_add_node_)->cast(); + MS_EXCEPTION_IF_NULL(out_type); + auto broadcast_to_node = CreateAtomicCleanCompositeNode(main_graph, out_type->element()->type_id()); + + // Insert extra input(broadcast node output) to composite node, and make Reducesum inplaceassign to it. + // Note: if it's single output, this will increase total memory because of a fake out. + ProcessOriginCNode(origin_composite_node, broadcast_to_node, mng); + + // Replace origin ReduceSum's user with atomic clean output, and add control depend from composite op to user. + ProcessOriginCNodeUser(main_graph, origin_composite_node, broadcast_to_node, mng); +} + +bool AtomicCleanInsertter::Run(const FuncGraphPtr &func_graph) { + auto kernel_graph = std::dynamic_pointer_cast(func_graph); + MS_EXCEPTION_IF_NULL(kernel_graph); + auto mng = kernel_graph->manager(); + if (mng == nullptr) { + mng = Manage(kernel_graph, true); + kernel_graph->set_manager(mng); + } + + bool changed = false; + auto topo_nodes = TopoSort(kernel_graph->get_return()); + for (const auto &node : topo_nodes) { + if (!AnfAlgo::IsGraphKernel(node) || !CanActivateAtomicAdd(node)) { + continue; + } + InsertAtomicClean(kernel_graph, node, mng); + changed = true; + } + + if (changed) { + mng->RemoveRoots(); + mng->KeepRoots({func_graph}); + } + + return changed; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h new file mode 100644 index 0000000000..731505e7db --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/add_atomic_clean_gpu.h @@ -0,0 +1,57 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ + +#include +#include +#include "backend/optimizer/common/optimizer.h" +#include "backend/session/kernel_graph.h" + +namespace mindspore { +namespace opt { +class AtomicCleanInsertter : public Pass { + public: + AtomicCleanInsertter() : Pass("atomic_clean") {} + ~AtomicCleanInsertter() override = default; + bool Run(const FuncGraphPtr &func_graph) override; + + private: + void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, + const FuncGraphManagerPtr &mng); + bool CanActivateAtomicAdd(const AnfNodePtr &anf_node); + void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng); + void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node, + const AnfNodePtr &user_node, int index); + void AddControlDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &pre_node, const AnfNodePtr &post_node, + const FuncGraphManagerPtr &mng); + void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter); + void CorrectAbstract(const AnfNodePtr &composite_node); + void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input); + CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type); + void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, + const AnfNodePtr &broadcast_to_node, const FuncGraphManagerPtr &mng); + + CNodePtr atomic_add_node_{nullptr}; + size_t reduce_real_output_index_{0}; + size_t real_output_num_{0}; +}; +using AtomicCleanInsertterPtr = std::shared_ptr; +} // namespace opt +} // namespace mindspore + +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cse.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cse.cc index 97f38f6587..b401d1fa44 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cse.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_cse.cc @@ -30,7 +30,9 @@ bool IsCNodePrimitveEqual(const CNodePtr &main, const CNodePtr &node) { auto main_primitive = AnfAlgo::GetCNodePrimitive(main); auto node_primitive = AnfAlgo::GetCNodePrimitive(node); if (main_primitive != nullptr && node_primitive != nullptr) { - if (main_primitive->name() != node_primitive->name()) { + // Some ops such as Reshape is not real op, cse these type will not get gain. And for ops fusion, keep these op + // alone can prevent some redundant output case (input -> reshape -> output). 
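+    // e.g. if two identical Reshape nodes feeding different fusion regions were merged, the region that
+    // absorbs the surviving Reshape would have to expose it as an extra output for the other user.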
+ if (main_primitive->name() != node_primitive->name() || IsPrimitiveCNode(node, prim::kPrimReshape)) { return false; } diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc index 60c0e77054..c365943dec 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.cc @@ -908,5 +908,126 @@ void ReplaceNewFuseCNodeForDependPrior(std::multimapinsert(item); } } + +std::string GetFormat(const AnfNodePtr &node) { + auto kernel_info = static_cast(node->kernel_info()); + MS_EXCEPTION_IF_NULL(kernel_info); + auto kernel_build_info = kernel_info->select_kernel_build_info(); + MS_EXCEPTION_IF_NULL(kernel_build_info); + return kernel_build_info->GetOutputFormat(0); +} + +TypePtr GetType(const AnfNodePtr &node) { + const auto &abstract = node->abstract(); + auto type = abstract->BuildType(); + MS_EXCEPTION_IF_NULL(type); + return type; +} + +ShapeVector GetShape(const AnfNodePtr &node) { + auto abstract = node->abstract(); + MS_EXCEPTION_IF_NULL(abstract); + auto shape = abstract->GetShapeTrack(); + if (shape == nullptr || !shape->isa()) { + MS_LOG(EXCEPTION) << "Cannot get shape from " << node->fullname_with_scope(); + } + return shape->cast()->shape(); +} + +std::vector GetReduceAxis(const AnfNodePtr &node) { + auto prim = GetCNodePrimitive(node); + MS_EXCEPTION_IF_NULL(prim); + const auto &attrs = prim->attrs(); + auto iter = attrs.find("axis"); + if (iter == attrs.end()) { + MS_LOG(EXCEPTION) << "Origin node have no attributes!"; + } + + std::vector axis; + + auto &v = iter->second; + if (v->isa() || v->isa()) { + auto vec = v->isa() ? v->cast()->value() : v->cast()->value(); + for (auto value : vec) { + if (value->isa()) { + axis.push_back(GetValue(value)); + } else { + MS_LOG(EXCEPTION) << "Reduce axis type should be int64!"; + } + } + } else if (v->isa()) { + axis.push_back(GetValue(v)); + } else { + MS_LOG(EXCEPTION) << "Reduce axis should be a list or tuple!"; + } + + return axis; +} + +CNodePtr CreateCNode(const std::vector &inputs, const FuncGraphPtr &func_graph, const DataInfo &out_info) { + // Limitation: 1. Node's attributes should be set out of this function; 2. only one output. + MS_EXCEPTION_IF_NULL(out_info.type); + auto out_type = out_info.type; + if (auto otype = out_info.type->cast(); otype != nullptr) { + out_type = otype->element(); + } + + // Create CNode. + auto cnode = func_graph->NewCNode(inputs); + MS_EXCEPTION_IF_NULL(cnode); + + // Setup abstract. + auto abs_tensor = std::make_shared(out_type, out_info.shape); + cnode->set_abstract(abs_tensor); + + // Setup kernel info. 
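+  // Nodes built here sit inside a graph-kernel sub-graph and do not go through the normal kernel
+  // selection flow, so fill the kernel info by hand: mark the feature-map flags, then build an
+  // AKG/CUDA kernel build info from the real inputs' output formats and device types.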
+ auto kernel_info = std::make_shared(); + cnode->set_kernel_info(kernel_info); + std::vector feature_map_input_indexs; + kernel_info->set_feature_map_flag(false); + for (size_t i = 1; i < inputs.size(); ++i) { + if (AnfAlgo::IsFeatureMapOutput(inputs[i])) { + kernel_info->set_feature_map_flag(true); + feature_map_input_indexs.push_back(i); + } + } + if (inputs.size() == 1) { + kernel_info->set_feature_map_flag(true); + } + if (AnfAlgo::IsRealKernel(cnode)) { + // if the node only has the primitive(such as getNext) or the node's input has a feature map input + // then the node's output is a feature map output + AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); + AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); + } + + // Setup kernel build info. + std::vector input_formats; + std::vector input_types; + for (size_t i = 1; i < inputs.size(); ++i) { + auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); + auto input_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); + input_formats.push_back(input_format); + auto input_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); + input_types.push_back(input_type); + } + + std::vector output_formats = {out_info.format}; + std::vector output_types = {out_type->type_id()}; + + kernel::KernelBuildInfo::KernelBuildInfoBuilder info_builder; + info_builder.SetInputsFormat(input_formats); + info_builder.SetInputsDeviceType(input_types); + info_builder.SetOutputsFormat(output_formats); + info_builder.SetOutputsDeviceType(output_types); + info_builder.SetProcessor(kernel::Processor::CUDA); + info_builder.SetKernelType(KernelType::AKG_KERNEL); + info_builder.SetFusionType(kernel::FusionType::OPAQUE); + auto selected_info = info_builder.Build(); + AnfAlgo::SetSelectKernelBuildInfo(selected_info, cnode.get()); + + func_graph->AddNode(cnode); + return cnode; +} } // namespace opt } // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h index 2fda42c07c..30974acb95 100644 --- a/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h +++ b/mindspore/ccsrc/backend/optimizer/graph_kernel/graph_kernel_helper.h @@ -27,6 +27,7 @@ #include "ir/anf.h" #include "ir/func_graph.h" #include "ir/primitive.h" +#include "backend/session/anf_runtime_algorithm.h" #include "backend/session/kernel_graph.h" #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h" #include @@ -38,6 +39,8 @@ inline const PrimitivePtr kPrimGkDropout = std::make_shared("GkDropou namespace opt { using kernel::DumpOption; +constexpr auto kIsFeatureMapOutput = "IsFeatureMapOutput"; +constexpr auto kIsFeatureMapInputList = "IsFeatureMapInputList"; constexpr auto kGraphKernelModule = "mindspore._extends.graph_kernel"; constexpr auto kGraphKernelSplitFunc = "split_with_json"; constexpr auto kGetGraphKernelOpExpander = "get_op_expander"; @@ -45,6 +48,12 @@ constexpr auto kJsonKeyMultiGraph = "multi_graph"; constexpr auto kJsonKeyGraphDesc = "graph_desc"; constexpr auto kJsonKeyGraphMode = "graph_mode"; +struct DataInfo { + std::string format{kOpFormat_DEFAULT}; + ShapeVector shape{1}; + TypePtr type{nullptr}; +}; + bool ConvertNonscalarTensorToParameter(const FuncGraphPtr &fg, AnfNodePtrList *inputs_ptr); std::tuple MixedNodesTransToGraph(const AnfNodePtrList &fuse_nodes, AnfNodePtrList *src_outputs = 
nullptr); @@ -74,6 +83,49 @@ void UpdateControlDependNode(std::multimap> *depend_prior, const AnfNodePtr &new_fuse_cnode, const AnfNodePtrList &outputs); + +std::string GetFormat(const AnfNodePtr &node); +TypePtr GetType(const AnfNodePtr &node); +ShapeVector GetShape(const AnfNodePtr &node); +std::vector GetReduceAxis(const AnfNodePtr &node); + +CNodePtr CreateCNode(const std::vector &inputs, const FuncGraphPtr &func_graph, const DataInfo &out_info); + +template +ValueNodePtr CreateScalarTensorValueNode(const DataInfo &info, T value, size_t data_length) { + // Create tensor value. + if (info.shape.size() != 1 && info.shape[0] != 1) { + MS_LOG(EXCEPTION) << "Only support create scalar tensor value node!!!"; + } + + if (info.type == nullptr) { + MS_LOG(EXCEPTION) << "Data type is needed!!!"; + } + + tensor::TensorPtr tensor = std::make_shared(info.type->type_id(), info.shape); + MS_EXCEPTION_IF_NULL(tensor); + tensor::DeviceInfo device_info{info.format, info.type}; + tensor->set_device_info(device_info); + auto data_ptr = tensor->data_c(); + MS_EXCEPTION_IF_NULL(data_ptr); + auto ret_code = memcpy_s(data_ptr, static_cast(tensor->data().nbytes()), &value, data_length); + if (ret_code != 0) { + MS_LOG(EXCEPTION) << "Failed to copy data into scalar tensor."; + } + + // Create value node. + ValueNodePtr new_value_node = std::make_shared(tensor); + new_value_node->set_abstract(tensor->ToAbstract()); + auto kernel_info = std::make_shared(); + new_value_node->set_kernel_info(kernel_info); + auto kernel_build_info_builder = std::make_shared(); + kernel_build_info_builder->SetOutputsFormat(std::vector{info.format}); + std::vector types = {info.type->type_id()}; + kernel_build_info_builder->SetOutputsDeviceType(types); + AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); + + return new_value_node; +} } // namespace opt } // namespace mindspore #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_ diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index cfd2238347..cda7a03848 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -35,6 +35,7 @@ #include "backend/optimizer/gpu/remove_format_transform_pair.h" #include "backend/optimizer/gpu/remove_redundant_format_transform.h" #include "backend/optimizer/gpu/reduce_precision_fusion.h" +#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" #include "backend/optimizer/graph_kernel/composite_ops_fusion.h" @@ -176,6 +177,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr &kernel_ // After Simplify and Splitter, a lot of redundant getitem/maketuple // will be exposed, use GetitemTuple Pass to delete them. pm->AddPass(std::make_shared()); + pm->AddPass(std::make_shared()); pm->AddPass(std::make_shared()); optimizer->AddPassManager(pm); (void)optimizer->Optimize(kernel_graph); diff --git a/tests/st/ops/graph_kernel/test_atomic_add.py b/tests/st/ops/graph_kernel/test_atomic_add.py new file mode 100644 index 0000000000..6f3cd8c93c --- /dev/null +++ b/tests/st/ops/graph_kernel/test_atomic_add.py @@ -0,0 +1,124 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ + +import numpy as np +import pytest +import mindspore.context as context +from mindspore import Tensor +from mindspore.nn import Cell +import mindspore.ops.operations as P + + +class SumOutNet(Cell): + def __init__(self): + super(SumOutNet, self).__init__() + self.square = P.Square() + self.sum = P.ReduceSum() + + def construct(self, x): + mul_res = self.square(x) + return self.sum(mul_res, (0,)) + + +class SingleOutNet(Cell): + def __init__(self): + super(SingleOutNet, self).__init__() + self.add = P.TensorAdd() + self.mul = P.Mul() + self.sum = P.ReduceSum() + + def construct(self, x, y): + mul_res = self.mul(x, y) + sum_res = self.sum(mul_res, ()) + return self.add(sum_res, x) + + +class MultiOutNet(Cell): + def __init__(self): + super(MultiOutNet, self).__init__() + self.add = P.TensorAdd() + self.mul = P.Mul() + self.sum = P.ReduceSum() + + def construct(self, x, y): + add_res = self.add(x, y) + mul_res = self.mul(add_res, add_res) + sum_res = self.sum(mul_res, ()) + return self.add(add_res, sum_res) + + +def atomic_add_sum_output(): + np.random.seed(0) + input_x = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32) + + expect = np.sum(np.square(input_x), axis=(0,)) + + net = SumOutNet() + result = net(Tensor(input_x)) + + res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) + assert res + + +def atomic_add_single_output(): + np.random.seed(0) + input_x = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) + input_y = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) + + expect = np.sum(input_x * input_y) + input_x + + net = SingleOutNet() + result = net(Tensor(input_x), Tensor(input_y)) + + res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) + assert res + + +def atomic_add_multi_output(): + np.random.seed(0) + input_x = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) + input_y = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) + + expect = np.sum(np.square(input_x + input_y)) + (input_x + input_y) + + net = MultiOutNet() + result = net(Tensor(input_x), Tensor(input_y)) + + res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) + assert res + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_atomic_add_sum_output_gpu(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") + atomic_add_sum_output() + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_atomic_add_single_output_gpu(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") + atomic_add_single_output() + + +@pytest.mark.level0 +@pytest.mark.platform_x86_gpu_training +@pytest.mark.env_onecard +def test_atomic_add_multi_output_gpu(): + context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") + atomic_add_multi_output()
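Editorial note on what these tests exercise (an illustration, not part of the patch): the atomic-add
rewrite lets several thread blocks accumulate partial sums of one large ReduceSum into a shared output
buffer, and the inserted "atomic clean" kernel zero-fills that buffer before the accumulation starts.
The NumPy sketch below mirrors that contract; the helper name is hypothetical and it is not MindSpore API.

    import numpy as np

    def atomic_style_sum(data, num_blocks=4):
        # Hypothetical helper: sum `data` the way the rewritten kernel does.
        out = np.zeros((), dtype=data.dtype)  # the freshly "cleaned" (zero-filled) output buffer
        for chunk in np.array_split(data.ravel(), num_blocks):
            out += chunk.sum()  # each block adds its partial sum, as the GPU atomic adds would
        return out

    np.random.seed(0)
    x = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32)
    assert np.allclose(atomic_style_sum(x), x.sum(), rtol=1.e-4, atol=1.e-3)

Without the zero-fill, every partial sum would be added on top of whatever stale values the buffer held,
which is why the clean kernel must run before the rewritten composite kernel.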