| @@ -1 +1 @@ | |||||
| Subproject commit 6ffe9c24319d7297d0feeb10ee2bd8135e24c5c8 | |||||
| Subproject commit 0a0338fecd54c654c1992af156d41e036569343c | |||||
| @@ -37,7 +37,9 @@ def expand_gkdropout(expand_info): | |||||
| keep_prob_v = graph_builder.value(input_x.dtype, keep_prob, "DefaultFormat") | keep_prob_v = graph_builder.value(input_x.dtype, keep_prob, "DefaultFormat") | ||||
| r_keep_prob = graph_builder.value(input_x.dtype, 1.0 / keep_prob, "DefaultFormat") | r_keep_prob = graph_builder.value(input_x.dtype, 1.0 / keep_prob, "DefaultFormat") | ||||
| mask = graph_builder.emit('LessEqual', [input_mask, keep_prob_v]) | |||||
| if input_mask.dtype != input_x.dtype: | |||||
| input_mask = graph_builder.emit('Cast', [input_mask], attrs={'dst_type': input_x.dtype}) | |||||
| mask = graph_builder.emit('LessEqual', [input_mask, keep_prob_v]) # output is bool type | |||||
| mask = graph_builder.emit('Cast', [mask], attrs={'dst_type': input_x.dtype}) | mask = graph_builder.emit('Cast', [mask], attrs={'dst_type': input_x.dtype}) | ||||
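| # after the cast, mask is 1.0 where input_mask <= keep_prob and 0.0 elsewhere | |||||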
| # compute result | # compute result | ||||
| @@ -16,7 +16,7 @@ | |||||
| from .model import PrimLib, Graph, Tensor | from .model import PrimLib, Graph, Tensor | ||||
| use_poly_reduce = False | |||||
| use_poly_reduce = True | |||||
| class GraphSplitByPattern: | class GraphSplitByPattern: | ||||
| """Graph splitter""" | """Graph splitter""" | ||||
| @@ -204,6 +204,16 @@ class CompositeGraph: | |||||
| def load(self, desc): | def load(self, desc): | ||||
| """Load Graph from json""" | """Load Graph from json""" | ||||
| def _attr_of(op, inputs, output): | def _attr_of(op, inputs, output): | ||||
| def _get_axis_while_none(input_shape, output_shape): | |||||
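| # Infer the reduce axes when the 'axis' attr is empty, e.g. input_shape [4, 5, 6] with output_shape [1, 5, 1] gives [0, 2]. | |||||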
| red_axis = [] | |||||
| if len(output_shape) == len(input_shape): | |||||
| for i, s in enumerate(output_shape): | |||||
| if s == 1 and input_shape[i] > 1: | |||||
| red_axis.append(i) | |||||
| else: | |||||
| red_axis = list(range(len(output_shape))) | |||||
| return red_axis | |||||
| attr = {} | attr = {} | ||||
| if op['name'] not in ('ReduceSum', 'ReduceMax', 'ReduceMin'): | if op['name'] not in ('ReduceSum', 'ReduceMax', 'ReduceMin'): | ||||
| return attr | return attr | ||||
| @@ -211,10 +221,7 @@ class CompositeGraph: | |||||
| if a['name'] == 'axis': | if a['name'] == 'axis': | ||||
| red_axis, dim_size = [], len(inputs[0].shape) | red_axis, dim_size = [], len(inputs[0].shape) | ||||
| if not a['value']: | if not a['value']: | ||||
| assert len(output.shape) == len(inputs[0].shape) | |||||
| for i in range(len(output.shape)): | |||||
| if output.shape[i] == 1 and inputs[0].shape[i] > 1: | |||||
| red_axis.append(i) | |||||
| red_axis = _get_axis_while_none(inputs[0].shape, output.shape) | |||||
| else: | else: | ||||
| if isinstance(a['value'], int): | if isinstance(a['value'], int): | ||||
| a['value'] = [a['value']] | a['value'] = [a['value']] | ||||
| @@ -244,7 +244,11 @@ bool AkgKernelJsonGenerator::CreateOutputDescJson(const AnfNodePtr &anf_node, co | |||||
| output_json[kJsonKeyFormat] = this->GetOutputFormat(anf_node, i); | output_json[kJsonKeyFormat] = this->GetOutputFormat(anf_node, i); | ||||
| output_json[kJsonKeyName] = output_name; | output_json[kJsonKeyName] = output_name; | ||||
| output_json[kJsonKeyTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); | output_json[kJsonKeyTensorName] = "output_" + std::to_string(i) + "_" + std::to_string(GetOutputTensorIdxInc()); | ||||
| output_json[kJsonKeyShape] = this->GetOutputShape(anf_node, i); | |||||
| auto output_shape = this->GetOutputShape(anf_node, i); | |||||
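| // An empty shape (e.g. from a scalar output) is padded with 1 so the kernel json always carries a valid shape. | |||||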
| if (output_shape.empty()) { | |||||
| output_shape.push_back(1); | |||||
| } | |||||
| output_json[kJsonKeyShape] = output_shape; | |||||
| outputs_json->push_back(output_json); | outputs_json->push_back(output_json); | ||||
| } | } | ||||
| return true; | return true; | ||||
| @@ -680,7 +684,11 @@ nlohmann::json AkgKernelJsonGenerator::CreateInputsJson(const std::vector<AnfNod | |||||
| GetTensorName(node_json_map.at(tmp_input.first), kJsonKeyInputDesc, tmp_input.second); | GetTensorName(node_json_map.at(tmp_input.first), kJsonKeyInputDesc, tmp_input.second); | ||||
| input_desc_json[kJsonKeyDataType] = dtype; | input_desc_json[kJsonKeyDataType] = dtype; | ||||
| input_desc_json[kJsonKeyFormat] = this->GetInputFormat(tmp_input.first, tmp_input.second.first); | input_desc_json[kJsonKeyFormat] = this->GetInputFormat(tmp_input.first, tmp_input.second.first); | ||||
| input_desc_json[kJsonKeyShape] = this->GetInputShape(tmp_input.first, tmp_input.second.first); | |||||
| auto input_shape = this->GetInputShape(tmp_input.first, tmp_input.second.first); | |||||
| if (input_shape.empty()) { | |||||
| input_shape.push_back(1); | |||||
| } | |||||
| input_desc_json[kJsonKeyShape] = input_shape; | |||||
| inputs_json.emplace_back(std::vector<nlohmann::json>{input_desc_json}); | inputs_json.emplace_back(std::vector<nlohmann::json>{input_desc_json}); | ||||
| } | } | ||||
| return inputs_json; | return inputs_json; | ||||
| @@ -0,0 +1,505 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" | |||||
| #include <algorithm> | |||||
| #include <functional> | |||||
| #include <list> | |||||
| #include <map> | |||||
| #include <memory> | |||||
| #include <utility> | |||||
| #include <set> | |||||
| #include <stack> | |||||
| #include <string> | |||||
| #include <vector> | |||||
| #include "base/core_ops.h" | |||||
| #include "ir/tensor.h" | |||||
| #include "utils/utils.h" | |||||
| #include "utils/log_adapter.h" | |||||
| #include "backend/kernel_compiler/kernel.h" | |||||
| #include "backend/optimizer/graph_kernel/graph_kernel_helper.h" | |||||
| #include "backend/session/anf_runtime_algorithm.h" | |||||
| #include "backend/session/kernel_graph.h" | |||||
| #include "debug/anf_ir_dump.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| namespace { | |||||
| bool SuitableForAtomicAdd(const AnfNodePtr &node) { | |||||
| if (!IsPrimitiveCNode(node, prim::kPrimReduceSum)) { | |||||
| MS_LOG(EXCEPTION) << "Only process for reduce sum!"; | |||||
| } | |||||
| auto input = node->cast<CNodePtr>()->input(kFirstDataInputIndex); | |||||
| auto src_shape_vec = GetShape(input); | |||||
| auto axis_vec = GetReduceAxis(node); | |||||
| if (axis_vec.empty()) { | |||||
| for (size_t i = 0; i < src_shape_vec.size(); ++i) { | |||||
| axis_vec.push_back(i); | |||||
| } | |||||
| } else { | |||||
| std::transform(axis_vec.begin(), axis_vec.end(), axis_vec.begin(), | |||||
| [&src_shape_vec](int64_t axis) -> int64_t { return axis < 0 ? axis + src_shape_vec.size() : axis; }); | |||||
| } | |||||
| std::set<int64_t> axis_set(axis_vec.begin(), axis_vec.end()); | |||||
| // For reduce whose last dim is reduced (including all-reduce), | |||||
| // it is suitable for atomic add only when the reduce size is greater than or equal to 1024. | |||||
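| // e.g. reducing a (32, 64) tensor over axis 1 gives reduce size 64 (not suitable), while (32, 2048) gives 2048 (suitable). | |||||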
| if (axis_set.count(src_shape_vec.size() - 1) != 0) { | |||||
| size_t reduce_size = | |||||
| std::accumulate(axis_set.begin(), axis_set.end(), 1, | |||||
| [&src_shape_vec](size_t size, int64_t axis) { return size * src_shape_vec[axis]; }); | |||||
| return reduce_size >= 1024; | |||||
| } | |||||
| // For reduce whose last dim is not reduced, atomic add is always suitable. | |||||
| return true; | |||||
| } | |||||
| bool HaveReduceInPredecessors(const AnfNodePtr &node) { | |||||
| std::stack<AnfNodePtr> st; | |||||
| st.push(node); | |||||
| while (!st.empty()) { | |||||
| auto n = st.top(); | |||||
| st.pop(); | |||||
| if (n != node) { | |||||
| if (!n->isa<CNode>()) { | |||||
| continue; | |||||
| } | |||||
| if (IsPrimitiveCNode(n, prim::kPrimReduceSum)) { | |||||
| return true; | |||||
| } | |||||
| } | |||||
| auto n_inputs = n->cast<CNodePtr>()->inputs(); | |||||
| std::for_each(n_inputs.cbegin() + 1, n_inputs.cend(), [&st](const AnfNodePtr &n) -> void { st.push(n); }); | |||||
| } | |||||
| return false; | |||||
| } | |||||
| inline int64_t CalNewIndex(int64_t old_index, int64_t reduce_index) { | |||||
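| // e.g. with reduce_index 1, old getitem indices {0, 2, 3} map to {0, 1, 2} once the reduce output is removed from the tuple. | |||||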
| return old_index - (old_index > reduce_index ? 1 : 0); | |||||
| } | |||||
| } // namespace | |||||
| bool AtomicCleanInsertter::CanActivateAtomicAdd(const AnfNodePtr &anf_node) { | |||||
| auto node = anf_node->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(node); | |||||
| auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(node); | |||||
| auto mng_sub = sub_graph->manager(); | |||||
| if (mng_sub == nullptr) { | |||||
| mng_sub = Manage(sub_graph, false); | |||||
| sub_graph->set_manager(mng_sub); | |||||
| } | |||||
| // Rules to activate atomic add: | |||||
| // 1. ReduceSum should not fuse any other ops in the output direction, which means it should be in the output list. | |||||
| // 2. Only one ReduceSum is in the output list. | |||||
| // 3. The reduce axis and reduce size should meet the condition: all-reduce or reduce-x with a reduce size greater than | |||||
| //    or equal to 1024, or reduce-y. | |||||
| // 4. No other ReduceSum among the output ReduceSum's predecessors (reduce compile limitation). | |||||
| // Rule 2. | |||||
| auto real_return_node = sub_graph->get_return()->input(kFirstDataInputIndex); | |||||
| if (IsPrimitiveCNode(real_return_node, prim::kPrimMakeTuple)) { | |||||
| AnfNodePtrList reduce_ops; | |||||
| size_t reduce_cnt = 0; | |||||
| const auto &inputs = real_return_node->cast<CNodePtr>()->inputs(); | |||||
| for (size_t i = 1; i < inputs.size(); ++i) { | |||||
| if (IsPrimitiveCNode(inputs[i], prim::kPrimReduceSum)) { | |||||
| atomic_add_node_ = inputs[i]->cast<CNodePtr>(); | |||||
| reduce_real_output_index_ = i - 1; | |||||
| reduce_cnt++; | |||||
| } | |||||
| } | |||||
| if (reduce_cnt != 1) { | |||||
| return false; | |||||
| } | |||||
| } else if (IsPrimitiveCNode(real_return_node, prim::kPrimReduceSum)) { | |||||
| atomic_add_node_ = real_return_node->cast<CNodePtr>(); | |||||
| } else { | |||||
| return false; | |||||
| } | |||||
| // Rule 1. | |||||
| if (mng_sub->node_users()[atomic_add_node_].size() > 1) { | |||||
| return false; | |||||
| } | |||||
| // Rule 3 and 4. | |||||
| if (!SuitableForAtomicAdd(atomic_add_node_) || HaveReduceInPredecessors(atomic_add_node_)) { | |||||
| return false; | |||||
| } | |||||
| return true; | |||||
| } | |||||
| void AtomicCleanInsertter::CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input) { | |||||
| // Change kernel build info. | |||||
| auto kernel_info = static_cast<device::KernelInfo *>(composite_node->kernel_info()); | |||||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||||
| const auto &origin_kernel_build_info = kernel_info->GetMutableSelectKernelBuildInfo(); | |||||
| auto origin_inputs_format = origin_kernel_build_info->GetAllInputFormats(); | |||||
| auto origin_outputs_format = origin_kernel_build_info->GetAllOutputFormats(); | |||||
| auto origin_inputs_type = origin_kernel_build_info->GetAllInputDeviceTypes(); | |||||
| auto origin_outputs_type = origin_kernel_build_info->GetAllOutputDeviceTypes(); | |||||
| auto origin_processor = origin_kernel_build_info->processor(); | |||||
| std::vector<std::string> &new_inputs_format = origin_inputs_format; | |||||
| std::vector<TypeId> &new_inputs_type = origin_inputs_type; | |||||
| std::vector<std::string> new_outputs_format; | |||||
| std::vector<TypeId> new_outputs_type; | |||||
| for (size_t i = 0; i < origin_outputs_format.size(); ++i) { | |||||
| if (real_output_num_ > 1 && i == reduce_real_output_index_) { | |||||
| continue; | |||||
| } | |||||
| new_outputs_format.push_back(origin_outputs_format[i]); | |||||
| new_outputs_type.push_back(origin_outputs_type[i]); | |||||
| } | |||||
| auto kernel_with_index = AnfAlgo::VisitKernel(new_input, 0); | |||||
| new_inputs_format.push_back(AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second)); | |||||
| new_inputs_type.push_back(AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second)); | |||||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder new_info_builder; | |||||
| new_info_builder.SetInputsFormat(new_inputs_format); | |||||
| new_info_builder.SetInputsDeviceType(new_inputs_type); | |||||
| new_info_builder.SetOutputsFormat(new_outputs_format); | |||||
| new_info_builder.SetOutputsDeviceType(new_outputs_type); | |||||
| new_info_builder.SetProcessor(origin_processor); | |||||
| new_info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||||
| new_info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||||
| auto new_selected_info = new_info_builder.Build(); | |||||
| AnfAlgo::SetSelectKernelBuildInfo(new_selected_info, composite_node.get()); | |||||
| } | |||||
| void AtomicCleanInsertter::CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, | |||||
| const AnfNodePtr &new_parameter) { | |||||
| // Add an InplaceAssign node. | |||||
| AnfNodePtr out_node; | |||||
| bool fake_out = false; | |||||
| size_t replace_index = 0; | |||||
| auto return_node = sub_graph->get_return()->input(kFirstDataInputIndex); | |||||
| if (IsPrimitiveCNode(return_node, prim::kPrimMakeTuple)) { | |||||
| const auto &outs = return_node->cast<CNodePtr>()->inputs(); | |||||
| real_output_num_ = outs.size() - 1; | |||||
| for (size_t i = 1; i < outs.size(); ++i) { | |||||
| if (i != reduce_real_output_index_ + 1) { | |||||
| out_node = outs[i]; | |||||
| replace_index = i; | |||||
| break; | |||||
| } | |||||
| } | |||||
| } else { | |||||
| real_output_num_ = 1; | |||||
| out_node = atomic_add_node_;  // Use the result data itself, and set the "fake_output" attr to true. | |||||
| fake_out = true; | |||||
| } | |||||
| auto inplace_assign_node = | |||||
| CreateCNode({NewValueNode(std::make_shared<Primitive>("InplaceAssign")), new_parameter, atomic_add_node_, out_node}, | |||||
| sub_graph, {.format = GetFormat(out_node), .shape = GetShape(out_node), .type = GetType(out_node)}); | |||||
| AnfAlgo::SetNodeAttr("fake_output", MakeValue(fake_out), inplace_assign_node); | |||||
| CNodePtr new_out_node; | |||||
| if (real_output_num_ > 2) { | |||||
| std::vector<AnfNodePtr> output_args = {NewValueNode(prim::kPrimMakeTuple)}; | |||||
| const auto &outs = return_node->cast<CNodePtr>()->inputs(); | |||||
| for (size_t i = 1; i < outs.size(); ++i) { | |||||
| if (i == reduce_real_output_index_ + 1) { | |||||
| continue; | |||||
| } else if (i == replace_index) { | |||||
| output_args.push_back(inplace_assign_node); | |||||
| } else { | |||||
| output_args.push_back(outs[i]); | |||||
| } | |||||
| } | |||||
| // Set output for AnfGraph | |||||
| new_out_node = sub_graph->NewCNode(output_args); | |||||
| } else { | |||||
| new_out_node = inplace_assign_node; | |||||
| } | |||||
| sub_graph->set_output(new_out_node); | |||||
| } | |||||
| void AtomicCleanInsertter::CorrectAbstract(const AnfNodePtr &composite_node) { | |||||
| // If there is only one output (the ReduceSum), it should be a fake output with the same abstract as the origin output. | |||||
| if (real_output_num_ <= 1) { | |||||
| return; | |||||
| } | |||||
| // Change abstract. | |||||
| auto origin_out_spec = composite_node->abstract()->cast<abstract::AbstractTuplePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(origin_out_spec); | |||||
| const auto &origin_out_specs = origin_out_spec->elements(); | |||||
| AbstractBasePtrList new_out_specs; | |||||
| for (size_t i = 0; i < origin_out_specs.size(); ++i) { | |||||
| if (i != reduce_real_output_index_) { | |||||
| new_out_specs.push_back(origin_out_specs[i]); | |||||
| } | |||||
| } | |||||
| composite_node->set_abstract(std::make_shared<abstract::AbstractTuple>(new_out_specs)); | |||||
| } | |||||
| void AtomicCleanInsertter::ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, | |||||
| const FuncGraphManagerPtr &mng) { | |||||
| auto sub_graph = AnfAlgo::GetCNodeFuncGraphPtr(composite_node); | |||||
| auto mng_sub = sub_graph->manager(); | |||||
| if (mng_sub == nullptr) { | |||||
| mng_sub = Manage(sub_graph, false); | |||||
| sub_graph->set_manager(mng_sub); | |||||
| } | |||||
| // Add the atomic attribute to the ReduceSum node. | |||||
| AnfAlgo::SetNodeAttr("enable_atomic_add", MakeValue(true), atomic_add_node_); | |||||
| // add input | |||||
| auto inputs = composite_node->cast<CNodePtr>()->inputs(); | |||||
| inputs.push_back(new_input); | |||||
| composite_node->cast<CNodePtr>()->set_inputs(inputs); | |||||
| // add parameter | |||||
| auto parameter = sub_graph->add_parameter(); | |||||
| parameter->set_abstract(new_input->abstract()); | |||||
| parameter->set_kernel_info(new_input->kernel_info_ptr()); | |||||
| CreateInplaceAssignNodeAndCorrectReturn(sub_graph, parameter); | |||||
| CorrectAbstract(composite_node); | |||||
| CorrectKernelBuildInfo(composite_node, new_input); | |||||
| auto old_graph_name = GetValue<std::string>(sub_graph->get_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL)); | |||||
| auto new_graph_name = ExtractGraphKernelName(TopoSort(sub_graph->get_return()), "", "atomic_add"); | |||||
| sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(new_graph_name)); | |||||
| MS_LOG(INFO) << "Convert " << old_graph_name << " to atomic add graph " << new_graph_name; | |||||
| } | |||||
| void AtomicCleanInsertter::AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, | |||||
| const AnfNodePtr &composite_node, const AnfNodePtr &user_node, int index) { | |||||
| // Create a Depend node so that the user consumes the clean node's output only after the composite node executes. | |||||
| AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), clean_node, composite_node}; | |||||
| auto depend_cnode = main_graph->NewCNode(d_inputs); | |||||
| depend_cnode->set_abstract(clean_node->abstract()); | |||||
| main_graph->AddNode(depend_cnode); | |||||
| auto user_cnode = user_node->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||||
| user_cnode->set_input(index, depend_cnode); | |||||
| } | |||||
| void AtomicCleanInsertter::AddControlDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &pre_node, | |||||
| const AnfNodePtr &post_node, const FuncGraphManagerPtr &mng) { | |||||
| // Collect the users of the post node first. | |||||
| auto post_users = mng->node_users()[post_node]; | |||||
| // Create a ControlDepend node: the first input is the composite op (pre_node), the second is its user (post_node). | |||||
| AnfNodePtrList cd_inputs = {NewValueNode(prim::kPrimControlDepend), pre_node, post_node}; | |||||
| auto control_depend_cnode = main_graph->NewCNode(cd_inputs); | |||||
| main_graph->AddNode(control_depend_cnode); | |||||
| // Create depend node to hold new control depend node. | |||||
| AnfNodePtrList d_inputs = {NewValueNode(prim::kPrimDepend), post_node, control_depend_cnode}; | |||||
| auto depend_cnode = main_graph->NewCNode(d_inputs); | |||||
| depend_cnode->set_abstract(post_node->abstract()); | |||||
| main_graph->AddNode(depend_cnode); | |||||
| for (const auto &[user_node, index] : post_users) { | |||||
| auto user_cnode = user_node->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||||
| user_cnode->set_input(index, depend_cnode); | |||||
| } | |||||
| } | |||||
| CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type) { | |||||
| std::set<TypeId> data_support = {kNumberTypeFloat16, kNumberTypeFloat32, kNumberTypeFloat64}; | |||||
| if (!std::any_of(data_support.cbegin(), data_support.cend(), [&dst_type](TypeId type) { return dst_type == type; })) { | |||||
| MS_LOG(EXCEPTION) << "Atomic add not support data type " << dst_type; | |||||
| } | |||||
| // Create zero value which will be broadcast to target shape. | |||||
| auto format = GetFormat(atomic_add_node_); | |||||
| auto dtype = (dst_type == kNumberTypeFloat16) ? kNumberTypeFloat32 : dst_type; | |||||
| ValueNodePtr value_node; | |||||
| if (dtype == kNumberTypeFloat32) { | |||||
| value_node = CreateScalarTensorValueNode<float>({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, | |||||
| static_cast<float>(0), sizeof(float)); | |||||
| } else { | |||||
| value_node = CreateScalarTensorValueNode<double>({.format = format, .shape = {1}, .type = TypeIdToType(dtype)}, | |||||
| static_cast<double>(0), sizeof(double)); | |||||
| } | |||||
| // Create composite op's sub-graph. | |||||
| auto new_sub_graph = std::make_shared<FuncGraph>(); | |||||
| auto parameter = new_sub_graph->add_parameter(); | |||||
| parameter->set_abstract(value_node->abstract()); | |||||
| parameter->set_kernel_info(value_node->kernel_info_ptr()); | |||||
| AnfNodePtr broadcast_input_node = parameter; | |||||
| if (dst_type == kNumberTypeFloat16) { | |||||
| AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), parameter}; | |||||
| auto cast_node_inner = | |||||
| CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)}); | |||||
| AnfAlgo::SetNodeAttr("dst_type", MakeValue("float32"), cast_node_inner); | |||||
| broadcast_input_node = cast_node_inner; | |||||
| } | |||||
| // Create broadcast basic op. | |||||
| auto dst_shape_vec = GetShape(atomic_add_node_); | |||||
| if (dst_shape_vec.empty()) { | |||||
| dst_shape_vec.push_back(1); | |||||
| } | |||||
| AnfNodePtrList atomic_clean_inputs = {NewValueNode(std::make_shared<Primitive>(kBroadcastToOpName)), | |||||
| broadcast_input_node}; | |||||
| auto broadcast_to_node_inner = CreateCNode( | |||||
| atomic_clean_inputs, new_sub_graph, {.format = format, .shape = dst_shape_vec, .type = GetType(atomic_add_node_)}); | |||||
| AnfAlgo::SetNodeAttr("shape", MakeValue(dst_shape_vec), broadcast_to_node_inner); | |||||
| // Makeup sub-graph. | |||||
| new_sub_graph->set_output(broadcast_to_node_inner); | |||||
| auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph), value_node}); | |||||
| broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract()); | |||||
| SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {value_node}, {broadcast_to_node_inner}, | |||||
| kernel::Processor::CUDA); | |||||
| auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean"); | |||||
| new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr)); | |||||
| // mng->AddFuncGraph(new_sub_graph); | |||||
| return broadcast_to_composite_node; | |||||
| } | |||||
| void AtomicCleanInsertter::ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, | |||||
| const AnfNodePtr &broadcast_to_node, const FuncGraphManagerPtr &mng) { | |||||
| // 1. Find the users, and change the getitem index if needed. | |||||
| std::vector<std::pair<AnfNodePtr, int> > reduce_user_nodes; | |||||
| if (real_output_num_ <= 1) { | |||||
| auto users = mng->node_users()[composite_node]; | |||||
| std::transform(users.cbegin(), users.cend(), std::back_inserter(reduce_user_nodes), | |||||
| [](const std::pair<AnfNodePtr, int> &pair) { return pair; }); | |||||
| } else { | |||||
| std::vector<std::pair<AnfNodePtr, int> > getitem_user_nodes; | |||||
| auto users = mng->node_users()[composite_node]; | |||||
| for (const auto &node_index : users) { | |||||
| const auto &user_node = node_index.first; | |||||
| if (!IsPrimitiveCNode(user_node, prim::kPrimTupleGetItem)) { | |||||
| continue; | |||||
| } | |||||
| auto get_item_cnode = user_node->cast<CNodePtr>(); | |||||
| auto value_input = get_item_cnode->input(kInputNodeOutputIndexInTupleGetItem); | |||||
| MS_EXCEPTION_IF_NULL(value_input); | |||||
| auto value_node = value_input->cast<ValueNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(value_node); | |||||
| auto item_idx = GetValue<int64_t>(value_node->value()); | |||||
| if (item_idx == static_cast<int64_t>(reduce_real_output_index_)) { | |||||
| getitem_user_nodes.push_back(node_index); | |||||
| } else { | |||||
| if (real_output_num_ > 2) { | |||||
| // Correct the other getitem indices. | |||||
| int64_t new_item_idx = CalNewIndex(item_idx, reduce_real_output_index_); | |||||
| AnfNodePtrList new_inputs = {NewValueNode(prim::kPrimTupleGetItem), composite_node, | |||||
| NewValueNode(new_item_idx)}; | |||||
| auto new_out = main_graph->NewCNode(new_inputs); | |||||
| new_out->set_abstract(get_item_cnode->abstract()); | |||||
| for (const auto &[user, index] : mng->node_users()[get_item_cnode]) { | |||||
| auto user_cnode = user->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||||
| user_cnode->set_input(index, new_out); | |||||
| } | |||||
| } else { | |||||
| for (const auto &[user, index] : mng->node_users()[node_index.first]) { | |||||
| auto user_cnode = user->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||||
| user_cnode->set_input(index, composite_node); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| for (auto &pair : getitem_user_nodes) { | |||||
| // Go through the getitem node directly to find the real users. | |||||
| auto real_users = mng->node_users()[pair.first]; | |||||
| reduce_user_nodes.insert(reduce_user_nodes.end(), real_users.begin(), real_users.end()); | |||||
| } | |||||
| } | |||||
| for (const auto &[user_node, index] : reduce_user_nodes) { | |||||
| // 2. Set the atomic-clean (broadcast) output as the user's input. | |||||
| auto user_cnode = user_node->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(user_cnode); | |||||
| user_cnode->set_input(index, broadcast_to_node); | |||||
| // mng->SetEdge(user_node, index, broadcast_to_node); | |||||
| // 3. Make sure the modified composite node runs first. | |||||
| //    * To keep the origin node's dependency relations, add ControlDepend and Depend nodes. | |||||
| //    * For the Return node and the output node, a ControlDepend node would change the order of these two nodes, which | |||||
| //      may make the main graph fail to run. So only a Depend node is added to enforce the execution order. | |||||
| if (IsPrimitiveCNode(user_node, prim::kPrimReturn) || user_node == main_graph->output()) { | |||||
| AddDepend(main_graph, broadcast_to_node, composite_node, user_node, index); | |||||
| } else { | |||||
| AddControlDepend(main_graph, composite_node, user_node, mng); | |||||
| } | |||||
| } | |||||
| } | |||||
| void AtomicCleanInsertter::InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, | |||||
| const FuncGraphManagerPtr &mng) { | |||||
| auto origin_composite_node = anf_node->cast<CNodePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(origin_composite_node); | |||||
| // Create the broadcast node. | |||||
| auto out_type = GetType(atomic_add_node_)->cast<TensorTypePtr>(); | |||||
| MS_EXCEPTION_IF_NULL(out_type); | |||||
| auto broadcast_to_node = CreateAtomicCleanCompositeNode(main_graph, out_type->element()->type_id()); | |||||
| // Insert an extra input (the broadcast node's output) into the composite node, and make ReduceSum InplaceAssign to it. | |||||
| // Note: if there is a single output, this will increase total memory because of a fake output. | |||||
| ProcessOriginCNode(origin_composite_node, broadcast_to_node, mng); | |||||
| // Replace origin ReduceSum's user with atomic clean output, and add control depend from composite op to user. | |||||
| ProcessOriginCNodeUser(main_graph, origin_composite_node, broadcast_to_node, mng); | |||||
| } | |||||
| bool AtomicCleanInsertter::Run(const FuncGraphPtr &func_graph) { | |||||
| auto kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(func_graph); | |||||
| MS_EXCEPTION_IF_NULL(kernel_graph); | |||||
| auto mng = kernel_graph->manager(); | |||||
| if (mng == nullptr) { | |||||
| mng = Manage(kernel_graph, true); | |||||
| kernel_graph->set_manager(mng); | |||||
| } | |||||
| bool changed = false; | |||||
| auto topo_nodes = TopoSort(kernel_graph->get_return()); | |||||
| for (const auto &node : topo_nodes) { | |||||
| if (!AnfAlgo::IsGraphKernel(node) || !CanActivateAtomicAdd(node)) { | |||||
| continue; | |||||
| } | |||||
| InsertAtomicClean(kernel_graph, node, mng); | |||||
| changed = true; | |||||
| } | |||||
| if (changed) { | |||||
| mng->RemoveRoots(); | |||||
| mng->KeepRoots({func_graph}); | |||||
| } | |||||
| return changed; | |||||
| } | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| @@ -0,0 +1,57 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ | |||||
| #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ | |||||
| #include <memory> | |||||
| #include <vector> | |||||
| #include "backend/optimizer/common/optimizer.h" | |||||
| #include "backend/session/kernel_graph.h" | |||||
| namespace mindspore { | |||||
| namespace opt { | |||||
| class AtomicCleanInsertter : public Pass { | |||||
| public: | |||||
| AtomicCleanInsertter() : Pass("atomic_clean") {} | |||||
| ~AtomicCleanInsertter() override = default; | |||||
| bool Run(const FuncGraphPtr &func_graph) override; | |||||
| private: | |||||
| void ProcessOriginCNode(const AnfNodePtr &composite_node, const AnfNodePtr &new_input, | |||||
| const FuncGraphManagerPtr &mng); | |||||
| bool CanActivateAtomicAdd(const AnfNodePtr &anf_node); | |||||
| void InsertAtomicClean(const KernelGraphPtr &main_graph, const AnfNodePtr &anf_node, const FuncGraphManagerPtr &mng); | |||||
| void AddDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &clean_node, const AnfNodePtr &composite_node, | |||||
| const AnfNodePtr &user_node, int index); | |||||
| void AddControlDepend(const FuncGraphPtr &main_graph, const AnfNodePtr &pre_node, const AnfNodePtr &post_node, | |||||
| const FuncGraphManagerPtr &mng); | |||||
| void CreateInplaceAssignNodeAndCorrectReturn(const FuncGraphPtr &sub_graph, const AnfNodePtr &new_parameter); | |||||
| void CorrectAbstract(const AnfNodePtr &composite_node); | |||||
| void CorrectKernelBuildInfo(const AnfNodePtr &composite_node, const AnfNodePtr &new_input); | |||||
| CNodePtr CreateAtomicCleanCompositeNode(const KernelGraphPtr &main_graph, TypeId dst_type); | |||||
| void ProcessOriginCNodeUser(const KernelGraphPtr &main_graph, const AnfNodePtr &composite_node, | |||||
| const AnfNodePtr &broadcast_to_node, const FuncGraphManagerPtr &mng); | |||||
| CNodePtr atomic_add_node_{nullptr}; | |||||
| size_t reduce_real_output_index_{0}; | |||||
| size_t real_output_num_{0}; | |||||
| }; | |||||
| using AtomicCleanInsertterPtr = std::shared_ptr<AtomicCleanInsertter>; | |||||
| } // namespace opt | |||||
| } // namespace mindspore | |||||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_ADD_ATOMIC_CLEAN_GPU_H_ | |||||
| @@ -30,7 +30,9 @@ bool IsCNodePrimitveEqual(const CNodePtr &main, const CNodePtr &node) { | |||||
| auto main_primitive = AnfAlgo::GetCNodePrimitive(main); | auto main_primitive = AnfAlgo::GetCNodePrimitive(main); | ||||
| auto node_primitive = AnfAlgo::GetCNodePrimitive(node); | auto node_primitive = AnfAlgo::GetCNodePrimitive(node); | ||||
| if (main_primitive != nullptr && node_primitive != nullptr) { | if (main_primitive != nullptr && node_primitive != nullptr) { | ||||
| if (main_primitive->name() != node_primitive->name()) { | |||||
| // Some ops such as Reshape are not real ops, so applying CSE to them brings no gain. And for ops fusion, keeping these ops | |||||
| // separate can prevent some redundant output cases (input -> reshape -> output). | |||||
| if (main_primitive->name() != node_primitive->name() || IsPrimitiveCNode(node, prim::kPrimReshape)) { | |||||
| return false; | return false; | ||||
| } | } | ||||
| @@ -908,5 +908,126 @@ void ReplaceNewFuseCNodeForDependPrior(std::multimap<AnfNodePtr, std::pair<AnfNo | |||||
| depend_prior->insert(item); | depend_prior->insert(item); | ||||
| } | } | ||||
| } | } | ||||
| std::string GetFormat(const AnfNodePtr &node) { | |||||
| auto kernel_info = static_cast<device::KernelInfo *>(node->kernel_info()); | |||||
| MS_EXCEPTION_IF_NULL(kernel_info); | |||||
| auto kernel_build_info = kernel_info->select_kernel_build_info(); | |||||
| MS_EXCEPTION_IF_NULL(kernel_build_info); | |||||
| return kernel_build_info->GetOutputFormat(0); | |||||
| } | |||||
| TypePtr GetType(const AnfNodePtr &node) { | |||||
| const auto &abstract = node->abstract(); | |||||
| MS_EXCEPTION_IF_NULL(abstract); | |||||
| auto type = abstract->BuildType(); | |||||
| MS_EXCEPTION_IF_NULL(type); | |||||
| return type; | |||||
| } | |||||
| ShapeVector GetShape(const AnfNodePtr &node) { | |||||
| auto abstract = node->abstract(); | |||||
| MS_EXCEPTION_IF_NULL(abstract); | |||||
| auto shape = abstract->GetShapeTrack(); | |||||
| if (shape == nullptr || !shape->isa<abstract::Shape>()) { | |||||
| MS_LOG(EXCEPTION) << "Cannot get shape from " << node->fullname_with_scope(); | |||||
| } | |||||
| return shape->cast<abstract::ShapePtr>()->shape(); | |||||
| } | |||||
| std::vector<int64_t> GetReduceAxis(const AnfNodePtr &node) { | |||||
| auto prim = GetCNodePrimitive(node); | |||||
| MS_EXCEPTION_IF_NULL(prim); | |||||
| const auto &attrs = prim->attrs(); | |||||
| auto iter = attrs.find("axis"); | |||||
| if (iter == attrs.end()) { | |||||
| MS_LOG(EXCEPTION) << "Origin node have no attributes!"; | |||||
| } | |||||
| std::vector<int64_t> axis; | |||||
| auto &v = iter->second; | |||||
| if (v->isa<ValueList>() || v->isa<ValueTuple>()) { | |||||
| auto vec = v->isa<ValueList>() ? v->cast<ValueListPtr>()->value() : v->cast<ValueTuplePtr>()->value(); | |||||
| for (auto value : vec) { | |||||
| if (value->isa<Int64Imm>()) { | |||||
| axis.push_back(GetValue<int64_t>(value)); | |||||
| } else { | |||||
| MS_LOG(EXCEPTION) << "Reduce axis type should be int64!"; | |||||
| } | |||||
| } | |||||
| } else if (v->isa<Int64Imm>()) { | |||||
| axis.push_back(GetValue<int64_t>(v)); | |||||
| } else { | |||||
| MS_LOG(EXCEPTION) << "Reduce axis should be a list or tuple!"; | |||||
| } | |||||
| return axis; | |||||
| } | |||||
| CNodePtr CreateCNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph, const DataInfo &out_info) { | |||||
| // Limitations: 1. The node's attributes should be set outside this function; 2. only one output is supported. | |||||
| MS_EXCEPTION_IF_NULL(out_info.type); | |||||
| auto out_type = out_info.type; | |||||
| if (auto otype = out_info.type->cast<TensorTypePtr>(); otype != nullptr) { | |||||
| out_type = otype->element(); | |||||
| } | |||||
| // Create CNode. | |||||
| auto cnode = func_graph->NewCNode(inputs); | |||||
| MS_EXCEPTION_IF_NULL(cnode); | |||||
| // Setup abstract. | |||||
| auto abs_tensor = std::make_shared<abstract::AbstractTensor>(out_type, out_info.shape); | |||||
| cnode->set_abstract(abs_tensor); | |||||
| // Setup kernel info. | |||||
| auto kernel_info = std::make_shared<device::KernelInfo>(); | |||||
| cnode->set_kernel_info(kernel_info); | |||||
| std::vector<size_t> feature_map_input_indexs; | |||||
| kernel_info->set_feature_map_flag(false); | |||||
| for (size_t i = 1; i < inputs.size(); ++i) { | |||||
| if (AnfAlgo::IsFeatureMapOutput(inputs[i])) { | |||||
| kernel_info->set_feature_map_flag(true); | |||||
| feature_map_input_indexs.push_back(i); | |||||
| } | |||||
| } | |||||
| if (inputs.size() == 1) { | |||||
| kernel_info->set_feature_map_flag(true); | |||||
| } | |||||
| if (AnfAlgo::IsRealKernel(cnode)) { | |||||
| // If the node only has the primitive (such as GetNext) or one of the node's inputs is a feature map, | |||||
| // then the node's output is a feature map output. | |||||
| AnfAlgo::SetNodeAttr(kIsFeatureMapOutput, MakeValue(kernel_info->is_feature_map()), cnode); | |||||
| AnfAlgo::SetNodeAttr(kIsFeatureMapInputList, MakeValue(feature_map_input_indexs), cnode); | |||||
| } | |||||
| // Setup kernel build info. | |||||
| std::vector<std::string> input_formats; | |||||
| std::vector<TypeId> input_types; | |||||
| for (size_t i = 1; i < inputs.size(); ++i) { | |||||
| auto kernel_with_index = AnfAlgo::VisitKernel(inputs[i], 0); | |||||
| auto input_format = AnfAlgo::GetOutputFormat(kernel_with_index.first, kernel_with_index.second); | |||||
| input_formats.push_back(input_format); | |||||
| auto input_type = AnfAlgo::GetOutputDeviceDataType(kernel_with_index.first, kernel_with_index.second); | |||||
| input_types.push_back(input_type); | |||||
| } | |||||
| std::vector<std::string> output_formats = {out_info.format}; | |||||
| std::vector<TypeId> output_types = {out_type->type_id()}; | |||||
| kernel::KernelBuildInfo::KernelBuildInfoBuilder info_builder; | |||||
| info_builder.SetInputsFormat(input_formats); | |||||
| info_builder.SetInputsDeviceType(input_types); | |||||
| info_builder.SetOutputsFormat(output_formats); | |||||
| info_builder.SetOutputsDeviceType(output_types); | |||||
| info_builder.SetProcessor(kernel::Processor::CUDA); | |||||
| info_builder.SetKernelType(KernelType::AKG_KERNEL); | |||||
| info_builder.SetFusionType(kernel::FusionType::OPAQUE); | |||||
| auto selected_info = info_builder.Build(); | |||||
| AnfAlgo::SetSelectKernelBuildInfo(selected_info, cnode.get()); | |||||
| func_graph->AddNode(cnode); | |||||
| return cnode; | |||||
| } | |||||
| } // namespace opt | } // namespace opt | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| @@ -27,6 +27,7 @@ | |||||
| #include "ir/anf.h" | #include "ir/anf.h" | ||||
| #include "ir/func_graph.h" | #include "ir/func_graph.h" | ||||
| #include "ir/primitive.h" | #include "ir/primitive.h" | ||||
| #include "backend/session/anf_runtime_algorithm.h" | |||||
| #include "backend/session/kernel_graph.h" | #include "backend/session/kernel_graph.h" | ||||
| #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h" | #include "backend/kernel_compiler/akg/akg_kernel_json_generator.h" | ||||
| #include <nlohmann/json.hpp> | #include <nlohmann/json.hpp> | ||||
| @@ -38,6 +39,8 @@ inline const PrimitivePtr kPrimGkDropout = std::make_shared<Primitive>("GkDropou | |||||
| namespace opt { | namespace opt { | ||||
| using kernel::DumpOption; | using kernel::DumpOption; | ||||
| constexpr auto kIsFeatureMapOutput = "IsFeatureMapOutput"; | |||||
| constexpr auto kIsFeatureMapInputList = "IsFeatureMapInputList"; | |||||
| constexpr auto kGraphKernelModule = "mindspore._extends.graph_kernel"; | constexpr auto kGraphKernelModule = "mindspore._extends.graph_kernel"; | ||||
| constexpr auto kGraphKernelSplitFunc = "split_with_json"; | constexpr auto kGraphKernelSplitFunc = "split_with_json"; | ||||
| constexpr auto kGetGraphKernelOpExpander = "get_op_expander"; | constexpr auto kGetGraphKernelOpExpander = "get_op_expander"; | ||||
| @@ -45,6 +48,12 @@ constexpr auto kJsonKeyMultiGraph = "multi_graph"; | |||||
| constexpr auto kJsonKeyGraphDesc = "graph_desc"; | constexpr auto kJsonKeyGraphDesc = "graph_desc"; | ||||
| constexpr auto kJsonKeyGraphMode = "graph_mode"; | constexpr auto kJsonKeyGraphMode = "graph_mode"; | ||||
| struct DataInfo { | |||||
| std::string format{kOpFormat_DEFAULT}; | |||||
| ShapeVector shape{1}; | |||||
| TypePtr type{nullptr}; | |||||
| }; | |||||
| bool ConvertNonscalarTensorToParameter(const FuncGraphPtr &fg, AnfNodePtrList *inputs_ptr); | bool ConvertNonscalarTensorToParameter(const FuncGraphPtr &fg, AnfNodePtrList *inputs_ptr); | ||||
| std::tuple<FuncGraphPtr, AnfNodePtrList, AnfNodePtrList> MixedNodesTransToGraph(const AnfNodePtrList &fuse_nodes, | std::tuple<FuncGraphPtr, AnfNodePtrList, AnfNodePtrList> MixedNodesTransToGraph(const AnfNodePtrList &fuse_nodes, | ||||
| AnfNodePtrList *src_outputs = nullptr); | AnfNodePtrList *src_outputs = nullptr); | ||||
| @@ -74,6 +83,49 @@ void UpdateControlDependNode(std::multimap<AnfNodePtr, std::pair<AnfNodePtr, Anf | |||||
| const AnfNodePtr &control_depend_node, const AnfNodePtr &new_control_depend); | const AnfNodePtr &control_depend_node, const AnfNodePtr &new_control_depend); | ||||
| void ReplaceNewFuseCNodeForDependPrior(std::multimap<AnfNodePtr, std::pair<AnfNodePtr, AnfNodePtr>> *depend_prior, | void ReplaceNewFuseCNodeForDependPrior(std::multimap<AnfNodePtr, std::pair<AnfNodePtr, AnfNodePtr>> *depend_prior, | ||||
| const AnfNodePtr &new_fuse_cnode, const AnfNodePtrList &outputs); | const AnfNodePtr &new_fuse_cnode, const AnfNodePtrList &outputs); | ||||
| std::string GetFormat(const AnfNodePtr &node); | |||||
| TypePtr GetType(const AnfNodePtr &node); | |||||
| ShapeVector GetShape(const AnfNodePtr &node); | |||||
| std::vector<int64_t> GetReduceAxis(const AnfNodePtr &node); | |||||
| CNodePtr CreateCNode(const std::vector<AnfNodePtr> &inputs, const FuncGraphPtr &func_graph, const DataInfo &out_info); | |||||
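| // Illustrative usage (similar to the call in CreateAtomicCleanCompositeNode): create a float32 zero scalar value node. | |||||
| //   auto zero = CreateScalarTensorValueNode<float>({.format = kOpFormat_DEFAULT, .shape = {1}, .type = kFloat32}, 0.0f, sizeof(float)); | |||||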
| template <typename T> | |||||
| ValueNodePtr CreateScalarTensorValueNode(const DataInfo &info, T value, size_t data_length) { | |||||
| // Create tensor value. | |||||
| if (info.shape.size() != 1 || info.shape[0] != 1) { | |||||
| MS_LOG(EXCEPTION) << "Only support creating a scalar tensor value node!"; | |||||
| } | |||||
| if (info.type == nullptr) { | |||||
| MS_LOG(EXCEPTION) << "Data type is needed!"; | |||||
| } | |||||
| tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(info.type->type_id(), info.shape); | |||||
| MS_EXCEPTION_IF_NULL(tensor); | |||||
| tensor::DeviceInfo device_info{info.format, info.type}; | |||||
| tensor->set_device_info(device_info); | |||||
| auto data_ptr = tensor->data_c(); | |||||
| MS_EXCEPTION_IF_NULL(data_ptr); | |||||
| auto ret_code = memcpy_s(data_ptr, static_cast<size_t>(tensor->data().nbytes()), &value, data_length); | |||||
| if (ret_code != 0) { | |||||
| MS_LOG(EXCEPTION) << "Failed to copy data into scalar tensor."; | |||||
| } | |||||
| // Create value node. | |||||
| ValueNodePtr new_value_node = std::make_shared<ValueNode>(tensor); | |||||
| new_value_node->set_abstract(tensor->ToAbstract()); | |||||
| auto kernel_info = std::make_shared<device::KernelInfo>(); | |||||
| new_value_node->set_kernel_info(kernel_info); | |||||
| auto kernel_build_info_builder = std::make_shared<kernel::KernelBuildInfo::KernelBuildInfoBuilder>(); | |||||
| kernel_build_info_builder->SetOutputsFormat(std::vector<std::string>{info.format}); | |||||
| std::vector<TypeId> types = {info.type->type_id()}; | |||||
| kernel_build_info_builder->SetOutputsDeviceType(types); | |||||
| AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info_builder->Build(), new_value_node.get()); | |||||
| return new_value_node; | |||||
| } | |||||
| } // namespace opt | } // namespace opt | ||||
| } // namespace mindspore | } // namespace mindspore | ||||
| #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_ | #endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_GRAPH_KERNEL_HELPER_H_ | ||||
| @@ -35,6 +35,7 @@ | |||||
| #include "backend/optimizer/gpu/remove_format_transform_pair.h" | #include "backend/optimizer/gpu/remove_format_transform_pair.h" | ||||
| #include "backend/optimizer/gpu/remove_redundant_format_transform.h" | #include "backend/optimizer/gpu/remove_redundant_format_transform.h" | ||||
| #include "backend/optimizer/gpu/reduce_precision_fusion.h" | #include "backend/optimizer/gpu/reduce_precision_fusion.h" | ||||
| #include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h" | |||||
| #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" | #include "backend/optimizer/graph_kernel/arithmetic_simplify.h" | ||||
| #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" | #include "backend/optimizer/graph_kernel/basic_ops_fusion.h" | ||||
| #include "backend/optimizer/graph_kernel/composite_ops_fusion.h" | #include "backend/optimizer/graph_kernel/composite_ops_fusion.h" | ||||
| @@ -176,6 +177,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_ | |||||
| // After Simplify and Splitter, a lot of redundant getitem/maketuple | // After Simplify and Splitter, a lot of redundant getitem/maketuple | ||||
| // will be exposed, use GetitemTuple Pass to delete them. | // will be exposed, use GetitemTuple Pass to delete them. | ||||
| pm->AddPass(std::make_shared<opt::GetitemTuple>()); | pm->AddPass(std::make_shared<opt::GetitemTuple>()); | ||||
| pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>()); | |||||
| pm->AddPass(std::make_shared<opt::BindValueToGraph>()); | pm->AddPass(std::make_shared<opt::BindValueToGraph>()); | ||||
| optimizer->AddPassManager(pm); | optimizer->AddPassManager(pm); | ||||
| (void)optimizer->Optimize(kernel_graph); | (void)optimizer->Optimize(kernel_graph); | ||||
| @@ -0,0 +1,124 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| import numpy as np | |||||
| import pytest | |||||
| import mindspore.context as context | |||||
| from mindspore import Tensor | |||||
| from mindspore.nn import Cell | |||||
| import mindspore.ops.operations as P | |||||
| class SumOutNet(Cell): | |||||
| def __init__(self): | |||||
| super(SumOutNet, self).__init__() | |||||
| self.square = P.Square() | |||||
| self.sum = P.ReduceSum() | |||||
| def construct(self, x): | |||||
| mul_res = self.square(x) | |||||
| return self.sum(mul_res, (0,)) | |||||
| class SingleOutNet(Cell): | |||||
| def __init__(self): | |||||
| super(SingleOutNet, self).__init__() | |||||
| self.add = P.TensorAdd() | |||||
| self.mul = P.Mul() | |||||
| self.sum = P.ReduceSum() | |||||
| def construct(self, x, y): | |||||
| mul_res = self.mul(x, y) | |||||
| sum_res = self.sum(mul_res, ()) | |||||
| return self.add(sum_res, x) | |||||
| class MultiOutNet(Cell): | |||||
| def __init__(self): | |||||
| super(MultiOutNet, self).__init__() | |||||
| self.add = P.TensorAdd() | |||||
| self.mul = P.Mul() | |||||
| self.sum = P.ReduceSum() | |||||
| def construct(self, x, y): | |||||
| add_res = self.add(x, y) | |||||
| mul_res = self.mul(add_res, add_res) | |||||
| sum_res = self.sum(mul_res, ()) | |||||
| return self.add(add_res, sum_res) | |||||
| def atomic_add_sum_output(): | |||||
| np.random.seed(0) | |||||
| input_x = np.random.normal(0, 1, [2, 3, 4, 3]).astype(np.float32) | |||||
| expect = np.sum(np.square(input_x), axis=(0,)) | |||||
| net = SumOutNet() | |||||
| result = net(Tensor(input_x)) | |||||
| res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) | |||||
| assert res | |||||
| def atomic_add_single_output(): | |||||
| np.random.seed(0) | |||||
| input_x = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) | |||||
| input_y = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) | |||||
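| # the all-axes ReduceSum yields a scalar, which broadcasts against input_x in the final add | |||||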
| expect = np.sum(input_x * input_y) + input_x | |||||
| net = SingleOutNet() | |||||
| result = net(Tensor(input_x), Tensor(input_y)) | |||||
| res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) | |||||
| assert res | |||||
| def atomic_add_multi_output(): | |||||
| np.random.seed(0) | |||||
| input_x = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) | |||||
| input_y = np.random.normal(0, 1, [2, 2, 2, 256]).astype(np.float32) | |||||
| expect = np.sum(np.square(input_x + input_y)) + (input_x + input_y) | |||||
| net = MultiOutNet() | |||||
| result = net(Tensor(input_x), Tensor(input_y)) | |||||
| res = np.allclose(expect, result.asnumpy(), rtol=1.e-4, atol=1.e-7, equal_nan=True) | |||||
| assert res | |||||
| @pytest.mark.level0 | |||||
| @pytest.mark.platform_x86_gpu_training | |||||
| @pytest.mark.env_onecard | |||||
| def test_atomic_add_sum_output_gpu(): | |||||
| context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") | |||||
| atomic_add_sum_output() | |||||
| @pytest.mark.level0 | |||||
| @pytest.mark.platform_x86_gpu_training | |||||
| @pytest.mark.env_onecard | |||||
| def test_atomic_add_single_output_gpu(): | |||||
| context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") | |||||
| atomic_add_single_output() | |||||
| @pytest.mark.level0 | |||||
| @pytest.mark.platform_x86_gpu_training | |||||
| @pytest.mark.env_onecard | |||||
| def test_atomic_add_multi_output_gpu(): | |||||
| context.set_context(mode=context.GRAPH_MODE, enable_graph_kernel=True, device_target="GPU") | |||||
| atomic_add_multi_output() | |||||