@@ -1 +1 @@
-Subproject commit f8f4e60bf3c435cec41cbe48fe24901277ef9556
+Subproject commit 72b359ad457ed8f4f254c8a3bd2bde88967202fb
@@ -558,7 +558,7 @@ bool AkgKernelJsonGenerator::CollectJson(const AnfNodePtr &anf_node, nlohmann::j
bool AkgKernelJsonGenerator::CollectFusedJson(const std::vector<AnfNodePtr> &anf_nodes,
                                              const std::vector<AnfNodePtr> &input_list,
                                              const std::vector<AnfNodePtr> &output_list, nlohmann::json *kernel_json) {
-  if (anf_nodes.empty() || input_list.empty()) {
+  if (anf_nodes.empty()) {
    MS_LOG(ERROR) << "Invalid input size, anf_nodes [" << anf_nodes.size() << "], input_list [" << input_list.size()
                  << "].";
    return false;
@@ -374,13 +374,10 @@ CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphP
  // Create composite op's sub-graph.
  auto new_sub_graph = std::make_shared<FuncGraph>();
-  auto parameter = new_sub_graph->add_parameter();
-  parameter->set_abstract(value_node->abstract());
-  parameter->set_kernel_info(value_node->kernel_info_ptr());
-  AnfNodePtr broadcast_input_node = parameter;
+  AnfNodePtr broadcast_input_node = value_node;
  if (dst_type == kNumberTypeFloat16) {
-    AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), parameter};
+    AnfNodePtrList cast_inputs = {NewValueNode(prim::kPrimCast), value_node};
    auto cast_node_inner =
      CreateCNode(cast_inputs, new_sub_graph, {.format = format, .shape = {1}, .type = TypeIdToType(dst_type)});
    AnfAlgo::SetNodeAttr("dst_type", MakeValue("float32"), cast_node_inner);
@@ -400,12 +397,13 @@ CNodePtr AtomicCleanInsertter::CreateAtomicCleanCompositeNode(const KernelGraphP
  // Makeup sub-graph.
  new_sub_graph->set_output(broadcast_to_node_inner);
-  auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph), value_node});
+  auto broadcast_to_composite_node = main_graph->NewCNode({NewValueNode(new_sub_graph)});
  broadcast_to_composite_node->set_abstract(broadcast_to_node_inner->abstract());
-  SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {value_node}, {broadcast_to_node_inner},
-                   kernel::Processor::CUDA);
+  SetNewKernelInfo(broadcast_to_composite_node, new_sub_graph, {}, {broadcast_to_node_inner},
+                   AnfAlgo::GetProcessor(atomic_add_node_));
  auto graph_attr = ExtractGraphKernelName(TopoSort(new_sub_graph->get_return()), "", "atomic_clean");
  new_sub_graph->set_attr(FUNC_GRAPH_ATTR_GRAPH_KERNEL, MakeValue(graph_attr));
+  new_sub_graph->set_attr("composite_type", MakeValue("atomic_clean"));
  return broadcast_to_composite_node;
}
@@ -0,0 +1,123 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/optimizer/graph_kernel/clean_all_in_once.h"

#include <algorithm>
#include <map>
#include <string>
#include <utility>
#include <vector>

#include "backend/session/anf_runtime_algorithm.h"
#include "backend/kernel_compiler/common_utils.h"
#include "backend/optimizer/graph_kernel/graph_kernel_helper.h"

namespace mindspore {
namespace opt {
namespace {
ShapeVector GetValidShape(const AnfNodePtr &node) {
  // The returned shape has its leading 1s stripped.
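  // For example, {1, 1, 32, 64} yields {32, 64}, while {32, 1, 64} is returned
  // unchanged because only the leading 1s are skipped.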
  auto shape = GetShape(node);
  ShapeVector valid_shape;
  bool valid = false;
  for (auto s : shape) {
    if (!valid && s == 1) {
      continue;
    }
    valid = true;
    valid_shape.push_back(s);
  }
  return valid_shape;
}
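
// Returns true when the composite node's sub-graph carries the
// "composite_type" == "atomic_clean" attribute that
// AtomicCleanInsertter::CreateAtomicCleanCompositeNode sets on the sub-graphs it builds.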
bool IsAtomicCleanNode(const AnfNodePtr &node) {
  MS_EXCEPTION_IF_NULL(node);
  auto cnode = node->cast<CNodePtr>();
  MS_EXCEPTION_IF_NULL(cnode);
  auto func_graph = GetValueNode<FuncGraphPtr>(cnode->input(kAnfPrimitiveIndex));
  MS_EXCEPTION_IF_NULL(func_graph);
  if (!func_graph->has_attr("composite_type")) {
    return false;
  }
  auto ctype_value = func_graph->get_attr("composite_type");
  if (!ctype_value->isa<StringImm>()) {
    MS_LOG(EXCEPTION) << "Attribute composite_type should be a string!";
  }
  auto ctype = GetValue<std::string>(ctype_value);
  return ctype == "atomic_clean";
}
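
// Splits `nodes` into consecutive chunks of at most `width` elements; for
// example, 23 nodes with width 10 are split into chunks of sizes 10, 10 and 3.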
std::vector<AnfNodePtrList> SplitVectorByWidth(const AnfNodePtrList &nodes, int width) {
  std::vector<AnfNodePtrList> splitted_nodes;
  if (nodes.empty()) {
    return splitted_nodes;
  }
  int num = (nodes.size() - 1) / width + 1;
  splitted_nodes.resize(num);
  for (size_t i = 0; i < nodes.size(); ++i) {
    splitted_nodes[i / width].push_back(nodes[i]);
  }
  return splitted_nodes;
}
}  // namespace

bool CleanAllInOnce::Run(const FuncGraphPtr &func_graph) {
  MS_EXCEPTION_IF_NULL(func_graph);
  auto mng = func_graph->manager();
  if (mng == nullptr) {
    mng = Manage(func_graph, true);
    func_graph->set_manager(mng);
  }

  auto todos = TopoSort(func_graph->get_return());
  std::map<ShapeVector, AnfNodePtrList> clean_map;
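  // Group atomic-clean composite nodes by their leading-1-stripped shape so
  // that composites cleaning same-shaped buffers can be fused together below.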
  std::for_each(todos.cbegin(), todos.cend(), [&clean_map](const AnfNodePtr &node) {
    if (AnfAlgo::IsGraphKernel(node) && IsAtomicCleanNode(node)) {
      auto valid_shape = GetValidShape(node);
      auto iter = clean_map.find(valid_shape);
      if (iter != clean_map.end()) {
        iter->second.push_back(node);
      } else {
        clean_map.insert({valid_shape, {node}});
      }
    }
  });

  bool changed = false;
  if (!clean_map.empty()) {
    for (auto iter : clean_map) {
      // Fusing every clean into a single kernel is not ideal, so batch at most ten at a time.
      auto splitted_nodes = SplitVectorByWidth(iter.second, 10);
      for (auto &snodes : splitted_nodes) {
        if (snodes.size() < 2) {
          continue;
        }
        AnfNodePtr clean_all_node;
        std::tie(clean_all_node, std::ignore) = FuseNodesToSubGraph(snodes, func_graph, "clean_all");
        MS_LOG(INFO) << "Add node to clean batch buffers in once(" << clean_all_node->fullname_with_scope()
                     << ") for atomic add!";
        changed = true;
      }
    }
  }

  if (changed) {
    mng->RemoveRoots();
    mng->KeepRoots({func_graph});
  }

  return changed;
}
}  // namespace opt
}  // namespace mindspore
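
To make the batching strategy concrete, here is a minimal standalone sketch of the same group-by-shape, chunk-of-ten approach that CleanAllInOnce::Run applies, with plain STL types standing in for AnfNodePtr. The node names, shapes, and the SplitByWidth helper below are illustrative only and are not part of the patch.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

using Shape = std::vector<int64_t>;

// Same chunking as SplitVectorByWidth: consecutive groups of at most `width` elements.
std::vector<std::vector<std::string>> SplitByWidth(const std::vector<std::string> &nodes, size_t width) {
  std::vector<std::vector<std::string>> chunks;
  if (nodes.empty()) {
    return chunks;
  }
  chunks.resize((nodes.size() - 1) / width + 1);
  for (size_t i = 0; i < nodes.size(); ++i) {
    chunks[i / width].push_back(nodes[i]);
  }
  return chunks;
}

int main() {
  // Stand-ins for atomic-clean composites, grouped by their leading-1-stripped shape.
  std::map<Shape, std::vector<std::string>> clean_map = {
    {{32, 64}, {"clean_0", "clean_1", "clean_2", "clean_3"}},
    {{128}, {"clean_4"}},
  };
  for (const auto &entry : clean_map) {
    // Batch at most ten cleans per fused kernel; groups of one are left as they are.
    for (const auto &batch : SplitByWidth(entry.second, 10)) {
      if (batch.size() < 2) {
        continue;
      }
      std::cout << "fuse " << batch.size() << " atomic-clean composites into one kernel\n";
    }
  }
  return 0;
}

As in the pass itself, groups with fewer than two composites are skipped, since there is nothing to batch.
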
@@ -0,0 +1,34 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_

#include <map>
#include <memory>

#include "backend/optimizer/common/pass.h"
#include "ir/func_graph.h"

namespace mindspore {
namespace opt {
class CleanAllInOnce : public Pass {
 public:
  CleanAllInOnce() : Pass("clean_all_in_once") {}
  ~CleanAllInOnce() override = default;
  bool Run(const FuncGraphPtr &func_graph) override;
};
using CleanAllInOncePtr = std::shared_ptr<CleanAllInOnce>;
}  // namespace opt
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GRAPH_KERNEL_CLEAN_ALL_IN_ONCE_H_
@@ -38,6 +38,7 @@
#include "backend/optimizer/graph_kernel/add_atomic_clean_gpu.h"
#include "backend/optimizer/graph_kernel/arithmetic_simplify.h"
#include "backend/optimizer/graph_kernel/basic_ops_fusion.h"
+#include "backend/optimizer/graph_kernel/clean_all_in_once.h"
#include "backend/optimizer/graph_kernel/eliminate_redundant_output.h"
#include "backend/optimizer/graph_kernel/tensor_promotion.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
@@ -182,6 +183,7 @@ void GPUSession::GraphKernelOptimize(const std::shared_ptr<KernelGraph> &kernel_
  // will be exposed, use GetitemTuple Pass to delete them.
  pm->AddPass(std::make_shared<opt::GetitemTuple>());
  pm->AddPass(std::make_shared<opt::AtomicCleanInsertter>());
+  pm->AddPass(std::make_shared<opt::CleanAllInOnce>());
  pm->AddPass(std::make_shared<opt::BindValueToGraph>());
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);