Merge pull request !7256 from VectorSL/reduceprecision
gpu_kernel_factory.cc:
@@ -74,6 +74,38 @@ std::string GpuKernelFactory::SupportedTypeList(const std::string &kernel_name)
   return type_lists;
 }
 
+bool GpuKernelFactory::ReducePrecision(
+  const std::string &kernel_name, std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder) {
+  auto kernel_info = builder->Build();
+  auto iter = map_kernel_name_to_creater_.find(kernel_name);
+  if (map_kernel_name_to_creater_.end() == iter) {
+    MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!";
+    return false;
+  }
+  reduce_flag_.first.clear();
+  for (size_t attr_index = 0; attr_index < (iter->second).size(); ++attr_index) {
+    auto attr_size = (iter->second).at(attr_index).first.GetInputSize();
+    for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) {
+      if (kernel_info->GetInputDeviceType(input_index) == kNumberTypeInt64 &&
+          (iter->second)[attr_index].first.GetInputAttr(input_index % attr_size).first == kNumberTypeInt32) {
+        builder->SetInputDeviceType(kNumberTypeInt32, input_index);
+        reduce_flag_.first.push_back(input_index);
+        MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast input " << input_index
+                        << " to int32.";
+      }
+    }
+    for (size_t output_index = 0; output_index < kernel_info->GetOutputNum(); output_index++) {
+      if (kernel_info->GetOutputDeviceType(output_index) == kNumberTypeInt64 &&
+          (iter->second)[attr_index].first.GetOutputAttr(output_index % attr_size).first == kNumberTypeInt32) {
+        builder->SetOutputDeviceType(kNumberTypeInt32, output_index);
+        MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast output " << output_index
+                        << " to int32.";
+      }
+    }
+  }
+  return GpuKernelFactory::SearchRegistered(kernel_name, builder->Build());
+}
+
 std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &kernel_name,
                                                              const KernelBuildInfo *kernel_info) {
   auto iter = map_kernel_name_to_creater_.find(kernel_name);
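The subtle part of this hunk is the `input_index % attr_size` wrap: a kernel registered with a single "all same" attr entry still matches ops of any input arity, because every requested input index folds back onto that one entry. Below is a minimal standalone sketch of the matching-and-downgrade loop; the enum and vectors are hypothetical stand-ins, not the real KernelAttr/KernelBuildInfoBuilder API.

#include <cstdio>
#include <vector>

enum TypeId { kNumberTypeInt32, kNumberTypeInt64 };

int main() {
  // A kernel registered with one attr entry (attr_size == 1) that accepts int32.
  std::vector<TypeId> registered_inputs = {kNumberTypeInt32};
  // A caller requests three int64 inputs.
  std::vector<TypeId> requested = {kNumberTypeInt64, kNumberTypeInt64, kNumberTypeInt64};

  size_t attr_size = registered_inputs.size();
  for (size_t i = 0; i < requested.size(); ++i) {
    // Wrap the index so every requested input is checked against entry 0.
    if (requested[i] == kNumberTypeInt64 &&
        registered_inputs[i % attr_size] == kNumberTypeInt32) {
      requested[i] = kNumberTypeInt32;  // reduce precision: int64 -> int32
      std::printf("cast input %zu to int32\n", i);
    }
  }
  return 0;
}

Running it prints the downgrade for all three inputs, mirroring the WARNING logs the factory emits before retrying SearchRegistered with the rewritten build info.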
gpu_kernel_factory.h:
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include <utility>
+#include <memory>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "runtime/device/gpu/kernel_info_setter.h"
 #include "backend/kernel_compiler/kernel_build_info.h"
@@ -43,6 +44,11 @@ class GpuKernelFactory {
   std::string SupportedTypeList(const std::string &kernel_name);
 
+  bool ReducePrecision(const std::string &kernel_name,
+                       std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder);
+
+  std::pair<std::vector<size_t>, TypeId> reduce_flag_{{}, kNumberTypeInt64};
+
  private:
   GpuKernelFactory() = default;
reduce_precision_fusion.cc (new file):
@@ -0,0 +1,85 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "backend/optimizer/gpu/reduce_precision_fusion.h"
+#include <memory>
+#include <vector>
+#include <string>
+#include "backend/session/anf_runtime_algorithm.h"
+#include "ir/primitive.h"
+#include "utils/utils.h"
+#include "backend/optimizer/common/helper.h"
+
+namespace mindspore {
+namespace opt {
+namespace {
+void ReducePrecision(const FuncGraphPtr &graph, const AnfNodePtr &node, size_t i, const TypeId &src_type,
+                     const TypeId &cast_type) {
+  auto prim = std::make_shared<Primitive>(prim::kPrimCast->name());
+  MS_EXCEPTION_IF_NULL(prim);
+  std::vector<AnfNodePtr> inputs = {NewValueNode(prim), AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), i)};
+  auto cast = graph->NewCNode(inputs);
+  auto cast_shape = {AnfAlgo::GetInputDeviceShape(node, i)};
+  AnfAlgo::SetOutputInferTypeAndShape({cast_type}, cast_shape, cast.get());
+  FuncGraphManagerPtr manager = graph->manager();
+  MS_EXCEPTION_IF_NULL(manager);
+  manager->SetEdge(node, i + 1, cast);
+
+  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
+  builder.SetInputsFormat({kOpFormat_DEFAULT});
+  builder.SetOutputsFormat({kOpFormat_DEFAULT});
+  builder.SetInputsDeviceType({src_type});
+  builder.SetOutputsDeviceType({cast_type});
+  builder.SetKernelType(AKG_KERNEL);
+  builder.SetProcessor(kernel::Processor::CUDA);
+  AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get());
+}
+}  // namespace
+
+bool ReducePrecisionFusion::Run(const FuncGraphPtr &graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
+  for (auto node : node_list) {
+    if (node != nullptr && node->isa<CNode>() && AnfAlgo::IsRealKernel(node)) {
+      size_t input_num = AnfAlgo::GetInputTensorNum(node);
+      size_t output_num = AnfAlgo::GetOutputTensorNum(node);
+      for (size_t i = 0; i < input_num; i++) {
+        auto inferType = AnfAlgo::GetPrevNodeOutputInferDataType(node, i);
+        auto deviceType = AnfAlgo::GetInputDeviceDataType(node, i);
+        if (inferType == kNumberTypeInt64 && deviceType == kNumberTypeInt32) {
+          ReducePrecision(graph, node, i, inferType, deviceType);
+          MS_LOG(WARNING) << "Reduce precision for [" << AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(node))
+                          << "] input " << i;
+        }
+      }
+      for (size_t i = 0; i < output_num; i++) {
+        auto inferType = AnfAlgo::GetOutputInferDataType(node, i);
+        auto deviceType = AnfAlgo::GetOutputDeviceDataType(node, i);
+        if (inferType == kNumberTypeInt64 && deviceType == kNumberTypeInt32) {
+          auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i);
+          for (size_t j = 0; j < used_node_list->size(); j++) {
+            auto used_node = used_node_list->at(j).first;
+            auto used_node_index = used_node_list->at(j).second - 1;
+            ReducePrecision(graph, used_node, used_node_index, deviceType, inferType);
+          }
+        }
+      }
+    }
+  }
+  return true;
+}
+}  // namespace opt
+}  // namespace mindspore
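The helper in this file rewires one graph edge: it builds a Cast CNode pointed at the old producer, then calls `manager->SetEdge(node, i + 1, cast)` so the consumer reads through the cast (the `+ 1` skips the primitive in input slot 0 of a CNode). A toy sketch of that splice on a hand-rolled node type, assuming nothing from MindSpore:

#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct Node {
  std::string name;
  std::vector<std::shared_ptr<Node>> inputs;
};

// Splice a new "Cast" node onto input slot `i` of `consumer`.
void InsertCast(const std::shared_ptr<Node> &consumer, size_t i) {
  auto cast = std::make_shared<Node>();
  cast->name = "Cast";
  cast->inputs = {consumer->inputs[i]};  // the cast consumes the old producer
  consumer->inputs[i] = cast;            // the consumer now reads through the cast
}

int main() {
  auto producer = std::make_shared<Node>(Node{"Producer", {}});
  auto consumer = std::make_shared<Node>(Node{"Consumer", {producer}});
  InsertCast(consumer, 0);
  // Prints: Consumer <- Cast <- Producer
  std::printf("%s <- %s <- %s\n", consumer->name.c_str(),
              consumer->inputs[0]->name.c_str(),
              consumer->inputs[0]->inputs[0]->name.c_str());
  return 0;
}

The pass runs the splice in both directions: on inputs it casts int64 down to the int32 the kernel was given, and on outputs it casts the kernel's int32 result back up to int64 for every real consumer.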
reduce_precision_fusion.h (new file):
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "backend/optimizer/common/optimizer.h"
+
+namespace mindspore {
+namespace opt {
+class ReducePrecisionFusion : public Pass {
+ public:
+  explicit ReducePrecisionFusion(const std::string &name) : Pass("reduce_precision") {}
+  ~ReducePrecisionFusion() override = default;
+  bool Run(const FuncGraphPtr &graph) override;
+};
+}  // namespace opt
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
gpu_session.cc:
@@ -38,6 +38,7 @@
 #include "backend/optimizer/gpu/remove_format_transform_pair.h"
 #include "backend/optimizer/gpu/remove_redundant_format_transform.h"
 #include "backend/optimizer/gpu/cudnn_inplace_fusion.h"
+#include "backend/optimizer/gpu/reduce_precision_fusion.h"
 #include "backend/optimizer/graph_kernel/value_graph_binder.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
@@ -101,6 +102,7 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
   pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>());
   pm->AddPass(std::make_shared<opt::AllReduceFusion>());
   pm->AddPass(std::make_shared<opt::GetitemTuple>());
+  pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision"));
   optimizer->AddPassManager(pm);
   (void)optimizer->Optimize(kernel_graph);
   kernel_graph->SetExecOrderByDefault();
kernel_info_setter.cc:
@@ -371,6 +371,9 @@ void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) {
   if (kernel_type == UNKNOWN_KERNEL_TYPE) {
     result =
       kernel::GpuKernelFactory::GetInstance().SearchRegistered(AnfAlgo::GetCNodeName(kernel_node), builder->Build());
+    if (!result) {
+      result = kernel::GpuKernelFactory::GetInstance().ReducePrecision(AnfAlgo::GetCNodeName(kernel_node), builder);
+    }
     if (!result) {
       result = SelectAkgKernel(kernel_node, builder->Build());
       kernel_type = AKG_KERNEL;
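Net effect of this last hunk on kernel selection: exact registry match first, then the new reduced-precision retry, and only then the pre-existing AKG fallback. A compressed sketch of that fallback chain, with hypothetical stand-ins for the three lookups rather than the real MindSpore functions:

#include <cstdio>

bool SearchRegistered() { return false; }  // no exact dtype match in the registry
bool ReducePrecision() { return true; }    // matches after downgrading int64 -> int32
bool SelectAkgKernel() { return true; }    // last-resort AKG codegen path

int main() {
  bool result = SearchRegistered();
  if (!result) result = ReducePrecision();  // new step introduced by this PR
  if (!result) result = SelectAkgKernel();  // unchanged pre-existing fallback
  std::printf("kernel selected: %s\n", result ? "yes" : "no");
  return 0;
}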