diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc index a2750f3243..ee720c6e0a 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc @@ -74,6 +74,38 @@ std::string GpuKernelFactory::SupportedTypeList(const std::string &kernel_name) return type_lists; } +bool GpuKernelFactory::ReducePrecision( + const std::string &kernel_name, std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> builder) { + auto kernel_info = builder->Build(); + auto iter = map_kernel_name_to_creater_.find(kernel_name); + if (map_kernel_name_to_creater_.end() == iter) { + MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!"; + return false; + } + reduce_flag_.first.clear(); + for (size_t attr_index = 0; attr_index < (iter->second).size(); ++attr_index) { + auto attr_size = (&(iter->second))->at(attr_index).first.GetInputSize(); + for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) { + if (kernel_info->GetInputDeviceType(input_index) == kNumberTypeInt64 && + (iter->second)[attr_index].first.GetInputAttr(input_index % attr_size).first == kNumberTypeInt32) { + builder->SetInputDeviceType(kNumberTypeInt32, input_index); + reduce_flag_.first.push_back(input_index); + MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast input " << input_index + << " to int32."; + } + } + for (size_t output_index = 0; output_index < kernel_info->GetOutputNum(); output_index++) { + if (kernel_info->GetOutputDeviceType(output_index) == kNumberTypeInt64 && + (iter->second)[attr_index].first.GetOutputAttr(output_index % attr_size).first == kNumberTypeInt32) { + builder->SetOutputDeviceType(kNumberTypeInt32, output_index); + MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast output " << output_index + << " to int32."; + } + } + } + return 
GpuKernelFactory::SearchRegistered(kernel_name, builder->Build()); +} + std::pair<bool, size_t> GpuKernelAttrCheck(const std::string &kernel_name, const KernelBuildInfo *kernel_info) { auto iter = map_kernel_name_to_creater_.find(kernel_name); diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h index 967f143aa2..c4667c56c8 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h +++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h @@ -21,6 +21,7 @@ #include <string> #include <utility> #include <vector> +#include <memory> #include "backend/kernel_compiler/gpu/gpu_kernel.h" #include "runtime/device/gpu/kernel_info_setter.h" #include "backend/kernel_compiler/kernel_build_info.h" @@ -43,6 +44,11 @@ class GpuKernelFactory { std::string SupportedTypeList(const std::string &kernel_name); + bool ReducePrecision(const std::string &kernel_name, + std::shared_ptr<KernelBuildInfo::KernelBuildInfoBuilder> builder); + + std::pair<std::vector<size_t>, TypeId> reduce_flag_{{}, kNumberTypeInt64}; + private: GpuKernelFactory() = default;
+ */ +#include "backend/optimizer/gpu/reduce_precision_fusion.h" + +#include <memory> +#include <string> +#include <vector> + +#include "backend/session/anf_runtime_algorithm.h" +#include "ir/primitive.h" +#include "utils/utils.h" +#include "backend/optimizer/common/helper.h" + +namespace mindspore { +namespace opt { +namespace { +void ReducePrecision(const FuncGraphPtr &graph, const AnfNodePtr &node, size_t i, const TypeId &src_type, + const TypeId &cast_type) { + auto prim = std::make_shared<Primitive>(prim::kPrimCast->name()); + MS_EXCEPTION_IF_NULL(prim); + std::vector<AnfNodePtr> inputs = {NewValueNode(prim), AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), i)}; + auto cast = graph->NewCNode(inputs); + auto cast_shape = {AnfAlgo::GetInputDeviceShape(node, i)}; + AnfAlgo::SetOutputInferTypeAndShape({cast_type}, cast_shape, cast.get()); + FuncGraphManagerPtr manager = graph->manager(); + MS_EXCEPTION_IF_NULL(manager); + manager->SetEdge(node, i + 1, cast); + kernel::KernelBuildInfo::KernelBuildInfoBuilder builder; + builder.SetInputsFormat({kOpFormat_DEFAULT}); + builder.SetOutputsFormat({kOpFormat_DEFAULT}); + builder.SetInputsDeviceType({src_type}); + builder.SetOutputsDeviceType({cast_type}); + builder.SetKernelType(AKG_KERNEL); + builder.SetProcessor(kernel::Processor::CUDA); + AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get()); +} + +} // namespace +bool ReducePrecisionFusion::Run(const FuncGraphPtr &graph) { + MS_EXCEPTION_IF_NULL(graph); + std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return()); + for (auto node : node_list) { + if (node != nullptr && node->isa<CNode>() && AnfAlgo::IsRealKernel(node)) { + size_t input_num = AnfAlgo::GetInputTensorNum(node); + size_t output_num = AnfAlgo::GetOutputTensorNum(node); + for (size_t i = 0; i < input_num; i++) { + auto inferType = AnfAlgo::GetPrevNodeOutputInferDataType(node, i); + auto deviceType = AnfAlgo::GetInputDeviceDataType(node, i); + if (inferType == kNumberTypeInt64 && deviceType == kNumberTypeInt32) { + ReducePrecision(graph, node, i, inferType, 
deviceType); + MS_LOG(WARNING) << "Reduce precision for [" << AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(node)) + << "] input " << i; + } + } + for (size_t i = 0; i < output_num; i++) { + auto inferType = AnfAlgo::GetOutputInferDataType(node, i); + auto deviceType = AnfAlgo::GetOutputDeviceDataType(node, i); + if (inferType == kNumberTypeInt64 && deviceType == kNumberTypeInt32) { + auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i); + for (size_t j = 0; j < used_node_list->size(); j++) { + auto used_node = used_node_list->at(j).first; + auto used_node_index = used_node_list->at(j).second - 1; + ReducePrecision(graph, used_node, used_node_index, deviceType, inferType); + } + } + } + } + } + return true; +} +} // namespace opt +} // namespace mindspore diff --git a/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h new file mode 100644 index 0000000000..fa4506bd65 --- /dev/null +++ b/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h @@ -0,0 +1,34 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_ +#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_ + +#include <memory> +#include <string> +#include <vector> +#include "backend/optimizer/common/optimizer.h" + +namespace mindspore { +namespace opt { +class ReducePrecisionFusion : public Pass { + public: + explicit ReducePrecisionFusion(const std::string &name) : Pass("reduce_precision") {} + ~ReducePrecisionFusion() override = default; + bool Run(const FuncGraphPtr &graph) override; +}; +} // namespace opt +} // namespace mindspore +#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_ diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc index 551561a234..9c62c1a5f6 100644 --- a/mindspore/ccsrc/backend/session/gpu_session.cc +++ b/mindspore/ccsrc/backend/session/gpu_session.cc @@ -38,6 +38,7 @@ #include "backend/optimizer/gpu/remove_format_transform_pair.h" #include "backend/optimizer/gpu/remove_redundant_format_transform.h" #include "backend/optimizer/gpu/cudnn_inplace_fusion.h" +#include "backend/optimizer/gpu/reduce_precision_fusion.h" #include "backend/optimizer/graph_kernel/value_graph_binder.h" #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h" #include "backend/optimizer/graph_kernel/graph_kernel_expander.h" @@ -101,6 +102,7 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_gra pm->AddPass(std::make_shared<opt::RemoveFormatTransformPair>()); pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>()); pm->AddPass(std::make_shared<opt::CudnnInplaceAggregate>()); + pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision")); optimizer->AddPassManager(pm); (void)optimizer->Optimize(kernel_graph); kernel_graph->SetExecOrderByDefault(); diff --git a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc index 8d73a33329..5fd88d166e 100644 --- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc +++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc @@ 
-371,6 +371,9 @@ void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) { if (kernel_type == UNKNOWN_KERNEL_TYPE) { result = kernel::GpuKernelFactory::GetInstance().SearchRegistered(AnfAlgo::GetCNodeName(kernel_node), builder->Build()); + if (!result) { + result = kernel::GpuKernelFactory::GetInstance().ReducePrecision(AnfAlgo::GetCNodeName(kernel_node), builder); + } if (!result) { result = SelectAkgKernel(kernel_node, builder->Build()); kernel_type = AKG_KERNEL;