!7256 GPU add reduce precision

Merge pull request !7256 from VectorSL/reduceprecision
5 years ago · 4424873e7e
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc
@@ -74,6 +74,38 @@ std::string GpuKernelFactory::SupportedTypeList(const std::string &kernel_name)
  return type_lists;
 }

 // Retries kernel matching for `kernel_name` after lowering integer precision.
 //
 // For every kernel attribute registered under `kernel_name`, each input/output whose device type in
 // `builder` is int64 while the registered attribute expects int32 is rewritten to int32 in `builder`.
 // The indices of the rewritten inputs are recorded in `reduce_flag_.first` (cleared on every call).
 //
 // Returns false when `builder` is null or the kernel is not registered; otherwise returns the result of
 // SearchRegistered() on the (possibly modified) build info.
 bool GpuKernelFactory::ReducePrecision(
  const std::string &kernel_name, std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder) {
  if (builder == nullptr) {
    MS_LOG(WARNING) << "Kernel [" << kernel_name << "] has no build info builder, skip precision reduction.";
    return false;
  }
  auto kernel_info = builder->Build();
  auto iter = map_kernel_name_to_creater_.find(kernel_name);
  if (map_kernel_name_to_creater_.end() == iter) {
    MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!";
    return false;
  }
  reduce_flag_.first.clear();
  for (const auto &registered : iter->second) {
    const auto &attr = registered.first;
    // Inputs and outputs are wrapped with their OWN attribute count: a registered attribute may describe
    // fewer entries than the node actually has, and the two counts are independent of each other.
    const size_t input_attr_size = attr.GetInputSize();
    const size_t output_attr_size = attr.GetOutputSize();
    // A zero count would make the modulo below undefined, so such a direction is skipped entirely.
    if (input_attr_size != 0) {
      for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) {
        if (kernel_info->GetInputDeviceType(input_index) == kNumberTypeInt64 &&
            attr.GetInputAttr(input_index % input_attr_size).first == kNumberTypeInt32) {
          builder->SetInputDeviceType(kNumberTypeInt32, input_index);
          reduce_flag_.first.push_back(input_index);
          MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast input " << input_index
                          << " to int32.";
        }
      }
    }
    if (output_attr_size != 0) {
      for (size_t output_index = 0; output_index < kernel_info->GetOutputNum(); output_index++) {
        if (kernel_info->GetOutputDeviceType(output_index) == kNumberTypeInt64 &&
            attr.GetOutputAttr(output_index % output_attr_size).first == kNumberTypeInt32) {
          builder->SetOutputDeviceType(kNumberTypeInt32, output_index);
          MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast output " << output_index
                          << " to int32.";
        }
      }
    }
  }
  return GpuKernelFactory::SearchRegistered(kernel_name, builder->Build());
 }

 std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &kernel_name,
                                                             const KernelBuildInfo *kernel_info) {
  auto iter = map_kernel_name_to_creater_.find(kernel_name);
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
@@ -21,6 +21,7 @@
 #include <string>
 #include <vector>
 #include <utility>
 #include <memory>
 #include "backend/kernel_compiler/gpu/gpu_kernel.h"
 #include "runtime/device/gpu/kernel_info_setter.h"
 #include "backend/kernel_compiler/kernel_build_info.h"
@@ -43,6 +44,11 @@ class GpuKernelFactory {

  std::string SupportedTypeList(const std::string &kernel_name);

  // Fallback for kernel selection: rewrites int64 input/output device types in `builder` to int32 wherever
  // a kernel attribute registered under `kernel_name` expects int32, then re-runs SearchRegistered() on the
  // updated build info. Returns false if `kernel_name` is not registered, otherwise the search result.
  // Side effect: resets reduce_flag_.first and fills it with the indices of the inputs that were lowered.
  bool ReducePrecision(const std::string &kernel_name,
                       std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder);

  // Bookkeeping written by ReducePrecision().
  //   first  - indices of the inputs whose device type was lowered from int64 to int32 by the latest call.
  //   second - initialised to kNumberTypeInt64.
  //            NOTE(review): presumably the original type of the lowered inputs; confirm against the readers
  //            of this member.
  std::pair<std::vector<size_t>, TypeId> reduce_flag_{{}, kNumberTypeInt64};

 private:
  GpuKernelFactory() = default;

--- a/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.cc
@@ -0,0 +1,85 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #include "backend/optimizer/gpu/reduce_precision_fusion.h"

 #include <memory>
 #include <vector>
 #include <string>

 #include "backend/session/anf_runtime_algorithm.h"
 #include "ir/primitive.h"
 #include "utils/utils.h"
 #include "backend/optimizer/common/helper.h"

 namespace mindspore {
 namespace opt {
 namespace {
 // Inserts a Cast node in front of tensor input `i` of `node`.
 //
 // The new Cast takes the current producer of input `i` as its only data input, is given `cast_type` as
 // its inferred output type together with the device shape of that input, and replaces the producer on
 // `node`. Its kernel build info is set to default formats, `src_type` -> `cast_type`, AKG kernel, CUDA
 // processor.
 //
 // graph     - graph that owns `node`; must have a manager attached.
 // node      - consumer node; must be a CNode.
 // i         - tensor input index of `node` (not counting the primitive stored at CNode input 0).
 // src_type  - device type fed into the Cast.
 // cast_type - device type produced by the Cast.
 void ReducePrecision(const FuncGraphPtr &graph, const AnfNodePtr &node, size_t i, const TypeId &src_type,
                     const TypeId &cast_type) {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
  auto cnode = utils::cast<CNodePtr>(node);
  MS_EXCEPTION_IF_NULL(cnode);
  auto prim = std::make_shared<Primitive>(prim::kPrimCast->name());
  MS_EXCEPTION_IF_NULL(prim);
  std::vector<AnfNodePtr> inputs = {NewValueNode(prim), AnfAlgo::GetInputNode(cnode, i)};
  auto cast = graph->NewCNode(inputs);
  MS_EXCEPTION_IF_NULL(cast);
  auto cast_shape = {AnfAlgo::GetInputDeviceShape(node, i)};
  AnfAlgo::SetOutputInferTypeAndShape({cast_type}, cast_shape, cast.get());
  FuncGraphManagerPtr manager = graph->manager();
  MS_EXCEPTION_IF_NULL(manager);
  // CNode input 0 holds the primitive, so tensor input `i` lives at edge `i + 1`.
  manager->SetEdge(node, i + 1, cast);
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  builder.SetInputsFormat({kOpFormat_DEFAULT});
  builder.SetOutputsFormat({kOpFormat_DEFAULT});
  builder.SetInputsDeviceType({src_type});
  builder.SetOutputsDeviceType({cast_type});
  builder.SetKernelType(AKG_KERNEL);
  builder.SetProcessor(kernel::Processor::CUDA);
  AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get());
 }

 }  // namespace
 // Reconciles int64 inferred types with int32 device types on every real kernel of `graph`.
 //
 // Nodes are visited in the order returned by TopoSort() from the graph's return node. For each real kernel:
 //   * an input whose producer infers int64 while the kernel's device input type is int32 gets a Cast
 //     (int64 -> int32) inserted in front of it;
 //   * an output inferred as int64 but produced as int32 on device gets a Cast (int32 -> int64) inserted
 //     in front of every real user of that output.
 // Always returns true.
 bool ReducePrecisionFusion::Run(const FuncGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
  for (const auto &node : node_list) {
    if (node == nullptr || !node->isa<CNode>() || !AnfAlgo::IsRealKernel(node)) {
      continue;
    }
    size_t input_num = AnfAlgo::GetInputTensorNum(node);
    size_t output_num = AnfAlgo::GetOutputTensorNum(node);
    for (size_t i = 0; i < input_num; i++) {
      auto infer_type = AnfAlgo::GetPrevNodeOutputInferDataType(node, i);
      auto device_type = AnfAlgo::GetInputDeviceDataType(node, i);
      if (infer_type == kNumberTypeInt64 && device_type == kNumberTypeInt32) {
        ReducePrecision(graph, node, i, infer_type, device_type);
        MS_LOG(WARNING) << "Reduce precision for [" << AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(node))
                        << "] input " << i;
      }
    }
    for (size_t i = 0; i < output_num; i++) {
      auto infer_type = AnfAlgo::GetOutputInferDataType(node, i);
      auto device_type = AnfAlgo::GetOutputDeviceDataType(node, i);
      if (infer_type != kNumberTypeInt64 || device_type != kNumberTypeInt32) {
        continue;
      }
      auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i);
      MS_EXCEPTION_IF_NULL(used_node_list);
      for (size_t j = 0; j < used_node_list->size(); j++) {
        auto used_node = used_node_list->at(j).first;
        // The recorded index is decremented by one before being handed to ReducePrecision(), which
        // expects a tensor input index.
        auto used_node_index = used_node_list->at(j).second - 1;
        ReducePrecision(graph, used_node, used_node_index, device_type, infer_type);
      }
    }
  }
  return true;
 }
 }  // namespace opt
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h
+++ b/mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h
@@ -0,0 +1,34 @@
 /**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
 #ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
 #define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_

 #include <memory>
 #include <string>
 #include <vector>
 #include "backend/optimizer/common/optimizer.h"

 namespace mindspore {
 namespace opt {
 // Optimizer pass whose Run() inserts Cast nodes around real kernels whose inferred data type is int64
 // while the selected device data type is int32.
 class ReducePrecisionFusion : public Pass {
 public:
  // `name` is forwarded to the Pass base class; it defaults to "reduce_precision", the name that was
  // previously hard-coded regardless of the argument.
  explicit ReducePrecisionFusion(const std::string &name = "reduce_precision") : Pass(name) {}
  ~ReducePrecisionFusion() override = default;
  // Applies the pass to `graph`; see the definition for details.
  bool Run(const FuncGraphPtr &graph) override;
 };
 }  // namespace opt
 }  // namespace mindspore
 #endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -38,6 +38,7 @@
 #include "backend/optimizer/gpu/remove_format_transform_pair.h"
 #include "backend/optimizer/gpu/remove_redundant_format_transform.h"
 #include "backend/optimizer/gpu/cudnn_inplace_fusion.h"
 #include "backend/optimizer/gpu/reduce_precision_fusion.h"
 #include "backend/optimizer/graph_kernel/value_graph_binder.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
 #include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
@@ -101,6 +102,7 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_gra
  pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>());
  pm->AddPass(std::make_shared<opt::AllReduceFusion>());
  pm->AddPass(std::make_shared<opt::GetitemTuple>());
  pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision"));
  optimizer->AddPassManager(pm);
  (void)optimizer->Optimize(kernel_graph);
  kernel_graph->SetExecOrderByDefault();
--- a/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc
+++ b/mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc
@@ -371,6 +371,9 @@ void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) {
  if (kernel_type == UNKNOWN_KERNEL_TYPE) {
    result =
      kernel::GpuKernelFactory::GetInstance().SearchRegistered(AnfAlgo::GetCNodeName(kernel_node), builder->Build());
    if (!result) {
      result = kernel::GpuKernelFactory::GetInstance().ReducePrecision(AnfAlgo::GetCNodeName(kernel_node), builder);
    }
    if (!result) {
      result = SelectAkgKernel(kernel_node, builder->Build());
      kernel_type = AKG_KERNEL;