Browse Source

!7256 GPU add reduce precision

Merge pull request !7256 from VectorSL/reduceprecision
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
4424873e7e
6 changed files with 162 additions and 0 deletions
  1. +32
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc
  2. +6
    -0
      mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h
  3. +85
    -0
      mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.cc
  4. +34
    -0
      mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h
  5. +2
    -0
      mindspore/ccsrc/backend/session/gpu_session.cc
  6. +3
    -0
      mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc

+ 32
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.cc View File

@@ -74,6 +74,38 @@ std::string GpuKernelFactory::SupportedTypeList(const std::string &kernel_name)
return type_lists;
}

// Try to make an unsupported kernel selectable by reducing int64 precision to int32.
// For every registered attr of `kernel_name`, any int64 input/output in `builder`
// whose registered counterpart is int32 is rewritten to int32. Indices of reduced
// inputs are recorded in reduce_flag_.first so callers can insert casts later.
// Returns true iff the rewritten build info matches a registered kernel.
bool GpuKernelFactory::ReducePrecision(
  const std::string &kernel_name, std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder) {
  auto kernel_info = builder->Build();
  auto iter = map_kernel_name_to_creater_.find(kernel_name);
  if (map_kernel_name_to_creater_.end() == iter) {
    MS_LOG(INFO) << "Not registered GPU kernel: op[" << kernel_name << "]!";
    return false;
  }
  reduce_flag_.first.clear();
  for (size_t attr_index = 0; attr_index < (iter->second).size(); ++attr_index) {
    const auto &registered_attr = (iter->second)[attr_index].first;
    const size_t attr_size = registered_attr.GetInputSize();
    // Guard: an attr with no declared inputs would make `% attr_size` a
    // modulo-by-zero, which is undefined behavior; such an attr cannot match.
    if (attr_size == 0) {
      continue;
    }
    for (size_t input_index = 0; input_index < kernel_info->GetInputNum(); input_index++) {
      // `% attr_size` folds variadic node inputs onto the (possibly shorter)
      // registered attr list.
      if (kernel_info->GetInputDeviceType(input_index) == kNumberTypeInt64 &&
          registered_attr.GetInputAttr(input_index % attr_size).first == kNumberTypeInt32) {
        builder->SetInputDeviceType(kNumberTypeInt32, input_index);
        reduce_flag_.first.push_back(input_index);
        MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast input " << input_index
                        << " to int32.";
      }
    }
    for (size_t output_index = 0; output_index < kernel_info->GetOutputNum(); output_index++) {
      // NOTE(review): outputs are folded with the *input* attr size, mirroring the
      // original code — confirm this is intended for attrs where input and output
      // counts differ.
      if (kernel_info->GetOutputDeviceType(output_index) == kNumberTypeInt64 &&
          registered_attr.GetOutputAttr(output_index % attr_size).first == kNumberTypeInt32) {
        builder->SetOutputDeviceType(kNumberTypeInt32, output_index);
        MS_LOG(WARNING) << "Kernel [" << kernel_name << "] does not support int64, cast output " << output_index
                        << " to int32.";
      }
    }
  }
  return GpuKernelFactory::SearchRegistered(kernel_name, builder->Build());
}

std::pair<bool, size_t> GpuKernelFactory::GpuKernelAttrCheck(const std::string &kernel_name,
const KernelBuildInfo *kernel_info) {
auto iter = map_kernel_name_to_creater_.find(kernel_name);


+ 6
- 0
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel_factory.h View File

@@ -21,6 +21,7 @@
#include <string>
#include <vector>
#include <utility>
#include <memory>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "runtime/device/gpu/kernel_info_setter.h"
#include "backend/kernel_compiler/kernel_build_info.h"
@@ -43,6 +44,11 @@ class GpuKernelFactory {

std::string SupportedTypeList(const std::string &kernel_name);

bool ReducePrecision(const std::string &kernel_name,
std::shared_ptr<mindspore::kernel::KernelBuildInfo::KernelBuildInfoBuilder> builder);

std::pair<std::vector<size_t>, TypeId> reduce_flag_{{}, kNumberTypeInt64};

private:
GpuKernelFactory() = default;



+ 85
- 0
mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.cc View File

@@ -0,0 +1,85 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/gpu/reduce_precision_fusion.h"

#include <memory>
#include <vector>
#include <string>

#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"

namespace mindspore {
namespace opt {
namespace {
void ReducePrecision(const FuncGraphPtr &graph, const AnfNodePtr &node, size_t i, const TypeId &src_type,
const TypeId &cast_type) {
auto prim = std::make_shared<Primitive>(prim::kPrimCast->name());
MS_EXCEPTION_IF_NULL(prim);
std::vector<AnfNodePtr> inputs = {NewValueNode(prim), AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), i)};
auto cast = graph->NewCNode(inputs);
auto cast_shape = {AnfAlgo::GetInputDeviceShape(node, i)};
AnfAlgo::SetOutputInferTypeAndShape({cast_type}, cast_shape, cast.get());
FuncGraphManagerPtr manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
manager->SetEdge(node, i + 1, cast);
kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
builder.SetInputsFormat({kOpFormat_DEFAULT});
builder.SetOutputsFormat({kOpFormat_DEFAULT});
builder.SetInputsDeviceType({src_type});
builder.SetOutputsDeviceType({cast_type});
builder.SetKernelType(AKG_KERNEL);
builder.SetProcessor(kernel::Processor::CUDA);
AnfAlgo::SetSelectKernelBuildInfo(builder.Build(), cast.get());
}

} // namespace
// Walk the graph in topological order and insert int32<->int64 Cast nodes
// wherever a real kernel's selected device type (int32) disagrees with the
// inferred type (int64): on such inputs a cast is inserted before the node,
// and for such outputs a cast is inserted before every consumer.
// Always returns true (the pass itself cannot fail).
bool ReducePrecisionFusion::Run(const FuncGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
  // Iterate by const reference: elements are shared_ptr, and copying one per
  // iteration costs an atomic ref-count update for every node in the graph.
  for (const auto &node : node_list) {
    if (node == nullptr || !node->isa<CNode>() || !AnfAlgo::IsRealKernel(node)) {
      continue;
    }
    size_t input_num = AnfAlgo::GetInputTensorNum(node);
    size_t output_num = AnfAlgo::GetOutputTensorNum(node);
    for (size_t i = 0; i < input_num; i++) {
      auto infer_type = AnfAlgo::GetPrevNodeOutputInferDataType(node, i);
      auto device_type = AnfAlgo::GetInputDeviceDataType(node, i);
      if (infer_type == kNumberTypeInt64 && device_type == kNumberTypeInt32) {
        // Producer delivers int64 but the selected kernel wants int32: cast down.
        ReducePrecision(graph, node, i, infer_type, device_type);
        MS_LOG(WARNING) << "Reduce precision for [" << AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(node))
                        << "] input " << i;
      }
    }
    for (size_t i = 0; i < output_num; i++) {
      auto infer_type = AnfAlgo::GetOutputInferDataType(node, i);
      auto device_type = AnfAlgo::GetOutputDeviceDataType(node, i);
      if (infer_type == kNumberTypeInt64 && device_type == kNumberTypeInt32) {
        // Kernel produces int32 but the graph expects int64: cast back up on
        // every consumer edge of this output.
        auto used_node_list = GetRealNodeUsedListByOutputIdx(graph, node, i);
        for (size_t j = 0; j < used_node_list->size(); j++) {
          auto used_node = used_node_list->at(j).first;
          // `.second` counts from 1 (edge 0 is the primitive); convert to a
          // zero-based input index.
          auto used_node_index = used_node_list->at(j).second - 1;
          ReducePrecision(graph, used_node, used_node_index, device_type, infer_type);
        }
      }
    }
  }
  return true;
}
} // namespace opt
} // namespace mindspore

+ 34
- 0
mindspore/ccsrc/backend/optimizer/gpu/reduce_precision_fusion.h View File

@@ -0,0 +1,34 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_

#include <memory>
#include <string>
#include <vector>
#include "backend/optimizer/common/optimizer.h"

namespace mindspore {
namespace opt {
// GPU backend pass: inserts Cast nodes where a kernel's selected device type
// reduces precision (e.g. int64 -> int32) relative to the inferred type.
class ReducePrecisionFusion : public Pass {
 public:
  // Forward the caller-supplied pass name instead of ignoring it: the original
  // hardcoded "reduce_precision", leaving the `name` parameter unused. The
  // known call site passes exactly "reduce_precision", so behavior is unchanged.
  explicit ReducePrecisionFusion(const std::string &name) : Pass(name) {}
  ~ReducePrecisionFusion() override = default;
  // Run the pass over `graph`; always returns true.
  bool Run(const FuncGraphPtr &graph) override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REDUCE_PRECISION_FUSION_H_

+ 2
- 0
mindspore/ccsrc/backend/session/gpu_session.cc View File

@@ -38,6 +38,7 @@
#include "backend/optimizer/gpu/remove_format_transform_pair.h"
#include "backend/optimizer/gpu/remove_redundant_format_transform.h"
#include "backend/optimizer/gpu/cudnn_inplace_fusion.h"
#include "backend/optimizer/gpu/reduce_precision_fusion.h"
#include "backend/optimizer/graph_kernel/value_graph_binder.h"
#include "backend/optimizer/graph_kernel/graph_kernel_splitter.h"
#include "backend/optimizer/graph_kernel/graph_kernel_expander.h"
@@ -101,6 +102,7 @@ void GPUSession::HardwareOptimize(const std::shared_ptr<KernelGraph> &kernel_gra
pm->AddPass(std::make_shared<opt::RemoveRedundantFormatTransform>());
pm->AddPass(std::make_shared<opt::AllReduceFusion>());
pm->AddPass(std::make_shared<opt::GetitemTuple>());
pm->AddPass(std::make_shared<opt::ReducePrecisionFusion>("reduce_precision"));
optimizer->AddPassManager(pm);
(void)optimizer->Optimize(kernel_graph);
kernel_graph->SetExecOrderByDefault();


+ 3
- 0
mindspore/ccsrc/runtime/device/gpu/kernel_info_setter.cc View File

@@ -371,6 +371,9 @@ void SetKernelInfo(const CNodePtr &kernel_node, KernelType kernel_type) {
if (kernel_type == UNKNOWN_KERNEL_TYPE) {
result =
kernel::GpuKernelFactory::GetInstance().SearchRegistered(AnfAlgo::GetCNodeName(kernel_node), builder->Build());
if (!result) {
result = kernel::GpuKernelFactory::GetInstance().ReducePrecision(AnfAlgo::GetCNodeName(kernel_node), builder);
}
if (!result) {
result = SelectAkgKernel(kernel_node, builder->Build());
kernel_type = AKG_KERNEL;


Loading…
Cancel
Save