@@ -0,0 +1,40 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
| template <typename T, typename S> | |||||
| __global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) { | |||||
| for (size_t i = 0; i < num; i++) { | |||||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size[i]; pos += blockDim.x * gridDim.x) { | |||||
| output[i][pos] = static_cast<S>(inputs[i][pos]); | |||||
| } | |||||
| } | |||||
| } | |||||
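// Host-side launcher: the grid is sized from `max`, the element count of the largest
// tensor, so a single launch covers every tensor in the group.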
| template <typename T, typename S> | |||||
| void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, const size_t *size, | |||||
| cudaStream_t stream) { | |||||
| CastAll<<<GET_BLOCKS(max), GET_THREADS, 0, stream>>>(inputs, output, num, size); | |||||
| return; | |||||
| } | |||||
| template void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num, | |||||
| const size_t *size, cudaStream_t stream); | |||||
| template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num, | |||||
| const size_t *size, cudaStream_t stream); | |||||
@@ -0,0 +1,24 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size,
                   cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
@@ -0,0 +1,28 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
  CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16),
  CastAllGpuFwdKernel, float, half)
MS_REG_GPU_KERNEL_TWO(
  CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32),
  CastAllGpuFwdKernel, half, float)
} // namespace kernel
} // namespace mindspore
@@ -0,0 +1,104 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#include <memory>
#include <vector>
#include <map>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class CastAllGpuFwdKernel : public GpuKernel {
 public:
  CastAllGpuFwdKernel() : max_(0), input_size_(0), output_size_(0), num_input_(0) {}
  ~CastAllGpuFwdKernel() override = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
    auto in_addr = std::make_unique<T *[]>(num_input_);
    auto out_addr = std::make_unique<S *[]>(num_input_);
    for (size_t i = 0; i < num_input_; i++) {
      in_addr[i] = GetDeviceAddress<T>(inputs, i);
      out_addr[i] = GetDeviceAddress<S>(outputs, i);
    }
    // Workspace layout: [0] device array of input pointers, [1] device array of output
    // pointers, [2] element count of each tensor.
    T **inputs_dev = GetDeviceAddress<T *>(workspace, 0);
    S **outputs_dev = GetDeviceAddress<S *>(workspace, 1);
    size_t *size_dev = GetDeviceAddress<size_t>(workspace, 2);
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of input pointers failed")
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of output pointers failed")
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of tensor sizes failed")
    CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream);
    return true;
  }
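  // Init: reads the "n" attribute (number of fused Cast nodes), records each tensor's
  // element count, and registers the input/output/workspace sizes.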
  bool Init(const CNodePtr &kernel_node) override {
    num_input_ = GetAttr<size_t>(kernel_node, "n");
    size_ = std::make_unique<size_t[]>(num_input_);
    for (size_t i = 0; i < num_input_; i++) {
      auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
      size_t s = 1;
      for (auto x : shape) {
        s = s * x;
      }
      if (max_ < s) {
        max_ = s;
      }
      size_[i] = s;
      input_size_ = sizeof(T) * s;
      output_size_ = sizeof(S) * s;
      InitSizeLists();
    }
    workspace_size_list_.push_back(sizeof(T *) * num_input_);
    workspace_size_list_.push_back(sizeof(S *) * num_input_);
    workspace_size_list_.push_back(sizeof(size_t) * num_input_);
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_size_);
    output_size_list_.push_back(output_size_);
  }
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  std::unique_ptr<size_t[]> size_;
  size_t max_;
  size_t input_size_;
  size_t output_size_;
  size_t num_input_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
@@ -0,0 +1,140 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/optimizer/gpu/combine_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
namespace {
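// Builds a KernelBuildInfo for the fused CastAll node by concatenating the formats and
// data types of every Cast node being merged.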
kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const std::vector<AnfNodePtr> &node_list) {
  std::vector<std::string> inputs_device_format;
  std::vector<std::string> outputs_device_format;
  std::vector<TypeId> inputs_device_type;
  std::vector<TypeId> outputs_device_type;
  std::vector<std::vector<size_t>> outputs_shape;
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  for (size_t idx = 0; idx < node_list.size(); ++idx) {
    auto cnode = utils::cast<CNodePtr>(node_list[idx]);
    MS_EXCEPTION_IF_NULL(cnode);
    for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) {
      inputs_device_format.push_back(kOpFormat_DEFAULT);
      inputs_device_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index));
    }
    for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(cnode); ++output_index) {
      outputs_device_format.push_back(kOpFormat_DEFAULT);
      outputs_device_type.push_back(AnfAlgo::GetOutputInferDataType(cnode, output_index));
      outputs_shape.push_back(AnfAlgo::GetOutputInferShape(cnode, output_index));
    }
  }
  builder.SetInputsFormat(inputs_device_format);
  builder.SetOutputsFormat(outputs_device_format);
  builder.SetInputsDeviceType(inputs_device_type);
  builder.SetOutputsDeviceType(outputs_device_type);
  return builder.Build();
}
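// Collects Cast nodes worth fusing: parameter-fed casts are grouped by direction
// (float32->float16 and float16->float32), and a group is kept only if it contains
// more than one node.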
bool GetDealList(const std::vector<AnfNodePtr> &node_list, std::vector<std::vector<AnfNodePtr>> *deal_list) {
  std::vector<AnfNodePtr> cast_32to16_list;
  std::vector<AnfNodePtr> cast_16to32_list;
  for (auto &cast_node : node_list) {
    // Currently we only handle the pattern [Parameter -> Cast] to avoid introducing a cycle.
    if (cast_node != nullptr && cast_node->isa<CNode>() && AnfAlgo::GetCNodeName(cast_node) == "Cast" &&
        (AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_node), 0))->isa<Parameter>()) {
      auto dst = AnfAlgo::GetOutputInferDataType(cast_node, 0);
      auto src = AnfAlgo::GetPrevNodeOutputInferDataType(cast_node, 0);
      if (dst == kNumberTypeFloat16 && src == kNumberTypeFloat32) {
        cast_32to16_list.push_back(cast_node);
      } else if (dst == kNumberTypeFloat32 && src == kNumberTypeFloat16) {
        cast_16to32_list.push_back(cast_node);
      }
    }
  }
  if (cast_32to16_list.size() <= 1 && cast_16to32_list.size() <= 1) {
    return false;
  }
  if (cast_32to16_list.size() > 1) {
    deal_list->push_back(cast_32to16_list);
  }
  if (cast_16to32_list.size() > 1) {
    deal_list->push_back(cast_16to32_list);
  }
  return true;
}
} // namespace
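// Pass entry point: fuses each group of same-direction Cast nodes into one CastAll node
// and rewires the original consumers through TupleGetItem outputs.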
bool CastAllFusion::Run(const FuncGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto manager = graph->manager();
  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
  // 1 collect the groups of Cast nodes to fuse
  std::vector<std::vector<AnfNodePtr>> deal_list;
  if (!GetDealList(node_list, &deal_list)) {
    return false;
  }
  for (auto cast_list : deal_list) {
    // 2 create the fused CastAll node
    auto prim = std::make_shared<Primitive>("CastAll");
    std::vector<AnfNodePtr> inputs = {NewValueNode(prim)};
    // set inputs for CastAll
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      inputs.push_back(AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_list[idx]), 0));
    }
    auto cast_all = graph->NewCNode(inputs);
    auto kernel_info = std::make_shared<device::KernelInfo>();
    MS_EXCEPTION_IF_NULL(kernel_info);
    cast_all->set_kernel_info(kernel_info);
    AbstractBasePtrList abstract_list;
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      auto cnode = utils::cast<CNodePtr>(cast_list[idx]);
      MS_EXCEPTION_IF_NULL(cnode);
      abstract_list.push_back(cnode->abstract());
    }
    auto kernel_build_info = GenerateKernelBuildInfo(cast_list);
    AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info, cast_all.get());
    auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);
    MS_EXCEPTION_IF_NULL(abstract_tuple);
    cast_all->set_abstract(abstract_tuple);
    AnfAlgo::SetNodeAttr("n", MakeValue(cast_list.size()), cast_all);
    // 3 replace each Cast with TupleGetItem(CastAll, idx)
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      std::vector<AnfNodePtr> tuple_getitem_input;
      tuple_getitem_input.push_back(NewValueNode(prim::kPrimTupleGetItem));
      tuple_getitem_input.push_back(cast_all);
      auto index = NewValueNode(SizeToInt(idx));
      auto imm = std::make_shared<Int32Imm>(idx);
      auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
      MS_EXCEPTION_IF_NULL(abstract_scalar);
      index->set_abstract(abstract_scalar);
      tuple_getitem_input.push_back(index);
      AnfNodePtr tuple_getitem = graph->NewCNode(tuple_getitem_input);
      MS_EXCEPTION_IF_NULL(tuple_getitem);
      tuple_getitem->set_abstract(cast_list[idx]->abstract());
      if (!manager->Replace(cast_list[idx], tuple_getitem)) {
        MS_LOG(EXCEPTION) << "Manager replace node failed";
      }
    }
  }
  return true;
}
} // namespace opt
} // namespace mindspore
@@ -0,0 +1,34 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class CastAllFusion : public Pass {
 public:
  explicit CastAllFusion(const std::string &name) : Pass("cast_all") {}
  ~CastAllFusion() override = default;
  bool Run(const FuncGraphPtr &graph) override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_