@@ -0,0 +1,40 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <stdio.h>
#include <stdint.h>
#include <cuda_runtime.h>
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
| template <typename T, typename S> | |||||
| __global__ void CastAll(T** inputs, S** output, const size_t num, const size_t *size) { | |||||
| for (size_t i = 0; i < num; i++) { | |||||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size[i]; pos += blockDim.x * gridDim.x) { | |||||
| output[i][pos] = static_cast<S>(inputs[i][pos]); | |||||
| } | |||||
| } | |||||
| } | |||||
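// Host-side launcher: the grid is sized from `max`, the element count of the largest
// tensor, so a single launch covers every tensor in the group.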
| template <typename T, typename S> | |||||
| void CastAllKernel(T** inputs, S** output, const size_t max, const size_t num, const size_t *size, | |||||
| cudaStream_t stream) { | |||||
| CastAll<<<GET_BLOCKS(max), GET_THREADS, 0, stream>>>(inputs, output, num, size); | |||||
| return; | |||||
| } | |||||
| template void CastAllKernel(half** inputs, float** output, const size_t max, const size_t num, | |||||
| const size_t *size, cudaStream_t stream); | |||||
| template void CastAllKernel(float** inputs, half** output, const size_t max, const size_t num, | |||||
| const size_t *size, cudaStream_t stream); | |||||
@@ -0,0 +1,24 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
template <typename T, typename S>
void CastAllKernel(T **inputs, S **output, const size_t max, const size_t num, const size_t *size,
                   cudaStream_t stream);
#endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_CAST_ALL_H_
@@ -0,0 +1,28 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/kernel_compiler/gpu/math/cast_all_gpu_kernel.h"
namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_TWO(
  CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat16),
  CastAllGpuFwdKernel, float, half)
MS_REG_GPU_KERNEL_TWO(
  CastAll, KernelAttr().AddAllSameAttr(true).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat32),
  CastAllGpuFwdKernel, half, float)
} // namespace kernel
} // namespace mindspore
@@ -0,0 +1,104 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
#include <memory>
#include <vector>
#include <map>
#include <string>
#include "backend/kernel_compiler/gpu/gpu_kernel.h"
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/cuda_impl/cast_all_impl.cuh"
namespace mindspore {
namespace kernel {
template <typename T, typename S>
class CastAllGpuFwdKernel : public GpuKernel {
 public:
  CastAllGpuFwdKernel() : max_(0), input_size_(0), output_size_(0), num_input_(0) {}
  ~CastAllGpuFwdKernel() override = default;
  const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
  const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }
  const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; }
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
    auto stream = reinterpret_cast<cudaStream_t>(stream_ptr);
    auto in_addr = std::make_unique<T *[]>(num_input_);
    auto out_addr = std::make_unique<S *[]>(num_input_);
    for (size_t i = 0; i < num_input_; i++) {
      in_addr[i] = GetDeviceAddress<T>(inputs, i);
      out_addr[i] = GetDeviceAddress<S>(outputs, i);
    }
    // Workspace layout: [0] device array of input pointers, [1] device array of output
    // pointers, [2] element count of each tensor.
    T **inputs_dev = GetDeviceAddress<T *>(workspace, 0);
    S **outputs_dev = GetDeviceAddress<S *>(workspace, 1);
    size_t *size_dev = GetDeviceAddress<size_t>(workspace, 2);
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(inputs_dev, in_addr.get(), sizeof(T *) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of input pointers failed")
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(outputs_dev, out_addr.get(), sizeof(S *) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of output pointers failed")
    CHECK_CUDA_RET_WITH_EXCEPT(
      cudaMemcpyAsync(size_dev, size_.get(), sizeof(size_t) * num_input_, cudaMemcpyHostToDevice, stream),
      "cudaMemcpyAsync of tensor sizes failed")
    CastAllKernel(inputs_dev, outputs_dev, max_, num_input_, size_dev, stream);
    return true;
  }
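  // Init: reads the "n" attribute (number of fused Cast nodes), records each tensor's
  // element count, and registers the input/output/workspace sizes.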
  bool Init(const CNodePtr &kernel_node) override {
    num_input_ = GetAttr<size_t>(kernel_node, "n");
    size_ = std::make_unique<size_t[]>(num_input_);
    for (size_t i = 0; i < num_input_; i++) {
      auto shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
      size_t s = 1;
      for (auto x : shape) {
        s = s * x;
      }
      if (max_ < s) {
        max_ = s;
      }
      size_[i] = s;
      input_size_ = sizeof(T) * s;
      output_size_ = sizeof(S) * s;
      InitSizeLists();
    }
    workspace_size_list_.push_back(sizeof(T *) * num_input_);
    workspace_size_list_.push_back(sizeof(S *) * num_input_);
    workspace_size_list_.push_back(sizeof(size_t) * num_input_);
    return true;
  }

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_size_);
    output_size_list_.push_back(output_size_);
  }
  std::vector<size_t> input_size_list_;
  std::vector<size_t> output_size_list_;
  std::vector<size_t> workspace_size_list_;
  std::unique_ptr<size_t[]> size_;
  size_t max_;
  size_t input_size_;
  size_t output_size_;
  size_t num_input_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_MATH_CAST_ALL_GPU_KERNEL_H
@@ -0,0 +1,140 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "backend/optimizer/gpu/combine_cast_fusion.h"
#include <memory>
#include <vector>
#include <string>
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/primitive.h"
#include "utils/utils.h"
#include "backend/optimizer/common/helper.h"
namespace mindspore {
namespace opt {
namespace {
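// Builds a KernelBuildInfo for the fused CastAll node by concatenating the formats and
// data types of every Cast node being merged.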
kernel::KernelBuildInfoPtr GenerateKernelBuildInfo(const std::vector<AnfNodePtr> &node_list) {
  std::vector<std::string> inputs_device_format;
  std::vector<std::string> outputs_device_format;
  std::vector<TypeId> inputs_device_type;
  std::vector<TypeId> outputs_device_type;
  std::vector<std::vector<size_t>> outputs_shape;
  kernel::KernelBuildInfo::KernelBuildInfoBuilder builder;
  for (size_t idx = 0; idx < node_list.size(); ++idx) {
    auto cnode = utils::cast<CNodePtr>(node_list[idx]);
    MS_EXCEPTION_IF_NULL(cnode);
    for (size_t input_index = 0; input_index < AnfAlgo::GetInputTensorNum(cnode); ++input_index) {
      inputs_device_format.push_back(kOpFormat_DEFAULT);
      inputs_device_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(cnode, input_index));
    }
    for (size_t output_index = 0; output_index < AnfAlgo::GetOutputTensorNum(cnode); ++output_index) {
      outputs_device_format.push_back(kOpFormat_DEFAULT);
      outputs_device_type.push_back(AnfAlgo::GetOutputInferDataType(cnode, output_index));
      outputs_shape.push_back(AnfAlgo::GetOutputInferShape(cnode, output_index));
    }
  }
  builder.SetInputsFormat(inputs_device_format);
  builder.SetOutputsFormat(outputs_device_format);
  builder.SetInputsDeviceType(inputs_device_type);
  builder.SetOutputsDeviceType(outputs_device_type);
  return builder.Build();
}
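// Collects Cast nodes worth fusing: parameter-fed casts are grouped by direction
// (float32->float16 and float16->float32), and a group is kept only if it contains
// more than one node.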
bool GetDealList(const std::vector<AnfNodePtr> &node_list, std::vector<std::vector<AnfNodePtr>> *deal_list) {
  std::vector<AnfNodePtr> cast_32to16_list;
  std::vector<AnfNodePtr> cast_16to32_list;
  for (auto &cast_node : node_list) {
    // Currently we only handle the pattern [Parameter -> Cast] to avoid introducing a cycle.
    if (cast_node != nullptr && cast_node->isa<CNode>() && AnfAlgo::GetCNodeName(cast_node) == "Cast" &&
        (AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_node), 0))->isa<Parameter>()) {
      auto dst = AnfAlgo::GetOutputInferDataType(cast_node, 0);
      auto src = AnfAlgo::GetPrevNodeOutputInferDataType(cast_node, 0);
      if (dst == kNumberTypeFloat16 && src == kNumberTypeFloat32) {
        cast_32to16_list.push_back(cast_node);
      } else if (dst == kNumberTypeFloat32 && src == kNumberTypeFloat16) {
        cast_16to32_list.push_back(cast_node);
      }
    }
  }
  if (cast_32to16_list.size() <= 1 && cast_16to32_list.size() <= 1) {
    return false;
  }
  if (cast_32to16_list.size() > 1) {
    deal_list->push_back(cast_32to16_list);
  }
  if (cast_16to32_list.size() > 1) {
    deal_list->push_back(cast_16to32_list);
  }
  return true;
}
} // namespace
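// Pass entry point: fuses each group of same-direction Cast nodes into one CastAll node
// and rewires the original consumers through TupleGetItem outputs.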
bool CastAllFusion::Run(const FuncGraphPtr &graph) {
  MS_EXCEPTION_IF_NULL(graph);
  auto manager = graph->manager();
  std::vector<AnfNodePtr> node_list = TopoSort(graph->get_return());
  // 1 collect the groups of Cast nodes to fuse
  std::vector<std::vector<AnfNodePtr>> deal_list;
  if (!GetDealList(node_list, &deal_list)) {
    return false;
  }
  for (auto cast_list : deal_list) {
    // 2 create the fused CastAll node
    auto prim = std::make_shared<Primitive>("CastAll");
    std::vector<AnfNodePtr> inputs = {NewValueNode(prim)};
    // set inputs for CastAll
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      inputs.push_back(AnfAlgo::GetInputNode(utils::cast<CNodePtr>(cast_list[idx]), 0));
    }
    auto cast_all = graph->NewCNode(inputs);
    auto kernel_info = std::make_shared<device::KernelInfo>();
    MS_EXCEPTION_IF_NULL(kernel_info);
    cast_all->set_kernel_info(kernel_info);
    AbstractBasePtrList abstract_list;
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      auto cnode = utils::cast<CNodePtr>(cast_list[idx]);
      MS_EXCEPTION_IF_NULL(cnode);
      abstract_list.push_back(cnode->abstract());
    }
    auto kernel_build_info = GenerateKernelBuildInfo(cast_list);
    AnfAlgo::SetSelectKernelBuildInfo(kernel_build_info, cast_all.get());
    auto abstract_tuple = std::make_shared<abstract::AbstractTuple>(abstract_list);
    MS_EXCEPTION_IF_NULL(abstract_tuple);
    cast_all->set_abstract(abstract_tuple);
    AnfAlgo::SetNodeAttr("n", MakeValue(cast_list.size()), cast_all);
    // 3 replace each Cast with TupleGetItem(CastAll, idx)
    for (size_t idx = 0; idx < cast_list.size(); ++idx) {
      std::vector<AnfNodePtr> tuple_getitem_input;
      tuple_getitem_input.push_back(NewValueNode(prim::kPrimTupleGetItem));
      tuple_getitem_input.push_back(cast_all);
      auto index = NewValueNode(SizeToInt(idx));
      auto imm = std::make_shared<Int32Imm>(idx);
      auto abstract_scalar = std::make_shared<abstract::AbstractScalar>(imm);
      MS_EXCEPTION_IF_NULL(abstract_scalar);
      index->set_abstract(abstract_scalar);
      tuple_getitem_input.push_back(index);
      AnfNodePtr tuple_getitem = graph->NewCNode(tuple_getitem_input);
      MS_EXCEPTION_IF_NULL(tuple_getitem);
      tuple_getitem->set_abstract(cast_list[idx]->abstract());
      if (!manager->Replace(cast_list[idx], tuple_getitem)) {
        MS_LOG(EXCEPTION) << "Manager replace node failed";
      }
    }
  }
  return true;
}
} // namespace opt
} // namespace mindspore
@@ -0,0 +1,34 @@
/**
 * Copyright 2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class CastAllFusion : public Pass {
 public:
  explicit CastAllFusion(const std::string &name) : Pass("cast_all") {}
  ~CastAllFusion() override = default;
  bool Run(const FuncGraphPtr &graph) override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_COMBINE_CAST_FUSION_H_