| @@ -0,0 +1,226 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "maxpool_with_argmax_grad_impl.cuh" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "include/cuda_fp16.h" | |||
| template <typename T, typename S> | |||
| __global__ void MaxPoolWithArgmaxGrad(const T* x, | |||
| const T* dy, | |||
| const S* index, | |||
| const int n, | |||
| const int c, | |||
| const int xHeight, | |||
| const int xWidth, | |||
| const int dyHeight, | |||
| const int dyWidth, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int xNCHW, | |||
| const int xCHW, | |||
| const int xHW, | |||
| const int dyCHW, | |||
| const int dyHW, | |||
| T* dx) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||
| pos < (xNCHW); | |||
| pos += blockDim.x * gridDim.x) { | |||
| const int posn = pos / xCHW; | |||
| const int posc = pos / xHW % c; | |||
| const int posh = pos / xHeight % xHeight; | |||
| const int posw = pos % xWidth; | |||
| const S posIdx = posh*xWidth + posw; | |||
| int hstart = posh+padTop; | |||
| if (hstart < windowHeight) { | |||
| hstart = 0; | |||
| } else { | |||
| hstart = (hstart-windowHeight)/strideHeight + 1; | |||
| } | |||
| int wstart = posw+padLeft; | |||
| if (wstart < windowWidth) { | |||
| wstart = 0; | |||
| } else { | |||
| wstart = (wstart-windowWidth)/strideWidth + 1; | |||
| } | |||
| const int hend = min((posh+padTop)/strideHeight +1, dyHeight); | |||
| const int wend = min((posw+padLeft)/strideWidth +1, dyWidth); | |||
| const int channelStart = posn*dyCHW + posc*dyHW; | |||
| T dySum = static_cast<T>(0.0); | |||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||
| const int curIdx = hcur*dyWidth + wcur; | |||
| S maxIdx = index[channelStart+curIdx]; | |||
| if (maxIdx == posIdx) { | |||
| dySum += dy[channelStart+curIdx]; | |||
| } | |||
| } | |||
| } | |||
| dx[pos] = dySum; | |||
| } | |||
| return; | |||
| } | |||
| template <> | |||
| __global__ void MaxPoolWithArgmaxGrad(const half* x, | |||
| const half* dy, | |||
| const int* index, | |||
| const int n, | |||
| const int c, | |||
| const int xHeight, | |||
| const int xWidth, | |||
| const int dyHeight, | |||
| const int dyWidth, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int xNCHW, | |||
| const int xCHW, | |||
| const int xHW, | |||
| const int dyCHW, | |||
| const int dyHW, | |||
| half* dx) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||
| pos < (xNCHW); | |||
| pos += blockDim.x * gridDim.x) { | |||
| const int posn = pos / xCHW; | |||
| const int posc = pos / xHW % c; | |||
| const int posh = pos / xHeight % xHeight; | |||
| const int posw = pos % xWidth; | |||
| const int posIdx = posh*xWidth + posw; | |||
| int hstart = posh+padTop; | |||
| if (hstart < windowHeight) { | |||
| hstart = 0; | |||
| } else { | |||
| hstart = (hstart-windowHeight)/strideHeight + 1; | |||
| } | |||
| int wstart = posw+padLeft; | |||
| if (wstart < windowWidth) { | |||
| wstart = 0; | |||
| } else { | |||
| wstart = (wstart-windowWidth)/strideWidth + 1; | |||
| } | |||
| const int hend = min((posh+padTop)/strideHeight +1, dyHeight); | |||
| const int wend = min((posw+padLeft)/strideWidth +1, dyWidth); | |||
| const int channelStart = posn*dyCHW + posc*dyHW; | |||
| float dySum = 0.0f; | |||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||
| const int curIdx = hcur*dyWidth + wcur; | |||
| int maxIdx = index[channelStart+curIdx]; | |||
| if (maxIdx == posIdx) { | |||
| dySum += __half2float(dy[channelStart+curIdx]); | |||
| } | |||
| } | |||
| } | |||
| dx[pos] = __float2half(dySum); | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S> | |||
| void CalMaxPoolWithArgmaxGrad(const T* x, | |||
| const T* dy, | |||
| const S* index, | |||
| const int n, | |||
| const int c, | |||
| const int xHeight, | |||
| const int xWidth, | |||
| const int dyHeight, | |||
| const int dyWidth, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| T* dx, | |||
| cudaStream_t cuda_stream) { | |||
| const int xHW = xHeight*xWidth; | |||
| const int xCHW = c*xHW; | |||
| const int xNCHW = n*xCHW; | |||
| const int dyHW = dyHeight*dyWidth; | |||
| const int dyCHW = c*dyHW; | |||
| MaxPoolWithArgmaxGrad<<<GET_BLOCKS(xNCHW), | |||
| GET_THREADS, | |||
| 0, | |||
| cuda_stream>>>( | |||
| x, | |||
| dy, | |||
| index, | |||
| n, | |||
| c, | |||
| xHeight, | |||
| xWidth, | |||
| dyHeight, | |||
| dyWidth, | |||
| windowHeight, | |||
| windowWidth, | |||
| strideHeight, | |||
| strideWidth, | |||
| padTop, | |||
| padLeft, | |||
| xNCHW, | |||
| xCHW, | |||
| xHW, | |||
| dyCHW, | |||
| dyHW, | |||
| dx); | |||
| return; | |||
| } | |||
| template void CalMaxPoolWithArgmaxGrad<float, int>(const float* x, | |||
| const float* dy, | |||
| const int* index, | |||
| const int n, | |||
| const int c, | |||
| const int xHeight, | |||
| const int xWidth, | |||
| const int dyHeight, | |||
| const int dyWidth, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| float* dx, | |||
| cudaStream_t cuda_stream); | |||
| template void CalMaxPoolWithArgmaxGrad<half, int>(const half* x, | |||
| const half* dy, | |||
| const int* index, | |||
| const int n, | |||
| const int c, | |||
| const int xHeight, | |||
| const int xWidth, | |||
| const int dyHeight, | |||
| const int dyWidth, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| half* dx, | |||
| cudaStream_t cuda_stream); | |||
| @@ -0,0 +1,25 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||
| template <typename T, typename S> | |||
| void CalMaxPoolWithArgmaxGrad(const T* x, const T* dy, const S* index, const int n, const int c, const int xHeight, | |||
| const int xWidth, const int dyHeight, const int dyWidth, const int windowHeight, | |||
| const int windowWidth, const int strideHeight, const int strideWidth, const int padTop, | |||
| const int padLeft, T* dx, cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_GRAD_H_ | |||
| @@ -0,0 +1,149 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include <algorithm> | |||
| #include "maxpool_with_argmax_impl.cuh" | |||
| #include "runtime/device/gpu/cuda_common.h" | |||
| #include "include/cuda_fp16.h" | |||
| template <typename T, typename S> | |||
| __global__ void MaxPoolWithArgmax(const T* input, | |||
| const int n, | |||
| const int c, | |||
| const int h, | |||
| const int w, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int outputHeight, | |||
| const int outputWidth, | |||
| const int outputNCHW, | |||
| const int outputCHW, | |||
| const int outputHW, | |||
| T* output, | |||
| S *index) { | |||
| for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; | |||
| pos < (outputNCHW); | |||
| pos += blockDim.x * gridDim.x) { | |||
| const int posn = pos / outputCHW; | |||
| const int posc = pos / outputHW % c; | |||
| const int posh = pos / outputHeight % outputHeight; | |||
| const int posw = pos % outputWidth; | |||
| int hstart = posh * strideHeight - padTop; | |||
| int wstart = posw * strideWidth - padLeft; | |||
| const int hend = min(hstart + windowHeight, h); | |||
| const int wend = min(wstart + windowWidth, w); | |||
| hstart = max(hstart, 0); | |||
| wstart = max(wstart, 0); | |||
| S inputStart = posn*c*h*w + posc*h*w; | |||
| S maxIdx = hstart*w + wstart; | |||
| T maxData = input[inputStart+maxIdx]; | |||
| for (int hcur = hstart; hcur < hend; ++hcur) { | |||
| for (int wcur = wstart; wcur < wend; ++wcur) { | |||
| S inputIdx = hcur*w + wcur; | |||
| T inputData = input[inputStart+inputIdx]; | |||
| if (inputData > maxData) { | |||
| maxIdx = inputIdx; | |||
| maxData = inputData; | |||
| } | |||
| } | |||
| } | |||
| output[pos] = maxData; | |||
| index[pos] = maxIdx; | |||
| } | |||
| return; | |||
| } | |||
| template <typename T, typename S> | |||
| void CalMaxPoolWithArgmax(const T* input, | |||
| const int n, | |||
| const int c, | |||
| const int h, | |||
| const int w, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int outputHeight, | |||
| const int outputWidth, | |||
| T* output, | |||
| S *index, | |||
| cudaStream_t cuda_stream) { | |||
| const int outputNCHW = n*c*outputHeight*outputWidth; | |||
| const int outputCHW = c*outputHeight*outputWidth; | |||
| const int outputHW = outputHeight*outputWidth; | |||
| MaxPoolWithArgmax<<<GET_BLOCKS(n*c*outputHeight*outputWidth), | |||
| GET_THREADS, | |||
| 0, | |||
| cuda_stream>>>( | |||
| input, | |||
| n, | |||
| c, | |||
| h, | |||
| w, | |||
| windowHeight, | |||
| windowWidth, | |||
| strideHeight, | |||
| strideWidth, | |||
| padTop, | |||
| padLeft, | |||
| outputHeight, | |||
| outputWidth, | |||
| outputNCHW, | |||
| outputCHW, | |||
| outputHW, | |||
| output, | |||
| index); | |||
| return; | |||
| } | |||
| template void CalMaxPoolWithArgmax<float, int>(const float* input, | |||
| const int n, | |||
| const int c, | |||
| const int h, | |||
| const int w, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int outputHeight, | |||
| const int outputWidth, | |||
| float* output, | |||
| int* index, | |||
| cudaStream_t cuda_stream); | |||
| template void CalMaxPoolWithArgmax<half, int>(const half* input, | |||
| const int n, | |||
| const int c, | |||
| const int h, | |||
| const int w, | |||
| const int windowHeight, | |||
| const int windowWidth, | |||
| const int strideHeight, | |||
| const int strideWidth, | |||
| const int padTop, | |||
| const int padLeft, | |||
| const int outputHeight, | |||
| const int outputWidth, | |||
| half* output, | |||
| int* index, | |||
| cudaStream_t cuda_stream); | |||
| @@ -0,0 +1,25 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||
| template <typename T, typename S> | |||
| void CalMaxPoolWithArgmax(const T* input, const int n, const int c, const int h, const int w, const int windowHeight, | |||
| const int windowWidth, const int strideHeight, const int strideWidth, const int padTop, | |||
| const int padLeft, const int outputHeight, const int outputWidth, T* output, S *index, | |||
| cudaStream_t cuda_stream); | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_MAXPOOLWITHARGMAX_H_ | |||
| @@ -0,0 +1,30 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| MaxPoolWithArgmax, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32), | |||
| MaxPoolWithArgmaxGpuFwdKernel, float, int) | |||
| MS_REG_GPU_KERNEL_TWO( | |||
| MaxPoolWithArgmax, | |||
| KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeInt32), | |||
| MaxPoolWithArgmaxGpuFwdKernel, half, int) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,160 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| class MaxPoolWithArgmaxGpuFwdKernel : public GpuKernel { | |||
| public: | |||
| MaxPoolWithArgmaxGpuFwdKernel() | |||
| : n_(0), | |||
| c_(0), | |||
| input_height_(0), | |||
| input_width_(0), | |||
| window_height_(0), | |||
| window_width_(0), | |||
| pad_height_(0), | |||
| pad_width_(0), | |||
| pad_top_(0), | |||
| pad_left_(0), | |||
| stride_height_(0), | |||
| stride_width_(0), | |||
| output_height_(0), | |||
| output_width_(0), | |||
| input_size_(0), | |||
| output_size_(0) {} | |||
| ~MaxPoolWithArgmaxGpuFwdKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| T *input_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *output_addr = GetDeviceAddress<T>(outputs, 0); | |||
| S *index_addr = GetDeviceAddress<S>(outputs, 1); | |||
| CalMaxPoolWithArgmax(input_addr, n_, c_, input_height_, input_width_, window_height_, window_width_, stride_height_, | |||
| stride_width_, pad_top_, pad_left_, output_height_, output_width_, output_addr, index_addr, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 1) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolWithArgmax needs 1 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 2) { | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolWithArgmax needs 2 output."; | |||
| return false; | |||
| } | |||
| auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| input_size_ = sizeof(T); | |||
| for (auto x : input_shape) { | |||
| input_size_ *= x; | |||
| } | |||
| output_size_ = sizeof(T); | |||
| for (auto x : output_shape) { | |||
| output_size_ *= x; | |||
| } | |||
| n_ = SizeToInt(input_shape[0]); | |||
| c_ = SizeToInt(input_shape[1]); | |||
| input_height_ = SizeToInt(input_shape[2]); | |||
| input_width_ = SizeToInt(input_shape[3]); | |||
| output_height_ = SizeToInt(output_shape[2]); | |||
| output_width_ = SizeToInt(output_shape[3]); | |||
| auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize")); | |||
| window_height_ = window[1]; | |||
| window_width_ = window[2]; | |||
| auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides")); | |||
| stride_height_ = stride[1]; | |||
| stride_width_ = stride[2]; | |||
| pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding")); | |||
| pad_top_ = 0; | |||
| pad_left_ = 0; | |||
| if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) { | |||
| SetPad(); | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(input_size_); | |||
| output_size_list_.push_back(output_size_); | |||
| output_size_list_.push_back(output_size_ / sizeof(T) * sizeof(S)); | |||
| } | |||
| private: | |||
| void SetPad() { | |||
| pad_height_ = std::max<int>( | |||
| 0, (((input_height_ / stride_height_) * stride_height_ == input_height_ ? (input_height_ / stride_height_) | |||
| : (input_height_ / stride_height_) + 1) - | |||
| 1) * | |||
| stride_height_ + | |||
| window_height_ - input_height_); | |||
| pad_width_ = std::max<int>( | |||
| 0, (((input_width_ / stride_width_) * stride_width_ == input_width_ ? (input_width_ / stride_width_) | |||
| : (input_width_ / stride_width_) + 1) - | |||
| 1) * | |||
| stride_width_ + | |||
| window_width_ - input_width_); | |||
| pad_top_ = pad_height_ / 2; | |||
| pad_left_ = pad_width_ / 2; | |||
| } | |||
| std::string pad_mode_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| int n_; | |||
| int c_; | |||
| int input_height_; | |||
| int input_width_; | |||
| int window_height_; | |||
| int window_width_; | |||
| int pad_height_; | |||
| int pad_width_; | |||
| int pad_top_; | |||
| int pad_left_; | |||
| int stride_height_; | |||
| int stride_width_; | |||
| int output_height_; | |||
| int output_width_; | |||
| size_t input_size_; | |||
| size_t output_size_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GPU_KERNEL_H_ | |||
| @@ -0,0 +1,36 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #include "backend/kernel_compiler/gpu/nn/maxpool_with_argmax_grad_gpu_kernel.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeFloat32) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeFloat32), | |||
| MaxPoolWithArgmaxGradGpuKernel, float, int) | |||
| MS_REG_GPU_KERNEL_TWO(MaxPoolGradWithArgmax, | |||
| KernelAttr() | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeFloat16) | |||
| .AddInputAttr(kNumberTypeInt32) | |||
| .AddOutputAttr(kNumberTypeFloat16), | |||
| MaxPoolWithArgmaxGradGpuKernel, half, int) | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| @@ -0,0 +1,168 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||
| #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||
| #include <algorithm> | |||
| #include <vector> | |||
| #include <string> | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel.h" | |||
| #include "backend/kernel_compiler/gpu/gpu_kernel_factory.h" | |||
| #include "backend/kernel_compiler/gpu/cuda_impl/maxpool_with_argmax_grad_impl.cuh" | |||
| #include "backend/kernel_compiler/gpu/kernel_constants.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| template <typename T, typename S> | |||
| class MaxPoolWithArgmaxGradGpuKernel : public GpuKernel { | |||
| public: | |||
| MaxPoolWithArgmaxGradGpuKernel() | |||
| : n_(0), | |||
| c_(0), | |||
| x_height_(0), | |||
| x_width_(0), | |||
| dy_height_(0), | |||
| dy_width_(0), | |||
| x_size_(0), | |||
| dy_size_(0), | |||
| index_size_(0), | |||
| dx_size_(0) {} | |||
| ~MaxPoolWithArgmaxGradGpuKernel() override = default; | |||
| const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; } | |||
| const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; } | |||
| const std::vector<size_t> &GetWorkspaceSizeList() const override { return workspace_size_list_; } | |||
| bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace, | |||
| const std::vector<AddressPtr> &outputs, void *stream_ptr) { | |||
| T *x_addr = GetDeviceAddress<T>(inputs, 0); | |||
| T *dy_addr = GetDeviceAddress<T>(inputs, 1); | |||
| S *index_addr = GetDeviceAddress<S>(inputs, 2); | |||
| T *dx_addr = GetDeviceAddress<T>(outputs, 0); | |||
| CalMaxPoolWithArgmaxGrad(x_addr, dy_addr, index_addr, n_, c_, x_height_, x_width_, dy_height_, dy_width_, | |||
| window_height_, window_width_, stride_height_, stride_width_, pad_top_, pad_left_, dx_addr, | |||
| reinterpret_cast<cudaStream_t>(stream_ptr)); | |||
| return true; | |||
| } | |||
| bool Init(const CNodePtr &kernel_node) { | |||
| size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); | |||
| if (input_num != 3) { | |||
| MS_LOG(ERROR) << "Input number is " << input_num << ", but MaxPoolGradWithArgmax needs 3 inputs."; | |||
| return false; | |||
| } | |||
| size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); | |||
| if (output_num != 1) { | |||
| MS_LOG(ERROR) << "Output number is " << output_num << ", but MaxPoolGradWithArgmax needs 1 output."; | |||
| return false; | |||
| } | |||
| auto x_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); | |||
| auto dy_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1); | |||
| auto index_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 2); | |||
| auto dx_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); | |||
| x_size_ = sizeof(T); | |||
| for (auto x : x_shape) { | |||
| x_size_ *= x; | |||
| } | |||
| dy_size_ = sizeof(T); | |||
| for (auto x : dy_shape) { | |||
| dy_size_ *= x; | |||
| } | |||
| index_size_ = sizeof(S); | |||
| for (auto x : index_shape) { | |||
| index_size_ *= x; | |||
| } | |||
| dx_size_ = sizeof(T); | |||
| for (auto x : dx_shape) { | |||
| dx_size_ *= x; | |||
| } | |||
| n_ = SizeToInt(x_shape[0]); | |||
| c_ = SizeToInt(x_shape[1]); | |||
| x_height_ = SizeToInt(x_shape[2]); | |||
| x_width_ = SizeToInt(x_shape[3]); | |||
| dy_height_ = SizeToInt(dy_shape[2]); | |||
| dy_width_ = SizeToInt(dy_shape[3]); | |||
| auto window = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("ksize")); | |||
| window_height_ = window[1]; | |||
| window_width_ = window[2]; | |||
| auto stride = GetValue<std::vector<int>>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("strides")); | |||
| stride_height_ = stride[1]; | |||
| stride_width_ = stride[2]; | |||
| pad_mode_ = GetValue<std::string>(AnfAlgo::GetCNodePrimitive(kernel_node)->GetAttr("padding")); | |||
| pad_top_ = 0; | |||
| pad_left_ = 0; | |||
| if (pad_mode_ == kSamePadModeUpperCase || pad_mode_ == kSamePadModeLowerCase) { | |||
| SetPad(); | |||
| } | |||
| InitSizeLists(); | |||
| return true; | |||
| } | |||
| protected: | |||
| void InitSizeLists() override { | |||
| input_size_list_.push_back(x_size_); | |||
| input_size_list_.push_back(dy_size_); | |||
| input_size_list_.push_back(index_size_); | |||
| output_size_list_.push_back(dx_size_); | |||
| } | |||
| private: | |||
| void SetPad() { | |||
| pad_height_ = std::max<int>( | |||
| 0, (((x_height_ / stride_height_) * stride_height_ == x_height_ ? (x_height_ / stride_height_) | |||
| : (x_height_ / stride_height_) + 1) - | |||
| 1) * | |||
| stride_height_ + | |||
| window_height_ - x_height_); | |||
| pad_width_ = | |||
| std::max<int>(0, (((x_width_ / stride_width_) * stride_width_ == x_width_ ? (x_width_ / stride_width_) | |||
| : (x_width_ / stride_width_) + 1) - | |||
| 1) * | |||
| stride_width_ + | |||
| window_width_ - x_width_); | |||
| pad_top_ = pad_height_ / 2; | |||
| pad_left_ = pad_width_ / 2; | |||
| } | |||
| std::string pad_mode_; | |||
| std::vector<size_t> input_size_list_; | |||
| std::vector<size_t> output_size_list_; | |||
| std::vector<size_t> workspace_size_list_; | |||
| int n_; | |||
| int c_; | |||
| int x_height_; | |||
| int x_width_; | |||
| int dy_height_; | |||
| int dy_width_; | |||
| int window_height_; | |||
| int window_width_; | |||
| int pad_height_; | |||
| int pad_width_; | |||
| int pad_top_; | |||
| int pad_left_; | |||
| int stride_height_; | |||
| int stride_width_; | |||
| size_t x_size_; | |||
| size_t dy_size_; | |||
| size_t index_size_; | |||
| size_t dx_size_; | |||
| }; | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_NN_MAXPOOLWITHARGMAX_GRAD_GPU_KERNEL_H_ | |||
| @@ -1180,6 +1180,7 @@ class MaxPoolWithArgmax(_Pool): | |||
| def __init__(self, ksize=1, strides=1, padding="valid"): | |||
| super(MaxPoolWithArgmax, self).__init__(ksize, strides, padding) | |||
| self.is_tbe = context.get_context("device_target") == "Ascend" | |||
| self.is_gpu = context.get_context("device_target") == "GPU" | |||
| def infer_shape(self, x_shape): | |||
| out_shape = _Pool.infer_shape(self, x_shape) | |||
| @@ -1206,6 +1207,8 @@ class MaxPoolWithArgmax(_Pool): | |||
| out_dtype = x_dtype | |||
| validator.check_tensor_type_same({"x": x_dtype}, (mstype.float16, mstype.float32), self.name) | |||
| argmax_dtype = mstype.uint16 | |||
| if self.is_gpu: | |||
| argmax_dtype = mstype.int32 | |||
| return out_dtype, argmax_dtype | |||
| @@ -0,0 +1,147 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.context as context | |||
| import mindspore.nn as nn | |||
| from mindspore import Tensor | |||
| from mindspore.ops import operations as P | |||
| class Net_Pool(nn.Cell): | |||
| def __init__(self): | |||
| super(Net_Pool, self).__init__() | |||
| self.maxpool_fun = P.MaxPoolWithArgmax(ksize=2, strides=2, padding="VALID") | |||
| def construct(self, x): | |||
| return self.maxpool_fun(x) | |||
| class Net_Pool2(nn.Cell): | |||
| def __init__(self): | |||
| super(Net_Pool2, self).__init__() | |||
| self.maxpool_fun = P.MaxPoolWithArgmax(ksize=3, strides=2, padding="SAME") | |||
| def construct(self, x): | |||
| return self.maxpool_fun(x) | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.env_onecard | |||
| def test_maxpool_with_argmax_2d(): | |||
| x = Tensor(np.array([[[ | |||
| [0, 1, 2, 3, -4, -5], | |||
| [6, 7, 8, 9, -10, -11], | |||
| [12, 13, 14, -15, -16, -17], | |||
| [18, 19, 20, 21, 22, 23], | |||
| [24, 25, 26, 27, 28, 29], | |||
| [30, 31, 32, 33, 34, 35] | |||
| ]]]).astype(np.float32)) | |||
| expect_result = (np.array([[[ | |||
| [7, 9, -4], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]])) | |||
| expect_result2 = (np.array([[[ | |||
| [14, 14, -4], | |||
| [26, 28, 29], | |||
| [32, 34, 35] | |||
| ]]])) | |||
| expect_index_result = (np.array([[[ | |||
| [7, 9, 4], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]])) | |||
| expect__index_result2 = (np.array([[[ | |||
| [14, 14, 4], | |||
| [26, 28, 29], | |||
| [32, 34, 35] | |||
| ]]])) | |||
| context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU") | |||
| maxpool2d = Net_Pool() | |||
| maxpool2d2 = Net_Pool2() | |||
| output2, index2 = maxpool2d2(x) | |||
| output, index = maxpool2d(x) | |||
| assert (output.asnumpy() == expect_result).all() | |||
| assert (output2.asnumpy() == expect_result2).all() | |||
| assert (index.asnumpy() == expect_index_result).all() | |||
| assert (index2.asnumpy() == expect__index_result2).all() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| maxpool2d = Net_Pool() | |||
| maxpool2d2 = Net_Pool2() | |||
| output2, index2 = maxpool2d2(x) | |||
| output, index = maxpool2d(x) | |||
| assert (output.asnumpy() == expect_result).all() | |||
| assert (output2.asnumpy() == expect_result2).all() | |||
| assert (index.asnumpy() == expect_index_result).all() | |||
| assert (index2.asnumpy() == expect__index_result2).all() | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.env_onecard | |||
| def test_maxpool_with_argmax_2d_fp16(): | |||
| x = Tensor(np.array([[[ | |||
| [0, 1, 2, 3, -4, -5], | |||
| [6, 7, 8, 9, -10, -11], | |||
| [12, 13, 14, -15, -16, -17], | |||
| [18, 19, 20, 21, 22, 23], | |||
| [24, 25, 26, 27, 28, 29], | |||
| [30, 31, 32, 33, 34, 35] | |||
| ]]]).astype(np.float16)) | |||
| expect_result = (np.array([[[ | |||
| [7, 9, -4], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]])) | |||
| expect_result2 = (np.array([[[ | |||
| [14, 14, -4], | |||
| [26, 28, 29], | |||
| [32, 34, 35] | |||
| ]]])) | |||
| expect_index_result = (np.array([[[ | |||
| [7, 9, 4], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]])) | |||
| expect__index_result2 = (np.array([[[ | |||
| [14, 14, 4], | |||
| [26, 28, 29], | |||
| [32, 34, 35] | |||
| ]]])) | |||
| context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU") | |||
| maxpool2d = Net_Pool() | |||
| maxpool2d2 = Net_Pool2() | |||
| output2, index2 = maxpool2d2(x) | |||
| output, index = maxpool2d(x) | |||
| assert (output.asnumpy() == expect_result).all() | |||
| assert (output2.asnumpy() == expect_result2).all() | |||
| assert (index.asnumpy() == expect_index_result).all() | |||
| assert (index2.asnumpy() == expect__index_result2).all() | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| maxpool2d = Net_Pool() | |||
| maxpool2d2 = Net_Pool2() | |||
| output2, index2 = maxpool2d2(x) | |||
| output, index = maxpool2d(x) | |||
| assert (output.asnumpy() == expect_result).all() | |||
| assert (output2.asnumpy() == expect_result2).all() | |||
| assert (index.asnumpy() == expect_index_result).all() | |||
| assert (index2.asnumpy() == expect__index_result2).all() | |||
| @@ -0,0 +1,115 @@ | |||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||
| # | |||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||
| # you may not use this file except in compliance with the License. | |||
| # You may obtain a copy of the License at | |||
| # | |||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||
| # | |||
| # Unless required by applicable law or agreed to in writing, software | |||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| # See the License for the specific language governing permissions and | |||
| # limitations under the License. | |||
| # ============================================================================ | |||
| import numpy as np | |||
| import pytest | |||
| import mindspore.context as context | |||
| import mindspore.nn as nn | |||
| from mindspore import Tensor | |||
| from mindspore.ops.operations import _grad_ops as G | |||
| class Net_Pool_Grad(nn.Cell): | |||
| def __init__(self): | |||
| super(Net_Pool_Grad, self).__init__() | |||
| self.maxpool_grad_fun = G.MaxPoolGradWithArgmax(padding="VALID", ksize=2, strides=2) | |||
| def construct(self, x, dy, index): | |||
| return self.maxpool_grad_fun(x, dy, index) | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.env_onecard | |||
| def test_maxpool2d_grad(): | |||
| x = Tensor(np.array([[[ | |||
| [0, 1, 2, 3, 4, 5], | |||
| [6, 7, 8, 9, 10, 11], | |||
| [12, 13, 14, 15, 16, 17], | |||
| [18, 19, 20, 21, 22, 23], | |||
| [24, 25, 26, 27, 28, 29], | |||
| [30, 31, 32, 33, 34, 35] | |||
| ]]]).astype(np.float32)) | |||
| dy = Tensor(np.array([[[ | |||
| [0.7, 0.9, 0.11], | |||
| [0.19, 0.21, 0.23], | |||
| [0.31, 0.33, 0.35] | |||
| ]]]).astype(np.float32)) | |||
| index = Tensor(np.array([[[ | |||
| [7, 9, 11], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]]).astype(np.int32)) | |||
| expect_result = (np.array([[[ | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.7, 0., 0.9, 0., 0.11], | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.19, 0., 0.21, 0., 0.23], | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.31, 0., 0.33, 0., 0.35] | |||
| ]]])) | |||
| context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU") | |||
| maxpool2d_grad = Net_Pool_Grad() | |||
| output = maxpool2d_grad(x, dy, index) | |||
| assert np.allclose(expect_result, output.asnumpy()) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| maxpool2d_grad = Net_Pool_Grad() | |||
| output = maxpool2d_grad(x, dy, index) | |||
| assert np.allclose(expect_result, output.asnumpy()) | |||
| @pytest.mark.level0 | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.env_onecard | |||
| def test_maxpool2d_grad_fp16(): | |||
| x = Tensor(np.array([[[ | |||
| [0, 1, 2, 3, 4, 5], | |||
| [6, 7, 8, 9, 10, 11], | |||
| [12, 13, 14, 15, 16, 17], | |||
| [18, 19, 20, 21, 22, 23], | |||
| [24, 25, 26, 27, 28, 29], | |||
| [30, 31, 32, 33, 34, 35] | |||
| ]]]).astype(np.float16)) | |||
| dy = Tensor(np.array([[[ | |||
| [0.7, 0.9, 0.11], | |||
| [0.19, 0.21, 0.23], | |||
| [0.31, 0.33, 0.35] | |||
| ]]]).astype(np.float16)) | |||
| index = Tensor(np.array([[[ | |||
| [7, 9, 11], | |||
| [19, 21, 23], | |||
| [31, 33, 35] | |||
| ]]]).astype(np.int32)) | |||
| expect_result = np.array([[[ | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.7, 0., 0.9, 0., 0.11], | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.19, 0., 0.21, 0., 0.23], | |||
| [0., 0., 0., 0., 0., 0.], | |||
| [0., 0.31, 0., 0.33, 0., 0.35] | |||
| ]]]).astype(np.float16) | |||
| context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU") | |||
| maxpool2d_grad = Net_Pool_Grad() | |||
| output = maxpool2d_grad(x, dy, index) | |||
| assert np.allclose(expect_result, output.asnumpy()) | |||
| context.set_context(mode=context.GRAPH_MODE, device_target="GPU") | |||
| maxpool2d_grad = Net_Pool_Grad() | |||
| output = maxpool2d_grad(x, dy, index) | |||
| assert np.allclose(expect_result, output.asnumpy()) | |||