stridedslice/stridedslicegrad 4D to 7D

5 years ago · e6335739d2
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_gpu_kernel.h
@@ -26,7 +26,7 @@

 namespace mindspore {
 namespace kernel {
 constexpr int MAX_DIMS = 4;
 constexpr int MAX_DIMS = 7;
 template <typename T>
 class StridedSliceGpuKernel : public GpuKernel {
 public:
@@ -65,8 +65,17 @@ class StridedSliceGpuKernel : public GpuKernel {

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
    output_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
    size_t size = sizeof(T);
    for (size_t i = 0; i < MAX_DIMS; i++) {
      size *= input_shape_[i];
    }
    input_size_list_.push_back(size);

    int size1 = sizeof(T);
    for (size_t i = 0; i < MAX_DIMS; i++) {
      size1 *= output_shape_[i];
    }
    output_size_list_.push_back(size1);
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/strided_slice_grad_gpu_kernel.h
@@ -26,7 +26,7 @@

 namespace mindspore {
 namespace kernel {
 constexpr int MAX_DIMS = 4;
 constexpr int MAX_DIMS = 7;
 template <typename T>
 class StridedSliceGradGpuKernel : public GpuKernel {
 public:
@@ -66,8 +66,17 @@ class StridedSliceGradGpuKernel : public GpuKernel {

 protected:
  void InitSizeLists() override {
    input_size_list_.push_back(output_shape_[0] * output_shape_[1] * output_shape_[2] * output_shape_[3] * sizeof(T));
    output_size_list_.push_back(input_shape_[0] * input_shape_[1] * input_shape_[2] * input_shape_[3] * sizeof(T));
    int size = sizeof(T);
    for (size_t i = 0; i < MAX_DIMS; i++) {
      size *= output_shape_[i];
    }
    input_size_list_.push_back(size);

    int size1 = sizeof(T);
    for (size_t i = 0; i < MAX_DIMS; i++) {
      size1 *= input_shape_[i];
    }
    output_size_list_.push_back(size1);
  }

 private:
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/slice_impl.cu
@@ -82,18 +82,25 @@ void CalSliceGrad(const size_t input_size, const T *dy, const std::vector<int> i
 }

 template <typename T>
 __global__ void StridedSliceKernel(const int b0, const int b1, const int b2, const int b3, const int s0, const int s1,
                                   const int s2, const int s3, const int i0, const int i1, const int i2, const int i3,
                                   const int o0, const int o1, const int o2, const int o3, const T *input_addr,
                                   T *output_addr) {
  int output_num = o0 * o1 * o2 * o3;
 __global__ void StridedSliceKernel(const int b0, const int b1, const int b2, const int b3, const int b4,
                                   const int b5, const int b6, const int s0, const int s1, const int s2,
                                   const int s3, const int s4, const int s5, const int s6, const int i0,
                                   const int i1, const int i2, const int i3, const int i4, const int i5,
                                   const int i6, const int o0, const int o1, const int o2, const int o3,
                                   const int o4, const int o5, const int o6, const T *input_addr, T *output_addr) {
  int output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
    int i = pos / (o1 * o2 * o3) % o0;
    int j = pos / (o2 * o3) % o1;
    int k = pos / o3 % o2;
    int l = pos % o3;
    int i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
    int j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
    int k = pos / (o3 * o4 * o5 * o6) % o2;
    int l = pos / (o4 * o5 * o6) % o3;
    int m = pos / (o5 * o6) % o4;
    int n = pos / (o6) % o5;
    int o = pos % o6;

    int input_idx = (i * s0 + b0) * i1 * i2 * i3 + (j * s1 + b1) * i2 * i3 + (k * s2 + b2) * i3 + (l * s3 + b3);
    int input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 \
                  + (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 \
                  + (n * s5 + b5) * i6 + (o * s6 + b6);
    output_addr[pos] = input_addr[input_idx];
  }
 }
@@ -102,26 +109,36 @@ template <typename T>
 void StridedSlice(const std::vector<size_t> &input_shape, const std::vector<int> &begin,
                  const std::vector<int> &strides, const std::vector<int> &output_shape, const T *input, T *output,
                  cudaStream_t cuda_stream) {
  int size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3];
  int size = output_shape[0] * output_shape[1] * output_shape[2] * output_shape[3]  \
           * output_shape[4] * output_shape[5] * output_shape[6];
  StridedSliceKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    begin[0], begin[1], begin[2], begin[3], strides[0], strides[1], strides[2], strides[3], input_shape[0],
    input_shape[1], input_shape[2], input_shape[3], output_shape[0], output_shape[1], output_shape[2], output_shape[3],
    input, output);
    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6],
    strides[0], strides[1], strides[2], strides[3], strides[4], strides[5], strides[6],
    input_shape[0], input_shape[1], input_shape[2], input_shape[3], input_shape[4], input_shape[5], input_shape[6],
    output_shape[0], output_shape[1], output_shape[2], output_shape[3], output_shape[4], output_shape[5],
    output_shape[6], input, output);
 }

 template <typename T>
 __global__ void StridedSliceGradKernel(const int b0, const int b1, const int b2, const int b3, const int s0,
                                       const int s1, const int s2, const int s3, const int i0, const int i1,
                                       const int i2, const int i3, const int o0, const int o1, const int o2,
                                       const int o3, const T *dy, T *dx) {
  int output_num = o0 * o1 * o2 * o3;
 __global__ void StridedSliceGradKernel(const int b0, const int b1, const int b2, const int b3, const int b4,
                                       const int b5, const int b6, const int s0, const int s1, const int s2,
                                       const int s3, const int s4, const int s5, const int s6, const int i0,
                                       const int i1, const int i2, const int i3, const int i4, const int i5,
                                       const int i6, const int o0, const int o1, const int o2, const int o3,
                                       const int o4, const int o5, const int o6, const T *dy, T *dx) {
  int output_num = o0 * o1 * o2 * o3 * o4 * o5 * o6;
  for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < output_num; pos += blockDim.x * gridDim.x) {
    int i = pos / (o1 * o2 * o3) % o0;
    int j = pos / (o2 * o3) % o1;
    int k = pos / o3 % o2;
    int l = pos % o3;
    int i = pos / (o1 * o2 * o3 * o4 * o5 * o6) % o0;
    int j = pos / (o2 * o3 * o4 * o5 * o6) % o1;
    int k = pos / (o3 * o4 * o5 * o6) % o2;
    int l = pos / (o4 * o5 * o6) % o3;
    int m = pos / (o5 * o6) % o4;
    int n = pos / (o6) % o5;
    int o = pos % o6;

    int input_idx = (i * s0 + b0) * i1 * i2 * i3 + (j * s1 + b1) * i2 * i3 + (k * s2 + b2) * i3 + (l * s3 + b3);
    int input_idx = (i * s0 + b0) * i1 * i2 * i3 * i4 * i5 * i6 + (j * s1 + b1) * i2 * i3 * i4 * i5 * i6 \
                  + (k * s2 + b2) * i3 * i4 * i5 * i6 + (l * s3 + b3) * i4 * i5 * i6 + (m * s4 + b4) * i5 * i6 \
                  + (n * s5 + b5) * i6 + (o * s6 + b6);
    dx[input_idx] = dy[pos];
  }
  return;
@@ -130,10 +147,13 @@ __global__ void StridedSliceGradKernel(const int b0, const int b1, const int b2,
 template <typename T>
 void StridedSliceGrad(const std::vector<int> &dy_shape, const std::vector<int> &begin, const std::vector<int> &strides,
                      const std::vector<int> &dx_shape, const T *dy, T *dx, cudaStream_t cuda_stream) {
  int size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3];
  int size = dy_shape[0] * dy_shape[1] * dy_shape[2] * dy_shape[3] * dy_shape[4] * dy_shape[5] * dy_shape[6];
  StridedSliceGradKernel<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(
    begin[0], begin[1], begin[2], begin[3], strides[0], strides[1], strides[2], strides[3], dx_shape[0], dx_shape[1],
    dx_shape[2], dx_shape[3], dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy, dx);
    begin[0], begin[1], begin[2], begin[3], begin[4], begin[5], begin[6],
    strides[0], strides[1], strides[2], strides[3], strides[4], strides[5], strides[6],
    dx_shape[0], dx_shape[1], dx_shape[2], dx_shape[3], dx_shape[4], dx_shape[5], dx_shape[6],
    dy_shape[0], dy_shape[1], dy_shape[2], dy_shape[3], dy_shape[4], dy_shape[5], dy_shape[6],
    dy, dx);
 }

 template void FillDeviceArray<float>(const size_t input_size, float *addr, const float value, cudaStream_t cuda_stream);
--- a/tests/st/ops/gpu/test_stridedslice_grad_op.py
+++ b/tests/st/ops/gpu/test_stridedslice_grad_op.py
@@ -274,3 +274,37 @@ def test_strided_slice_grad():
                        [0., 0., 0., 0., 0.],
                        [0., 0., 0., 0., 0.]]])
    assert np.allclose(dx[0].asnumpy(), expect)

    x = Tensor(np.arange(0, 1 * 1 * 1 * 2 * 3 * 4 * 5).reshape(1, 1, 1, 2, 3, 4, 5).astype(np.float32))
    net = StridedSliceNet((0, 0, 0, 1, 1, 2, 2), (1, 1, 1, 2, 3, 3, 4), (1, 1, 1, 1, 1, 1, 1))
    dx = GradData(net)(x)
    expect = np.array([[[[[[[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.]],

                           [[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.]],

                           [[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.]]],

                          [[[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.]],

                           [[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 1., 1., 0.],
                            [0., 0., 0., 0., 0.]],

                           [[0., 0., 0., 0., 0.],
                            [0., 0., 0., 0., 0.],
                            [0., 0., 1., 1., 0.],
                            [0., 0., 0., 0., 0.]]]]]]])
    assert np.allclose(dx[0].asnumpy(), expect)
--- a/tests/st/ops/gpu/test_stridedslice_op.py
+++ b/tests/st/ops/gpu/test_stridedslice_op.py
@@ -93,3 +93,15 @@ def test_stridedslice():
    y = Tensor(x_np)[:, ::-1]
    expect = x_np[:, ::-1]
    assert np.allclose(y.asnumpy(), expect)

    x = Tensor(np.arange(0, 2 * 3 * 4 * 5 * 4 * 3 * 2).reshape(2, 3, 4, 5, 4, 3, 2).astype(np.float32))
    y = P.StridedSlice()(x, (1, 0, 0, 2, 1, 2, 0), (2, 2, 2, 4, 2, 3, 2), (1, 1, 1, 1, 1, 1, 2))
    expect = np.array([[[[[[[1498.]]],
                          [[[1522.]]]],
                         [[[[1618.]]],
                          [[[1642.]]]]],
                        [[[[[1978.]]],
                          [[[2002.]]]],
                         [[[[2098.]]],
                          [[[2122.]]]]]]])
    assert np.allclose(y.asnumpy(), expect)