You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

gather.cu 2.4 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #include <iostream>
  17. #include "kernel/gpu/cuda_impl/gather.cuh"
  18. #include "device/gpu/cuda_common.h"
  // Gathers slices of `input` along its middle axis into `output`.
  //
  // `output` is addressed as a flat array of output_dim0 * output_dim1 * output_dim2
  // elements. Each flat position is decomposed into (i, j, k) and filled from
  //   input[i * input_dim1 * output_dim2 + indices[j] * output_dim2 + k].
  // When indices[j] lies outside [0, input_dim1) the element is set to 0 instead.
  //
  // Launch layout: 1-D grid of 1-D blocks. The loop advances by the total number of
  // launched threads, so any positive grid/block size covers the whole output.
  // No shared memory is used.
  //
  // All index arithmetic is carried out in size_t so that element counts and
  // offsets beyond the range of a 32-bit int do not overflow.
  template <typename T, typename S>
  __global__ void GatherKernel(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1,
                               size_t output_dim2, size_t input_dim1) {
    const size_t num = output_dim0 * output_dim1 * output_dim2;
    // Widen before multiplying: blockIdx.x * blockDim.x is otherwise evaluated in 32-bit unsigned arithmetic.
    const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
    for (size_t write_index = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x; write_index < num;
         write_index += stride) {
      // Decompose the flat output position into its three coordinates.
      const size_t i = write_index / (output_dim1 * output_dim2) % output_dim0;
      const size_t j = write_index / output_dim2 % output_dim1;
      const size_t k = write_index % output_dim2;
      // Read the index once; it is used both for the range check and for the offset.
      const S index = indices[j];
      if ((index >= 0) && (static_cast<size_t>(index) < input_dim1)) {
        const size_t read_index = i * input_dim1 * output_dim2 + static_cast<size_t>(index) * output_dim2 + k;
        output[write_index] = input[read_index];
      } else {
        // Out-of-range index: write zero rather than reading outside `input`.
        output[write_index] = 0;
      }
    }
    return;
  }
  // Host-side launcher for GatherKernel.
  //
  // Enqueues the gather on `stream` and returns without synchronizing. The pointer
  // arguments are forwarded to the kernel unchanged.
  //
  // The output element count is computed in size_t. The value handed to GET_BLOCKS
  // is capped at 0x7fffffff so that a very large count is never narrowed into a
  // negative or wrapped int; GatherKernel walks the output with a stride of the total
  // launched thread count, so a capped grid still visits every element.
  template <typename T, typename S>
  void Gather(T *input, S *indices, T *output, size_t output_dim0, size_t output_dim1, size_t output_dim2,
              size_t input_dim1, cudaStream_t stream) {
    const size_t total = output_dim0 * output_dim1 * output_dim2;
    if (total == 0) {
      // Nothing to write; skip the launch instead of relying on what GET_BLOCKS yields for 0.
      return;
    }
    const size_t max_launch_size = 0x7fffffff;
    const int size = static_cast<int>(total < max_launch_size ? total : max_launch_size);
    GatherKernel<<<GET_BLOCKS(size), GET_THREADS, 0, stream>>>(input, indices, output, output_dim0, output_dim1,
                                                               output_dim2, input_dim1);
    return;
  }
  // Explicit instantiations of Gather emitted from this translation unit:
  // float and half data, each with int indices.
  template void Gather<float, int>(float *, int *, float *, size_t, size_t, size_t, size_t, cudaStream_t);
  template void Gather<half, int>(half *, int *, half *, size_t, size_t, size_t, size_t, cudaStream_t);