
fix gpu Split and Concat memory allocation bug

zhaoting, 5 years ago
commit b750e3e130 (tags/v0.6.0-beta)
6 changed files with 19 additions and 19 deletions:

  1. mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h (+5, -5)
  2. mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h (+2, -2)
  3. mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu (+5, -5)
  4. mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh (+1, -1)
  5. mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu (+5, -5)
  6. mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh (+1, -1)
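
Why the type change matters (this explanation and sketch are mine, not part of the commit): the kernels computed buffer sizes in int, so any tensor with more than INT_MAX elements produced a wrapped or truncated size, and the GPU buffers ended up under-allocated. A minimal stand-alone C++ sketch with a hypothetical shape; the narrowing cast stands in for the old int arithmetic, since real signed-int overflow is undefined behavior:

#include <climits>
#include <cstddef>
#include <cstdio>

int main() {
  // Hypothetical shape whose element count (3 * 2^30) exceeds INT_MAX.
  const size_t shape[] = {3, 1024, 1024, 1024};

  // Old pattern: the running element count lives in a 32-bit int.
  long long wide = 1;
  for (size_t d : shape) wide *= static_cast<long long>(d);
  int old_count = static_cast<int>(wide);  // narrows; typically wraps negative

  // New pattern (what this commit switches to): accumulate in size_t.
  size_t new_count = 1;
  for (size_t d : shape) new_count *= d;

  std::printf("INT_MAX      = %d\n", INT_MAX);
  std::printf("int count    = %d\n", old_count);                   // corrupted
  std::printf("size_t bytes = %zu\n", new_count * sizeof(float));  // 12 GiB, correct
  return 0;
}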

mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/concatv2_gpu_kernel.h (+5, -5)

@@ -74,12 +74,12 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
     inputs_host_ = std::make_unique<T *[]>(input_num_);
     len_axis_ = std::make_unique<int[]>(input_num_);
     for (int i = 0; i < input_num_; i++) {
-      int input_size = 1;
+      size_t input_size = 1;
       auto input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i);
       for (size_t j = 0; j < input_shape.size(); j++) {
-        input_size *= SizeToInt(input_shape[j]);
+        input_size *= input_shape[j];
       }
-      input_size_list_.push_back(IntToSize(input_size * sizeof(T)));
+      input_size_list_.push_back(input_size * sizeof(T));
       len_axis_[i] = SizeToInt(input_shape[axis_]);
     }
     workspace_size_list_.push_back(sizeof(T *) * input_num_);
@@ -97,7 +97,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
         all_size_before_axis_ *= output_shape[i];
       }
     }
-    output_size_list_.push_back(IntToSize(output_size_ * sizeof(T)));
+    output_size_list_.push_back(output_size_ * sizeof(T));

     InitSizeLists();
     return true;
@@ -117,7 +117,7 @@ class ConcatV2GpuFwdKernel : public GpuKernel {
   }
   int axis_;
   int input_num_;
-  int output_size_;
+  size_t output_size_;
   int all_size_before_axis_;
   int all_size_axis_;
   std::unique_ptr<T *[]> inputs_host_;


mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/split_gpu_kernel.h (+2, -2)

@@ -83,7 +83,7 @@ class SplitGpuFwdKernel : public GpuKernel {
         all_size_before_axis_ *= input_shape[i];
       }
     }
-    input_size_list_.push_back(IntToSize(input_size_ * sizeof(T)));
+    input_size_list_.push_back(input_size_ * sizeof(T));
     axis_step_ = input_shape[axis_] / output_num_;

     for (int i = 0; i < output_num_; i++) {
@@ -138,7 +138,7 @@ class SplitGpuFwdKernel : public GpuKernel {
   }
   int axis_;
   int output_num_;
-  int input_size_;
+  size_t input_size_;
   int axis_step_;
   int all_size_before_axis_;
   int all_size_axis_;


mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cu (+5, -5)

@@ -19,7 +19,7 @@
 #include <cuda_runtime.h>
 #include "backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh"
 template <typename T>
-__global__ void Concat(const int size, const int input_num,
+__global__ void Concat(const size_t size, const int input_num,
                        const int all_size_before_axis, const int all_size_axis,
                        int* len_axis, T** inputs, T* output) {
   for (int pos = blockIdx.x * blockDim.x + threadIdx.x; pos < (size); pos += blockDim.x * gridDim.x) {
@@ -45,7 +45,7 @@ __global__ void Concat(const int size, const int input_num,
 }

 template <typename T>
-void ConcatKernel(const int size, const int input_num,
+void ConcatKernel(const size_t size, const int input_num,
                   const int all_size_before_axis, const int all_size_axis,
                   int* len_axis, T** inputs, T* output,
                   cudaStream_t cuda_stream) {
@@ -55,15 +55,15 @@ void ConcatKernel(const int size, const int input_num,
   return;
 }

-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, float** inputs, float* output,
                            cudaStream_t cuda_stream);
-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, int** inputs, int* output,
                            cudaStream_t cuda_stream);
-template void ConcatKernel(const int size, const int input_num,
+template void ConcatKernel(const size_t size, const int input_num,
                            const int all_size_before_axis, const int all_size_axis,
                            int* len_axis, half** inputs, half* output,
                            cudaStream_t cuda_stream);
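
Both kernels iterate with a grid-stride loop, which is why widening the size parameter to size_t is sufficient: the launch geometry stays bounded while each thread strides across the whole buffer. A generic, self-contained CUDA sketch of the pattern (the Copy kernel and the fixed 256x256 launch are illustrative, not from this commit):

#include <cuda_runtime.h>

// Grid-stride loop over a size_t extent: each thread starts at its global
// index and advances by the total number of launched threads, so a bounded
// grid covers buffers with more than INT_MAX elements.
template <typename T>
__global__ void Copy(const size_t size, const T* in, T* out) {
  for (size_t pos = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       pos < size; pos += static_cast<size_t>(blockDim.x) * gridDim.x) {
    out[pos] = in[pos];
  }
}

int main() {
  const size_t n = size_t(1) << 20;  // small here; the same launch covers any n
  float *in = nullptr, *out = nullptr;
  cudaMalloc(&in, n * sizeof(float));
  cudaMalloc(&out, n * sizeof(float));
  Copy<<<256, 256>>>(n, in, out);  // fixed geometry; the loop handles the rest
  cudaDeviceSynchronize();
  cudaFree(in);
  cudaFree(out);
  return 0;
}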

mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/concatv2_impl.cuh (+1, -1)

@@ -19,7 +19,7 @@

 #include "runtime/device/gpu/cuda_common.h"
 template <typename T>
-void ConcatKernel(const int size, const int input_num,
+void ConcatKernel(const size_t size, const int input_num,
                   const int all_size_before_axis, const int all_size_axis,
                   int* len_axis, T** inputs, T* output,
                   cudaStream_t cuda_stream);


mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cu (+5, -5)

@@ -19,7 +19,7 @@
 #include <cuda_runtime.h>
 #include "backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh"
 template <typename T>
-__global__ void Split(const int size, const int axis_step, const int all_size_before_axis,
+__global__ void Split(const size_t size, const int axis_step, const int all_size_before_axis,
                       const int all_size_axis, const T* input, T** outputs) {
   for (size_t pos = blockIdx.x * blockDim.x + threadIdx.x; pos < size; pos += blockDim.x * gridDim.x) {
     int num = pos % all_size_before_axis / all_size_axis;
@@ -32,19 +32,19 @@ __global__ void Split(const int size, const int axis_step, const int all_size_be
 }

 template <typename T>
-void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                  const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream) {
   Split<<<GET_BLOCKS(size), GET_THREADS, 0, cuda_stream>>>(size, axis_step, all_size_before_axis,
                                                            all_size_axis, input, outputs);
   return;
 }

-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const float* input, float** outputs,
                           cudaStream_t cuda_stream);
-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const int* input, int** outputs,
                           cudaStream_t cuda_stream);
-template void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+template void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                           const int all_size_axis, const half* input, half** outputs,
                           cudaStream_t cuda_stream);

mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/split_impl.cuh (+1, -1)

@@ -19,6 +19,6 @@

 #include "runtime/device/gpu/cuda_common.h"
 template <typename T>
-void SplitKernel(const int size, const int axis_step, const int all_size_before_axis,
+void SplitKernel(const size_t size, const int axis_step, const int all_size_before_axis,
                  const int all_size_axis, const T* input, T** outputs, cudaStream_t cuda_stream);
 #endif // MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_SPLIT_H_
