From 8c7cc7943de8574d3eee5b75ebded24557368335 Mon Sep 17 00:00:00 2001
From: Danish Farid
Date: Tue, 15 Sep 2020 18:01:45 -0400
Subject: [PATCH] NMS perf boost

fix: revert the hash to orig
comment fix
remove area array
changed first workspace_size variable initialization
fixed variable name cases
---
 .../gpu/cuda_impl/nms_with_mask_impl.cu       | 83 ++++++++++---------
 .../gpu/cuda_impl/nms_with_mask_impl.cuh      |  8 +-
 .../gpu/math/nms_with_mask_gpu_kernel.h       | 18 ++--
 3 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
index 0dbe67823f..413c5f0d4a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
@@ -18,7 +18,7 @@
 #include <limits>
 #include <algorithm>
 
-int NMSRoundUpPower2(int v) {
+int NmsRoundUpPower2(int v) {
   v--;
   v |= v >> 1;
   v |= v >> 2;
@@ -46,12 +46,12 @@ __global__ void MaskInit(int numSq, bool *row_mask) {
 
 // copy data from input to output array sorted by indices returned from bitonic sort
 // flips boxes if asked to, default - false -> if (x1/y1 > x2/y2)
 template <typename T>
-__global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size_,
+__global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size,
                                bool flip_mode = false) {
   for (int box_num = blockIdx.x * blockDim.x + threadIdx.x; box_num < num; box_num += blockDim.x * gridDim.x) {
     int correct_index = index_buff[(num - 1) - box_num];  // flip the array around
-    int correct_arr_start = correct_index * box_size_;
-    int current_arr_start = box_num * box_size_;
+    int correct_arr_start = correct_index * box_size;
+    int current_arr_start = box_num * box_size;
     if (flip_mode) {  // flip boxes
       // check x
       if (data_in[correct_arr_start + 0] > data_in[correct_arr_start + 2]) {
@@ -79,7 +79,7 @@ __global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const i
 }
 
 template <typename T>
-__inline__ __device__ bool IOUDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start, T *area,
+__inline__ __device__ bool IouDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start,
                                        float IOU_value) {
   T x_1 = max(output[box_A_start + 0], output[box_B_start + 0]);
   T y_1 = max(output[box_A_start + 1], output[box_B_start + 1]);
@@ -87,37 +87,37 @@ __inline__ __device__ bool IOUDecision(T *output, int box_A_ix, int box_B_ix, in
   T y_2 = min(output[box_A_start + 3], output[box_B_start + 3]);
   T width = max(x_2 - x_1, T(0));  // in case of no overlap
   T height = max(y_2 - y_1, T(0));
-  T combined_area = area[box_A_ix] + area[box_B_ix];
-  // return decision to keep or remove box
+
+  T area1 = (output[box_A_start + 2] - output[box_A_start + 0]) * (output[box_A_start + 3] - output[box_A_start + 1]);
+  T area2 = (output[box_B_start + 2] - output[box_B_start + 0]) * (output[box_B_start + 3] - output[box_B_start + 1]);
+
+  T combined_area = area1 + area2;
   return !(((width * height) / (combined_area - (width * height))) > IOU_value);
 }
 
-// calculate areas for boxes -> sorted by output boxes
 // populated return mask (init to all true) and return index array
 template <typename T>
-__global__ void Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *output, int box_size_) {
+__global__ void Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *output, int box_size) {
   for (int box_num = blockIdx.x * blockDim.x + threadIdx.x; box_num < num; box_num += blockDim.x * gridDim.x) {
     sel_idx[box_num] = box_num;
     sel_boxes[box_num] = true;
-    area[box_num] = (output[(box_num * box_size_) + 2] - output[(box_num * box_size_) + 0]) *
-                    (output[(box_num * box_size_) + 3] - output[(box_num * box_size_) + 1]);
   }
 }
 
 // Run parallel NMS pass
-// Every box updates it's own mask in row_mask in sep threads
+// Every position in the row_mask array is updated with the correct IOU decision after being init to all true
 template <typename T>
-__global__ void NMSPass(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_,
+__global__ void NmsPass(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size,
                         bool *row_mask) {
-  int box_i_start_index, box_j_start_index;  // actual input data indexing
-  int mask_offset = 0;
-  for (int box_i = blockIdx.x * blockDim.x + threadIdx.x; box_i < num - 1; box_i += blockDim.x * gridDim.x) {
-    mask_offset = box_i * num;
-    box_i_start_index = box_i * box_size_;  // adjust starting index
-    for (int box_j = box_i + 1; box_j < num; box_j++) {
-      box_j_start_index = box_j * box_size_;
-      row_mask[mask_offset + box_j] =
-        IOUDecision(output, box_i, box_j, box_i_start_index, box_j_start_index, area, IOU_value);
+  int box_i, box_j, box_i_start_index, box_j_start_index;  // actual input data indexing
+  for (int mask_index = blockIdx.x * blockDim.x + threadIdx.x; mask_index < num * num;
+       mask_index += blockDim.x * gridDim.x) {
+    box_i = mask_index / num;                // row in 2d row_mask array
+    box_j = mask_index % num;                // col in 2d row_mask array
+    if (box_j > box_i) {                     // skip when box_j index is lower/equal to box_i - will remain true
+      box_i_start_index = box_i * box_size;  // adjust starting indices
+      box_j_start_index = box_j * box_size;
+      row_mask[mask_index] = IouDecision(output, box_i, box_j, box_i_start_index, box_j_start_index, IOU_value);
     }
   }
 }
@@ -139,10 +139,10 @@ __global__ void ReducePass(const int num, bool *sel_boxes, bool *row_mask) {
 
 // Sorting function based on BitonicSort from TopK kernel
 template <typename T>
-__global__ void NMS_BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
-                                           T *data_buff, int *index_buff, int box_size_) {
+__global__ void NmsBitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
+                                          T *data_buff, int *index_buff, int box_size) {
   for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
-    data_buff[i] = (i < inner) ? input[(i * box_size_) + 4] : std::numeric_limits<T>::max();
+    data_buff[i] = (i < inner) ? input[(i * box_size) + 4] : std::numeric_limits<T>::max();
     index_buff[i] = i;
   }
   __syncthreads();
@@ -171,37 +171,38 @@ __global__ void NMS_BitonicSortByKeyKernel(const int outer, const int inner, con
 }
 
 template <typename T>
-void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *input, T *output, int *index_buff,
-                   int box_size_, bool *row_mask, cudaStream_t cuda_stream) {
+void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size,
+                   bool *row_mask, cudaStream_t cuda_stream) {
   int total_val = num * num;
   MaskInit<<<GET_BLOCKS(total_val), GET_THREADS, 0, cuda_stream>>>(total_val, row_mask);
   // default for flipping boxes -> false (provision available to flip if API updated)
-  PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, box_size_, false);
-  Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, area, output, box_size_);
+  PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, box_size, false);
+  Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, output, box_size);
 }
 
 template <typename T>
-void CalSort(const int &num, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size_,
+void CalSort(const int &num, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size,
              cudaStream_t stream) {
-  int ceil_p_2 = NMSRoundUpPower2(num);
+  int ceil_p_2 = NmsRoundUpPower2(num);
   int thread = std::min(ceil_p_2, GET_THREADS);
-  NMS_BitonicSortByKeyKernel<<<1, thread, 0, stream>>>(1, num, ceil_p_2, data_in, data_buff, index_buff, box_size_);
+  NmsBitonicSortByKeyKernel<<<1, thread, 0, stream>>>(1, num, ceil_p_2, data_in, data_buff, index_buff, box_size);
 }
 
 template <typename T>
-void CalNMS(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_, bool *row_mask,
+void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size, bool *row_mask,
             cudaStream_t cuda_stream) {
-  NMSPass<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, area, sel_boxes, box_size_,
-                                                            row_mask);
+  // run kernel for every position in row_mask array = (num * num) size
+  int row_mask_size = num * num;
+  NmsPass<<<GET_BLOCKS(row_mask_size), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, sel_boxes, box_size,
+                                                                      row_mask);
   ReducePass<<<1, GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask);
 }
 
 template void CalSort<float>(const int &inner, float *data_in, float *data_out, int *index_buff, float *data_buff,
-                             int box_size_, cudaStream_t stream);
+                             int box_size, cudaStream_t stream);
 
-template void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *area, float *input,
-                                   float *output, int *index_buff, int box_size_, bool *row_mask,
-                                   cudaStream_t cuda_stream);
+template void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *input, float *output,
+                                   int *index_buff, int box_size, bool *row_mask, cudaStream_t cuda_stream);
 
-template void CalNMS<float>(const int num, const float IOU_value, float *output, float *area, bool *sel_boxes,
-                            int box_size_, bool *row_mask, cudaStream_t cuda_stream);
+template void CalNms<float>(const int num, const float IOU_value, float *output, bool *sel_boxes, int box_size,
+                            bool *row_mask, cudaStream_t cuda_stream);
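Reviewer note (illustrative sketch, not part of the patch): the perf boost comes from flattening the NMS pass. NmsPass now runs one thread per cell of the num * num row_mask array and recovers its (box_i, box_j) pair from the flat index, whereas the old NMSPass gave each box_i a serial inner loop over box_j, so threads with small box_i did far more work. Dropping the area array trades a little redundant arithmetic in IouDecision for one less workspace buffer and fewer global-memory reads per decision. The standalone CUDA sketch below demonstrates the same indexing pattern; the names (PairwiseMask, IouKeep) and the three-box test are hypothetical, not taken from the patch.

#include <cstdio>
#include <cuda_runtime.h>

// Pairwise IOU keep/suppress decision; areas are recomputed per pair, matching
// the patched IouDecision, which no longer reads a precomputed area array.
__device__ bool IouKeep(const float *boxes, int a, int b, float iou_thresh) {
  const float *box_a = boxes + a * 5;  // box layout: x1, y1, x2, y2, score
  const float *box_b = boxes + b * 5;
  float x1 = fmaxf(box_a[0], box_b[0]), y1 = fmaxf(box_a[1], box_b[1]);
  float x2 = fminf(box_a[2], box_b[2]), y2 = fminf(box_a[3], box_b[3]);
  float w = fmaxf(x2 - x1, 0.0f), h = fmaxf(y2 - y1, 0.0f);  // no-overlap guard
  float area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  float area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  float inter = w * h;
  return !(inter / (area_a + area_b - inter) > iou_thresh);
}

// One grid-stride loop over all num*num mask cells: each thread derives its
// (row, col) pair from the flat index, so the work per thread is a single IOU
// test instead of the old one-thread-per-row serial inner loop.
__global__ void PairwiseMask(int num, float iou_thresh, const float *boxes, bool *row_mask) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num * num;
       idx += blockDim.x * gridDim.x) {
    int i = idx / num;  // row in the conceptual 2-D mask
    int j = idx % num;  // col in the conceptual 2-D mask
    if (j > i) {        // cells at or below the diagonal stay true
      row_mask[idx] = IouKeep(boxes, i, j, iou_thresh);
    }
  }
}

int main() {
  const int num = 3;
  float boxes[num * 5] = {0,  0,  10, 10, 0.9f,   // box 0
                          1,  1,  11, 11, 0.8f,   // box 1: heavy overlap with box 0
                          50, 50, 60, 60, 0.7f};  // box 2: disjoint
  float *d_boxes;
  bool *d_mask;
  cudaMalloc(&d_boxes, sizeof(boxes));
  cudaMalloc(&d_mask, num * num * sizeof(bool));
  cudaMemcpy(d_boxes, boxes, sizeof(boxes), cudaMemcpyHostToDevice);
  cudaMemset(d_mask, 1, num * num * sizeof(bool));  // mask starts all true
  PairwiseMask<<<(num * num + 255) / 256, 256>>>(num, 0.5f, d_boxes, d_mask);
  bool mask[num * num];
  cudaMemcpy(mask, d_mask, sizeof(mask), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num; i++) {
    for (int j = 0; j < num; j++) printf("%d ", mask[i * num + j]);
    printf("\n");  // expected: only cell (0,1) is 0 (box 1 suppressed by box 0)
  }
  cudaFree(d_boxes);
  cudaFree(d_mask);
  return 0;
}

Compiled with nvcc, the sketch prints a 3x3 mask in which only cell (0,1) is cleared; a reduce pass over such a mask then yields the per-box keep flags, as ReducePass does in the patched file.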
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
index f3a81f73c2..c1f89b0461 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
@@ -24,13 +24,13 @@ void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data
              cudaStream_t stream);
 
 template <typename T>
-void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *input, T *output, int *index_buff,
-                   int box_size_, bool *row_mask, cudaStream_t cuda_stream);
+void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size_,
+                   bool *row_mask, cudaStream_t cuda_stream);
 
 template <typename T>
-void CalNMS(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_, bool *row_mask,
+void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size_, bool *row_mask,
             cudaStream_t cuda_stream);
 
-int NMSRoundUpPower2(int v);
+int NmsRoundUpPower2(int v);
 
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
index d219b9d0ca..697187ce2e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
@@ -41,19 +41,17 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
     T *input = GetDeviceAddress<T>(inputs, 0);
-    T *area = GetDeviceAddress<T>(workspace, 0);
-    T *data_buff = GetDeviceAddress<T>(workspace, 1);
-    int *index_buff = GetDeviceAddress<int>(workspace, 2);
-    bool *row_mask = GetDeviceAddress<bool>(workspace, 3);
+    T *data_buff = GetDeviceAddress<T>(workspace, 0);
+    int *index_buff = GetDeviceAddress<int>(workspace, 1);
+    bool *row_mask = GetDeviceAddress<bool>(workspace, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
     int *sel_idx = GetDeviceAddress<int>(outputs, 1);
     bool *sel_boxes = GetDeviceAddress<bool>(outputs, 2);
 
     CalSort(num_input_, input, output, index_buff, data_buff, box_size_, reinterpret_cast<cudaStream_t>(stream_ptr));
-    CalPreprocess(num_input_, sel_idx, sel_boxes, area, input, output, index_buff, box_size_, row_mask,
+    CalPreprocess(num_input_, sel_idx, sel_boxes, input, output, index_buff, box_size_, row_mask,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    CalNMS(num_input_, iou_value_, output, area, sel_boxes, box_size_, row_mask,
-           reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalNms(num_input_, iou_value_, output, sel_boxes, box_size_, row_mask, reinterpret_cast<cudaStream_t>(stream_ptr));
 
     return true;
   }
@@ -80,13 +78,12 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
     }
 
     num_input_ = input_shape[0];  // Get N value in [N,5] data
-    ceil_power_2 = NMSRoundUpPower2(num_input_);
+    ceil_power_2 = NmsRoundUpPower2(num_input_);
 
     input_size_ = num_input_ * sizeof(T) * box_size_;  // 5 values per bbox
    output_size_ = (input_size_) + (num_input_ * sizeof(int)) + (num_input_ * sizeof(bool));
 
-    workspace_size_ = num_input_ * sizeof(int);                   // storing areas
-    workspace_size_ += ceil_power_2 * (sizeof(T) + sizeof(int));  // sorting buffers
+    workspace_size_ = ceil_power_2 * (sizeof(T) + sizeof(int));   // sorting buffers
     workspace_size_ += (num_input_ * num_input_ * sizeof(bool));  // Row mask - NMS
 
     InitSizeLists();
@@ -102,7 +99,6 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
     output_size_list_.push_back(num_input_ * sizeof(bool));
 
     // N sized workspace arrs
-    workspace_size_list_.push_back(num_input_ * sizeof(T));                  // area list
     workspace_size_list_.push_back(ceil_power_2 * sizeof(T));                // data buff
     workspace_size_list_.push_back(ceil_power_2 * sizeof(int));              // index buff
     workspace_size_list_.push_back(num_input_ * num_input_ * sizeof(bool));  // mask list
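Reviewer note (illustrative, not part of the patch): the workspace shrink in nms_with_mask_gpu_kernel.h falls out of the same change - with the area list gone, only the power-of-two-padded sort buffers and the N*N row mask remain. The self-contained C++ sketch below reproduces that sizing arithmetic for a hypothetical N = 100 with T = float; the round-up body beyond the lines visible in the first hunk is assumed to follow the standard 32-bit bit-smearing idiom.

#include <cstdio>

// The same bit-smearing round-up idiom the patch renames to NmsRoundUpPower2:
// subtract one, propagate the highest set bit into every lower bit, add one.
int NmsRoundUpPower2(int v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v++;
  return v;
}

int main() {
  const int num_input = 100;  // hypothetical N (boxes in the [N,5] input)
  const int ceil_power_2 = NmsRoundUpPower2(num_input);  // 128

  // Mirrors the new workspace math with T = float: sort buffers are padded to
  // a power of two for the bitonic sort, then the N*N row mask follows; the
  // old N-element area buffer no longer contributes.
  size_t workspace_size = ceil_power_2 * (sizeof(float) + sizeof(int));  // data + index buff
  workspace_size += static_cast<size_t>(num_input) * num_input * sizeof(bool);  // row mask

  printf("ceil_power_2 = %d, workspace = %zu bytes\n", ceil_power_2, workspace_size);
  return 0;  // prints: ceil_power_2 = 128, workspace = 11024 bytes
}

Padding to a power of two keeps the bitonic sort's compare-and-swap network valid for any N; the sentinel std::numeric_limits<T>::max() entries in the padded tail sort past every real score, so they never displace a real box.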