From 8c7cc7943de8574d3eee5b75ebded24557368335 Mon Sep 17 00:00:00 2001
From: Danish Farid
Date: Tue, 15 Sep 2020 18:01:45 -0400
Subject: [PATCH] NMS perf boost

fix: revert the hash to orig
comment fix
remove area array
changed first workspace_size variable initialization
fixed variable name cases
---
 .../gpu/cuda_impl/nms_with_mask_impl.cu       | 83 ++++++++++---------
 .../gpu/cuda_impl/nms_with_mask_impl.cuh      |  8 +-
 .../gpu/math/nms_with_mask_gpu_kernel.h       | 18 ++--
 3 files changed, 53 insertions(+), 56 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
index 0dbe67823f..413c5f0d4a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cu
@@ -18,7 +18,7 @@
 #include <limits>
 #include <algorithm>
 
-int NMSRoundUpPower2(int v) {
+int NmsRoundUpPower2(int v) {
   v--;
   v |= v >> 1;
   v |= v >> 2;
@@ -46,12 +46,12 @@ __global__ void MaskInit(int numSq, bool *row_mask) {
 
 // copy data from input to output array sorted by indices returned from bitonic sort
 // flips boxes if asked to, default - false -> if (x1/y1 > x2/y2)
 template <typename T>
-__global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size_,
+__global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const int num, int box_size,
                                bool flip_mode = false) {
   for (int box_num = blockIdx.x * blockDim.x + threadIdx.x; box_num < num; box_num += blockDim.x * gridDim.x) {
     int correct_index = index_buff[(num - 1) - box_num];  // flip the array around
-    int correct_arr_start = correct_index * box_size_;
-    int current_arr_start = box_num * box_size_;
+    int correct_arr_start = correct_index * box_size;
+    int current_arr_start = box_num * box_size;
     if (flip_mode) {  // flip boxes
       // check x
       if (data_in[correct_arr_start + 0] > data_in[correct_arr_start + 2]) {
@@ -79,7 +79,7 @@ __global__ void PopulateOutput(T *data_in, T *data_out, int *index_buff, const i
 }
 
 template <typename T>
-__inline__ __device__ bool IOUDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start, T *area,
+__inline__ __device__ bool IouDecision(T *output, int box_A_ix, int box_B_ix, int box_A_start, int box_B_start,
                                        float IOU_value) {
   T x_1 = max(output[box_A_start + 0], output[box_B_start + 0]);
   T y_1 = max(output[box_A_start + 1], output[box_B_start + 1]);
@@ -87,37 +87,37 @@ __inline__ __device__ bool IOUDecision(T *output, int box_A_ix, int box_B_ix, in
   T y_2 = min(output[box_A_start + 3], output[box_B_start + 3]);
   T width = max(x_2 - x_1, T(0));  // in case of no overlap
   T height = max(y_2 - y_1, T(0));
-  T combined_area = area[box_A_ix] + area[box_B_ix];
-  // return decision to keep or remove box
+
+  T area1 = (output[box_A_start + 2] - output[box_A_start + 0]) * (output[box_A_start + 3] - output[box_A_start + 1]);
+  T area2 = (output[box_B_start + 2] - output[box_B_start + 0]) * (output[box_B_start + 3] - output[box_B_start + 1]);
+
+  T combined_area = area1 + area2;
   return !(((width * height) / (combined_area - (width * height))) > IOU_value);
 }
 
-// calculate areas for boxes -> sorted by output boxes
 // populated return mask (init to all true) and return index array
 template <typename T>
-__global__ void Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *output, int box_size_) {
+__global__ void Preprocess(const int num, int *sel_idx, bool *sel_boxes, T *output, int box_size) {
   for (int box_num = blockIdx.x * blockDim.x + threadIdx.x; box_num < num; box_num += blockDim.x * gridDim.x) {
     sel_idx[box_num] = box_num;
     sel_boxes[box_num] = true;
-    area[box_num] = (output[(box_num * box_size_) + 2] - output[(box_num * box_size_) + 0]) *
-                    (output[(box_num * box_size_) + 3] - output[(box_num * box_size_) + 1]);
   }
 }
 
 // Run parallel NMS pass
-// Every box updates it's own mask in row_mask in sep threads
+// Every position in the row_mask array is updated with the correct IOU decision after being init to all true
 template <typename T>
-__global__ void NMSPass(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_,
+__global__ void NmsPass(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size,
                         bool *row_mask) {
-  int box_i_start_index, box_j_start_index;  // actual input data indexing
-  int mask_offset = 0;
-  for (int box_i = blockIdx.x * blockDim.x + threadIdx.x; box_i < num - 1; box_i += blockDim.x * gridDim.x) {
-    mask_offset = box_i * num;
-    box_i_start_index = box_i * box_size_;  // adjust starting index
-    for (int box_j = box_i + 1; box_j < num; box_j++) {
-      box_j_start_index = box_j * box_size_;
-      row_mask[mask_offset + box_j] =
-        IOUDecision(output, box_i, box_j, box_i_start_index, box_j_start_index, area, IOU_value);
+  int box_i, box_j, box_i_start_index, box_j_start_index;  // actual input data indexing
+  for (int mask_index = blockIdx.x * blockDim.x + threadIdx.x; mask_index < num * num;
+       mask_index += blockDim.x * gridDim.x) {
+    box_i = mask_index / num;                // row in 2d row_mask array
+    box_j = mask_index % num;                // col in 2d row_mask array
+    if (box_j > box_i) {                     // skip when box_j index is lower/equal to box_i - will remain true
+      box_i_start_index = box_i * box_size;  // adjust starting indices
+      box_j_start_index = box_j * box_size;
+      row_mask[mask_index] = IouDecision(output, box_i, box_j, box_i_start_index, box_j_start_index, IOU_value);
     }
   }
 }
@@ -139,10 +139,10 @@ __global__ void ReducePass(const int num, bool *sel_boxes, bool *row_mask) {
 
 // Sorting function based on BitonicSort from TopK kernel
 template <typename T>
-__global__ void NMS_BitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
-                                           T *data_buff, int *index_buff, int box_size_) {
+__global__ void NmsBitonicSortByKeyKernel(const int outer, const int inner, const int ceil_power2, T *input,
+                                          T *data_buff, int *index_buff, int box_size) {
   for (int i = threadIdx.x; i < ceil_power2; i += blockDim.x) {
-    data_buff[i] = (i < inner) ? input[(i * box_size_) + 4] : std::numeric_limits<T>::max();
+    data_buff[i] = (i < inner) ? input[(i * box_size) + 4] : std::numeric_limits<T>::max();
     index_buff[i] = i;
   }
   __syncthreads();
@@ -171,37 +171,38 @@ __global__ void NMS_BitonicSortByKeyKernel(const int outer, const int inner, con
 }
 
 template <typename T>
-void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *input, T *output, int *index_buff,
-                   int box_size_, bool *row_mask, cudaStream_t cuda_stream) {
+void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size,
+                   bool *row_mask, cudaStream_t cuda_stream) {
   int total_val = num * num;
   MaskInit<<<GET_BLOCKS(total_val), GET_THREADS, 0, cuda_stream>>>(total_val, row_mask);
   // default for flipping boxes -> false (provision available to flip if API updated)
-  PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, box_size_, false);
-  Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, area, output, box_size_);
+  PopulateOutput<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(input, output, index_buff, num, box_size, false);
+  Preprocess<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, sel_idx, sel_boxes, output, box_size);
 }
 
 template <typename T>
-void CalSort(const int &num, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size_,
+void CalSort(const int &num, T *data_in, T *data_out, int *index_buff, T *data_buff, int box_size,
              cudaStream_t stream) {
-  int ceil_p_2 = NMSRoundUpPower2(num);
+  int ceil_p_2 = NmsRoundUpPower2(num);
   int thread = std::min(ceil_p_2, GET_THREADS);
-  NMS_BitonicSortByKeyKernel<<<1, thread, 0, stream>>>(1, num, ceil_p_2, data_in, data_buff, index_buff, box_size_);
+  NmsBitonicSortByKeyKernel<<<1, thread, 0, stream>>>(1, num, ceil_p_2, data_in, data_buff, index_buff, box_size);
 }
 
 template <typename T>
-void CalNMS(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_, bool *row_mask,
+void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size, bool *row_mask,
             cudaStream_t cuda_stream) {
-  NMSPass<<<GET_BLOCKS(num), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, area, sel_boxes, box_size_,
-                                                            row_mask);
+  // run kernel for every position in row_mask array = (num * num) size
+  int row_mask_size = num * num;
+  NmsPass<<<GET_BLOCKS(row_mask_size), GET_THREADS, 0, cuda_stream>>>(num, IOU_value, output, sel_boxes, box_size,
+                                                                      row_mask);
   ReducePass<<<1, GET_THREADS, 0, cuda_stream>>>(num, sel_boxes, row_mask);
 }
 
 template void CalSort<float>(const int &inner, float *data_in, float *data_out, int *index_buff, float *data_buff,
-                             int box_size_, cudaStream_t stream);
+                             int box_size, cudaStream_t stream);
 
-template void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *area, float *input,
-                                   float *output, int *index_buff, int box_size_, bool *row_mask,
-                                   cudaStream_t cuda_stream);
+template void CalPreprocess<float>(const int num, int *sel_idx, bool *sel_boxes, float *input, float *output,
+                                   int *index_buff, int box_size, bool *row_mask, cudaStream_t cuda_stream);
 
-template void CalNMS<float>(const int num, const float IOU_value, float *output, float *area, bool *sel_boxes,
-                            int box_size_, bool *row_mask, cudaStream_t cuda_stream);
+template void CalNms<float>(const int num, const float IOU_value, float *output, bool *sel_boxes, int box_size,
+                            bool *row_mask, cudaStream_t cuda_stream);
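Reviewer note (illustrative sketch, not part of the patch): the perf boost comes from flattening the NMS pass. NmsPass now runs one thread per cell of the num * num row_mask array and recovers its (box_i, box_j) pair from the flat index, whereas the old NMSPass gave each box_i a serial inner loop over box_j, so threads with small box_i did far more work. Dropping the area array trades a little redundant arithmetic in IouDecision for one less workspace buffer and fewer global-memory reads per decision. The standalone CUDA sketch below demonstrates the same indexing pattern; the names (PairwiseMask, IouKeep) and the three-box test are hypothetical, not taken from the patch.

#include <cstdio>
#include <cuda_runtime.h>

// Pairwise IOU keep/suppress decision; areas are recomputed per pair, matching
// the patched IouDecision, which no longer reads a precomputed area array.
__device__ bool IouKeep(const float *boxes, int a, int b, float iou_thresh) {
  const float *box_a = boxes + a * 5;  // box layout: x1, y1, x2, y2, score
  const float *box_b = boxes + b * 5;
  float x1 = fmaxf(box_a[0], box_b[0]), y1 = fmaxf(box_a[1], box_b[1]);
  float x2 = fminf(box_a[2], box_b[2]), y2 = fminf(box_a[3], box_b[3]);
  float w = fmaxf(x2 - x1, 0.0f), h = fmaxf(y2 - y1, 0.0f);  // no-overlap guard
  float area_a = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1]);
  float area_b = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1]);
  float inter = w * h;
  return !(inter / (area_a + area_b - inter) > iou_thresh);
}

// One grid-stride loop over all num*num mask cells: each thread derives its
// (row, col) pair from the flat index, so the work per thread is a single IOU
// test instead of the old one-thread-per-row serial inner loop.
__global__ void PairwiseMask(int num, float iou_thresh, const float *boxes, bool *row_mask) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < num * num;
       idx += blockDim.x * gridDim.x) {
    int i = idx / num;  // row in the conceptual 2-D mask
    int j = idx % num;  // col in the conceptual 2-D mask
    if (j > i) {        // cells at or below the diagonal stay true
      row_mask[idx] = IouKeep(boxes, i, j, iou_thresh);
    }
  }
}

int main() {
  const int num = 3;
  float boxes[num * 5] = {0,  0,  10, 10, 0.9f,   // box 0
                          1,  1,  11, 11, 0.8f,   // box 1: heavy overlap with box 0
                          50, 50, 60, 60, 0.7f};  // box 2: disjoint
  float *d_boxes;
  bool *d_mask;
  cudaMalloc(&d_boxes, sizeof(boxes));
  cudaMalloc(&d_mask, num * num * sizeof(bool));
  cudaMemcpy(d_boxes, boxes, sizeof(boxes), cudaMemcpyHostToDevice);
  cudaMemset(d_mask, 1, num * num * sizeof(bool));  // mask starts all true
  PairwiseMask<<<(num * num + 255) / 256, 256>>>(num, 0.5f, d_boxes, d_mask);
  bool mask[num * num];
  cudaMemcpy(mask, d_mask, sizeof(mask), cudaMemcpyDeviceToHost);
  for (int i = 0; i < num; i++) {
    for (int j = 0; j < num; j++) printf("%d ", mask[i * num + j]);
    printf("\n");  // expected: only cell (0,1) is 0 (box 1 suppressed by box 0)
  }
  cudaFree(d_boxes);
  cudaFree(d_mask);
  return 0;
}

Compiled with nvcc, the sketch prints a 3x3 mask in which only cell (0,1) is cleared; a reduce pass over such a mask then yields the per-box keep flags, as ReducePass does in the patched file.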
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
index f3a81f73c2..c1f89b0461 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/nms_with_mask_impl.cuh
@@ -24,13 +24,13 @@ void CalSort(const int &inner, T *data_in, T *data_out, int *index_buff, T *data
              cudaStream_t stream);
 
 template <typename T>
-void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *area, T *input, T *output, int *index_buff,
-                   int box_size_, bool *row_mask, cudaStream_t cuda_stream);
+void CalPreprocess(const int num, int *sel_idx, bool *sel_boxes, T *input, T *output, int *index_buff, int box_size_,
+                   bool *row_mask, cudaStream_t cuda_stream);
 
 template <typename T>
-void CalNMS(const int num, const float IOU_value, T *output, T *area, bool *sel_boxes, int box_size_, bool *row_mask,
+void CalNms(const int num, const float IOU_value, T *output, bool *sel_boxes, int box_size_, bool *row_mask,
             cudaStream_t cuda_stream);
 
-int NMSRoundUpPower2(int v);
+int NmsRoundUpPower2(int v);
 
 #endif  // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_GPU_CUDA_IMPL_NMS_WITH_MASK_IMPL_H_
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
index d219b9d0ca..697187ce2e 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/nms_with_mask_gpu_kernel.h
@@ -41,19 +41,17 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
     T *input = GetDeviceAddress<T>(inputs, 0);
-    T *area = GetDeviceAddress<T>(workspace, 0);
-    T *data_buff = GetDeviceAddress<T>(workspace, 1);
-    int *index_buff = GetDeviceAddress<int>(workspace, 2);
-    bool *row_mask = GetDeviceAddress<bool>(workspace, 3);
+    T *data_buff = GetDeviceAddress<T>(workspace, 0);
+    int *index_buff = GetDeviceAddress<int>(workspace, 1);
+    bool *row_mask = GetDeviceAddress<bool>(workspace, 2);
     T *output = GetDeviceAddress<T>(outputs, 0);
     int *sel_idx = GetDeviceAddress<int>(outputs, 1);
     bool *sel_boxes = GetDeviceAddress<bool>(outputs, 2);
 
     CalSort(num_input_, input, output, index_buff, data_buff, box_size_, reinterpret_cast<cudaStream_t>(stream_ptr));
-    CalPreprocess(num_input_, sel_idx, sel_boxes, area, input, output, index_buff, box_size_, row_mask,
+    CalPreprocess(num_input_, sel_idx, sel_boxes, input, output, index_buff, box_size_, row_mask,
                   reinterpret_cast<cudaStream_t>(stream_ptr));
-    CalNMS(num_input_, iou_value_, output, area, sel_boxes, box_size_, row_mask,
-           reinterpret_cast<cudaStream_t>(stream_ptr));
+    CalNms(num_input_, iou_value_, output, sel_boxes, box_size_, row_mask, reinterpret_cast<cudaStream_t>(stream_ptr));
 
     return true;
   }
@@ -80,13 +78,12 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
     }
 
     num_input_ = input_shape[0];  // Get N value in [N,5] data
-    ceil_power_2 = NMSRoundUpPower2(num_input_);
+    ceil_power_2 = NmsRoundUpPower2(num_input_);
 
     input_size_ = num_input_ * sizeof(T) * box_size_;  // 5 values per bbox
    output_size_ = (input_size_) + (num_input_ * sizeof(int)) + (num_input_ * sizeof(bool));
 
-    workspace_size_ = num_input_ * sizeof(int);                   // storing areas
-    workspace_size_ += ceil_power_2 * (sizeof(T) + sizeof(int));  // sorting buffers
+    workspace_size_ = ceil_power_2 * (sizeof(T) + sizeof(int));   // sorting buffers
     workspace_size_ += (num_input_ * num_input_ * sizeof(bool));  // Row mask - NMS
 
     InitSizeLists();
@@ -102,7 +99,6 @@ class NMSWithMaskGpuFwdKernel : public GpuKernel {
     output_size_list_.push_back(num_input_ * sizeof(bool));
 
     // N sized workspace arrs
-    workspace_size_list_.push_back(num_input_ * sizeof(T));                  // area list
     workspace_size_list_.push_back(ceil_power_2 * sizeof(T));                // data buff
     workspace_size_list_.push_back(ceil_power_2 * sizeof(int));              // index buff
     workspace_size_list_.push_back(num_input_ * num_input_ * sizeof(bool));  // mask list
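Reviewer note (illustrative, not part of the patch): the workspace shrink in nms_with_mask_gpu_kernel.h falls out of the same change - with the area list gone, only the power-of-two-padded sort buffers and the N*N row mask remain. The self-contained C++ sketch below reproduces that sizing arithmetic for a hypothetical N = 100 with T = float; the round-up body beyond the lines visible in the first hunk is assumed to follow the standard 32-bit bit-smearing idiom.

#include <cstdio>

// The same bit-smearing round-up idiom the patch renames to NmsRoundUpPower2:
// subtract one, propagate the highest set bit into every lower bit, add one.
int NmsRoundUpPower2(int v) {
  v--;
  v |= v >> 1;
  v |= v >> 2;
  v |= v >> 4;
  v |= v >> 8;
  v |= v >> 16;
  v++;
  return v;
}

int main() {
  const int num_input = 100;  // hypothetical N (boxes in the [N,5] input)
  const int ceil_power_2 = NmsRoundUpPower2(num_input);  // 128

  // Mirrors the new workspace math with T = float: sort buffers are padded to
  // a power of two for the bitonic sort, then the N*N row mask follows; the
  // old N-element area buffer no longer contributes.
  size_t workspace_size = ceil_power_2 * (sizeof(float) + sizeof(int));  // data + index buff
  workspace_size += static_cast<size_t>(num_input) * num_input * sizeof(bool);  // row mask

  printf("ceil_power_2 = %d, workspace = %zu bytes\n", ceil_power_2, workspace_size);
  return 0;  // prints: ceil_power_2 = 128, workspace = 11024 bytes
}

Padding to a power of two keeps the bitonic sort's compare-and-swap network valid for any N; the sentinel std::numeric_limits<T>::max() entries in the padded tail sort past every real score, so they never displace a real box.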