!4727 [MS][LITE][GPU]softmax support Buffer output

Merge pull request !4727 from chenzupeng/master-lite
5 years ago · b4b76b61e4
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/softmax1x1.cl
@@ -48,3 +48,57 @@ __kernel void SoftMax1x1_IMG(__read_only image2d_t input, __write_only image2d_t
    slices_count++;
  } while (slices_count < slices_x32);
 }
 __kernel void SoftMax1x1_BUF(__read_only image2d_t input, __global float4 *output, const float4 mask, const int slices,
                             const int slices_x32) {
  int tid = get_local_id(0);
  float sum = 0.0f;
  for (size_t i = tid; i < slices - 1; i += 32) {
    float4 src = read_imagef(input, smp_none, (int2)(i, 0));
    sum += dot((float4)(1.0f), exp(src));
  }
  if ((slices - 1) % 32 == tid) {
    float4 src = read_imagef(input, smp_none, (int2)(slices - 1, 0));
    sum += dot(mask, exp(src));
  }
  __local float4 tmp[8];
  __local float *tmpx1 = (__local float *)tmp;
  tmpx1[tid] = sum;
  barrier(CLK_LOCAL_MEM_FENCE);
  if (tid == 0) {
    sum = dot((float4)(1.0f), tmp[0]);
    sum += dot((float4)(1.0f), tmp[1]);
    sum += dot((float4)(1.0f), tmp[2]);
    sum += dot((float4)(1.0f), tmp[3]);
    sum += dot((float4)(1.0f), tmp[4]);
    sum += dot((float4)(1.0f), tmp[5]);
    sum += dot((float4)(1.0f), tmp[6]);
    sum += dot((float4)(1.0f), tmp[7]);
    tmpx1[0] = 1.0f / sum;
  }
  barrier(CLK_LOCAL_MEM_FENCE);
  sum = tmpx1[0];
  for (size_t i = tid; i < slices - 1; i += 32) {
    float4 result = read_imagef(input, smp_none, (int2)(i, 0));
    result = exp(result) * sum;
    output[i] = result;
  }
  if ((slices - 1) % 32 == tid) {
    float4 result = read_imagef(input, smp_none, (int2)(slices - 1, 0));
    result = exp(result) * sum;
    __global float4 *remain_ptr4 = output;
    remain_ptr4 += slices - 1;
    __global float *remain_ptr = (__global float *)remain_ptr4;
    remain_ptr[0] = result.x;
    if (mask.y > 0.f) {
      remain_ptr[1] = result.y;
    }
    if (mask.z > 0.f) {
      remain_ptr[2] = result.z;
    }
    if (mask.w > 0.f) {
      remain_ptr[3] = result.w;
    }
  }
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -62,7 +62,7 @@ int ReshapeOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
  int h = shapex[1];
  int w = shapex[2];
  int c = shapex[3];
  im_dst_x = UP_DIV(w * c, C4NUM);
  im_dst_x = w * UP_DIV(c, C4NUM);
  im_dst_y = h;
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc
@@ -90,7 +90,8 @@ int SoftmaxOpenCLKernel::Init() {
  std::string program_name = "SoftMax";
  std::string source = softmax_source_fp32;
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  // framework not set this param yet! just use default.
  parameter_->axis_ = 1;
  if (in_tensors_[0]->shape().size() == 4 && parameter_->axis_ == 3) {
    // support 4d tensor
    onexone_flag_ = false;
@@ -106,7 +107,10 @@ int SoftmaxOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  if (mem_type_ == MEM_TYPE::BUF) {
  if (!is_image_out_) {
    out_mem_type_ = OpenCLMemType::BUF;
  }
  if (out_mem_type_ == OpenCLMemType::BUF) {
    kernel_name += "_BUF";
    program_name += "_BUF";
  } else {
@@ -119,6 +123,10 @@ int SoftmaxOpenCLKernel::Init() {
 #endif
  ori_format_ = out_tensors_[0]->GetFormat();
  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
  if (!is_image_out_) {
    ori_format_ = schema::Format_NC;
    out_tensors_[0]->SetFormat(schema::Format_NC);
  }
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return lite::RET_OK;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h
@@ -41,17 +41,17 @@ class SoftmaxOpenCLKernel : public OpenCLKernel {
  int InitGlobalSize();
  int SetWorkGroupSize1x1();
  int SetWorkGroupSize();
    std::vector<float> GetMaskForLastChannel(int channels);
  std::vector<float> GetMaskForLastChannel(int channels);
 private:
  cl::Kernel kernel_;
  SoftmaxParameter *parameter_;
  lite::opencl::OpenCLRuntime *runtime_;
  enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG};
  bool onexone_flag_{false};
  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
  bool is_image_out_{false};
 };
 }  // namespace mindspore::kernel