fix bug in pooling and matmul

5 years ago · 9674491150
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/avg_pool2d.cl
@@ -1,4 +1,7 @@
 __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 __kernel void AvgPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
@@ -10,10 +13,10 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
    return;
  }

  float4 r = (float4)(0.0f);
  float window_size = 0.0f;
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;
  FLT4 r = (FLT4)(0.0f);
  FLT window_size = 0.0f;
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
@@ -21,11 +24,11 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output,
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      bool outside = outside_x || y_c < 0 || y_c >= input_shape.y;
      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (float4)(0.0f);
      r += !outside ? input[(input_shape.y * x_c + y_c) * output_shape.w + Z] : (FLT4)(0.0f);
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
  float4 result = convert_float4(r / window_size);
  FLT4 result = TO_FLT4(r / window_size);
  output[(output_shape.y * X + Y) * output_shape.w + Z] = result;
 }

@@ -43,10 +46,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    return;
  }

  float4 r = (float4)(0.0f);
  float window_size = 0.0f;
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;
  FLT4 r = (FLT4)(0.0f);
  FLT window_size = 0.0f;
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;

  for (int ky = 0; ky < kernel_size.y; ++ky) {
    int y_c = ys + ky;
@@ -54,10 +57,10 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      int x_c = xs + kx;
      bool outside = outside_y || x_c < 0 || x_c >= input_shape.x;
      r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c));
      r += !outside ? READ_IMAGE(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)) : (float4)(0.0f);
      window_size += !outside ? 1.0f : 0.0f;
    }
  }
  float4 result = convert_float4(r / window_size);
  write_imagef(output, (int2)(Y * output_shape.w + Z, X), result);
  FLT4 result = TO_FLT4(r / window_size);
  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), result);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/max_pool2d.cl
@@ -1,4 +1,7 @@
 __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
 #ifdef cl_khr_fp16
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 #endif
 __kernel void MaxPooling2d_BUF(__global FLT4 *input, __global FLT4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
@@ -10,9 +13,9 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
    return;
  }

  float4 maximum = (float4)(-10000.0f);
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;
  FLT4 maximum = (FLT4)(-10000.0f);
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
@@ -24,7 +27,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output,
      if (y_c < 0 || y_c >= input_shape.y) {
        continue;
      }
      float4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
      FLT4 src = input[(input_shape.y * x_c + y_c) * input_shape.w + Z];
      maximum = max(src, maximum);
    }
  }
@@ -45,18 +48,18 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d
    return;
  }

  float4 maximum = (float4)(-10000.0f);
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;
  FLT4 maximum = (FLT4)(-10000.0f);
  int xs = X * stride.x - padding.x;
  int ys = Y * stride.y - padding.y;
  for (int ky = 0; ky < kernel_size.y; ++ky) {
    int y_c = ys + ky;
    if (y_c < 0 || y_c >= input_shape.y) continue;
    for (int kx = 0; kx < kernel_size.x; ++kx) {
      int x_c = xs + kx;
      if (x_c < 0 || x_c >= input_shape.x) continue;
      float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
      FLT4 src = READ_IMAGE(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c));
      maximum = max(src, maximum);
    }
  }
  write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum);
  WRITE_IMAGE(output, (int2)(Y * output_shape.w + Z, X), maximum);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc
@@ -58,14 +58,13 @@ int MatMulOpenCLKernel::Init() {
  sizeCO = {co, UP_DIV(co, C4NUM)};
  PadWeight();
  in_ori_format_ = in_tensors_[0]->GetFormat();
  in_tensors_[0]->SetFormat(schema::Format_NHWC4);
  out_ori_format_ = out_tensors_[0]->GetFormat();
  out_tensors_[0]->SetFormat(schema::Format_NHWC4);
  if (out_tensors_[0]->shape().size() == 2) {
    out_ori_format_ = schema::Format_NC;
    out_tensors_[0]->SetFormat(schema::Format_NC4);
    in_ori_format_ = schema::Format_NC;
    in_tensors_[0]->SetFormat(schema::Format_NC4);
  } else {
    in_tensors_[0]->SetFormat(schema::Format_NHWC4);
    out_tensors_[0]->SetFormat(schema::Format_NHWC4);
  }
  MS_LOG(DEBUG) << kernel_name << " Init Done!";
  return RET_OK;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -60,7 +60,7 @@ int PoolingOpenCLKernel::Init() {
    return RET_INVALID_OP_NAME;
  }
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();

  enable_fp16_ = ocl_runtime->GetFp16Enable();
 #ifdef PROGRAM_WITH_IL
  kernel_ = ocl_runtime->GetKernelFromBinary(kernel_name);
 #else
@@ -96,11 +96,10 @@ int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size)
  size_t im_dst_x, im_dst_y;
  im_dst_x = out_tensors_[0]->Width() * CO4;
  im_dst_y = out_tensors_[0]->Height();
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
 #else
  size_t img_dtype = CL_FLOAT;
 #endif
  if (enable_fp16_) {
    img_dtype = CL_HALF_FLOAT;
  }
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
@@ -161,5 +160,6 @@ kernel::LiteKernel *OpenCLPooling2dKernelCreator(const std::vector<lite::tensor:
 }

 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Pooling, OpenCLPooling2dKernelCreator)
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -44,6 +44,7 @@ class PoolingOpenCLKernel : public OpenCLKernel {
  std::vector<size_t> InitGlobalSize() const;
  PoolingParameter *parameter_;
  cl::Kernel kernel_;
  bool enable_fp16_{false};
 };

 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -50,6 +50,7 @@ class OpenCLKernel : public LiteKernel {
  }
  OpenCLMemType GetMemType() { return out_mem_type_; }
  void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; }
  void SetFormatType(schema::Format format_type) { op_format_ = format_type; }
  schema::Format GetInOriFormat() { return in_ori_format_; }
  schema::Format GetOutOriFormat() { return out_ori_format_; }

@@ -57,6 +58,7 @@ class OpenCLKernel : public LiteKernel {
  OpenCLMemType out_mem_type_{OpenCLMemType::IMG};
  schema::Format in_ori_format_{schema::Format_NHWC};
  schema::Format out_ori_format_{schema::Format_NHWC4};
  schema::Format op_format_{schema::Format_NC4HW4};
 };
 }  // namespace mindspore::kernel