!4135 [MS][LITE][Develop]opencl: image2d for pooling

Merge pull request !4135 from chenzhongming/new_master
5 years ago · 1694c88231
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/max_pool2d.cl
@@ -1,5 +1,5 @@
 __kernel void MaxPooling2d(__global float4 *input, __global float4 *output, const int4 input_shape,
                           const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
 __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
@@ -31,38 +31,37 @@ __kernel void MaxPooling2d(__global float4 *input, __global float4 *output, cons
  output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum;
 }

 // __constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;
 __constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 //__kernel void MaxPooling2dImage2d(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
 //                                  const int4 output_shape, const int2 stride, const int2 kernel_size,
 //                                  const int2 padding) {
 //  // axis to dst tensor coordinate
 //  int X = get_global_id(0);
 //  int Y = get_global_id(1);
 //  int Z = get_global_id(2);
 //
 //  // boundary check
 //  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
 //    return;
 //  }
 //
 //  float4 maximum = (float4)(-10000.0f);
 //  int xs = X * stride.x + padding.x;
 //  int ys = Y * stride.y + padding.y;
 //
 //  for (int ky = 0; ky < kernel_size.y; ++ky) {
 //    int y_c = ys + ky;
 //    if (y_c < 0 || y_c >= input_shape.y) {
 //      continue;
 //    }
 //    for (int kx = 0; kx < kernel_size.x; ++kx) {
 //      int x_c = xs + kx;
 //      if (x_c < 0 || x_c >= input_shape.x) {
 //        continue;
 //      }
 //      float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z));
 //      maximum = max(src, maximum);
 //    }
 //  }
 //  write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum);
 //}
 __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape,
                               const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) {
  // axis to dst tensor coordinate
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);

  // boundary check
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) {
    return;
  }

  float4 maximum = (float4)(-10000.0f);
  int xs = X * stride.x + padding.x;
  int ys = Y * stride.y + padding.y;

  for (int kx = 0; kx < kernel_size.x; ++kx) {
    int x_c = xs + kx;
    if (x_c < 0 || x_c >= input_shape.x) {
      continue;
    }
    for (int ky = 0; ky < kernel_size.y; ++ky) {
      int y_c = ys + ky;
      if (y_c < 0 || y_c >= input_shape.y) {
        continue;
      }
      float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z));
      maximum = max(src, maximum);
    }
  }
  write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -13,13 +13,14 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

 #include "src/runtime/kernel/opencl/kernel/arithmetic.h"
 #include <set>
 #include <vector>
 #include <string>
 #include "schema/model_generated.h"
 #include "src/kernel_registry.h"
 #include "src/runtime/kernel/opencl/utils.h"
 #include "src/runtime/kernel/opencl/kernel/arithmetic.h"
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl.inc"
 #include "src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl.inc"
@@ -41,8 +42,8 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
  global_size_ = InitGlobalSize();
  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
  local_size_ = GetLocalSize(global_size_, max_work_group_size);
  global_size_ = GetGlobalSize(local_size_, global_size_);
  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
 }

 void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc
@@ -31,12 +31,11 @@

 #endif


 using mindspore::schema::PrimitiveType_DepthwiseConv2D;
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_DepthwiseConv2D;

 namespace mindspore::kernel {

@@ -117,11 +116,9 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() {
  return RET_OK;
 }

 int DepthwiseConv2dOpenCLKernel::ReSize() {
  return RET_OK;
 }
 int DepthwiseConv2dOpenCLKernel::ReSize() { return RET_OK; }

 int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) {
 int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  size_t im_dst_x, im_dst_y;
  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
@@ -141,16 +138,18 @@ int DepthwiseConv2dOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* i
  *img_size = vec;
  return RET_OK;
 }
 int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t>* global_size) {

 int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  std::vector <size_t> global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4};
  std::vector<size_t> global = {(size_t)outputs_[0]->Width(), (size_t)outputs_[0]->Height(), CO4};
  *global_size = std::move(global);
  return RET_OK;
 }
 int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t>& global_size,
    std::vector<size_t>* local_size) {

 int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size,
                                              std::vector<size_t> *local_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  std::vector <size_t> local = {1, 1, CO4};
  std::vector<size_t> local = {1, 1, CO4};
  *local_size = std::move(local);
  return RET_OK;
 }
@@ -161,8 +160,8 @@ int DepthwiseConv2dOpenCLKernel::Run() {
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  size_t CI4 = UP_DIV(inputs_[0]->Channel(), C4NUM);
  std::vector <size_t> global = {(size_t) outputs_[0]->Width(), (size_t) outputs_[0]->Height(), CO4};
  std::vector <size_t> local;
  std::vector<size_t> global = {(size_t)outputs_[0]->Width(), (size_t)outputs_[0]->Height(), CO4};
  std::vector<size_t> local;
  GetLocalSize(0, global, &local);

  float relu_clip1 = 6.0;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h
@@ -28,11 +28,10 @@ namespace mindspore::kernel {
 class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {
 public:
  explicit DepthwiseConv2dOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
      const std::vector<lite::tensor::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs),
        packed_weight_(nullptr), bias_data_(nullptr), kernel_(nullptr) {}
                                       const std::vector<lite::tensor::Tensor *> &outputs)
      : OpenCLKernel(parameter, inputs, outputs), packed_weight_(nullptr), bias_data_(nullptr), kernel_(nullptr) {}

  ~DepthwiseConv2dOpenCLKernel() override {};
  ~DepthwiseConv2dOpenCLKernel() override{};

  int Init() override;

@@ -42,20 +41,16 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel {

  int InitBuffer();

  int GetImageSize(size_t idx, std::vector<size_t>* img_size) override;
  int GetGlobalSize(size_t idx, std::vector<size_t>* global_size) override;
  int GetLocalSize(size_t idx, const std::vector<size_t>& global_size,
                   std::vector<size_t>* local_size) override;
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;
  int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override;
  int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override;

 private:
  FLOAT_t *packed_weight_;
  FLOAT_t *bias_data_;
  cl::Kernel kernel_;
  enum class MEM_TYPE {
    BUF, IMG
  } mem_type_{MEM_TYPE::IMG};
  enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG};
 };
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_DEPTHWISE_H_

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc
@@ -64,12 +64,18 @@ int PoolingOpenCLKernel::Init() {
 #ifdef PROGRAM_WITH_IL
  ocl_runtime->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  if (mem_type_ == MEM_TYPE::BUF) {
    kernel_name += "_BUF";
  } else {
    kernel_name += "_IMG";
  }
  std::set<std::string> build_options;
  ocl_runtime->LoadSource(program_name, source);
  ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  MS_LOG(DEBUG) << kernel_name << " Init Done!";

  return RET_OK;
 }

@@ -81,8 +87,30 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const {
  return global;
 }

 int PoolingOpenCLKernel::InitBuffer() { return 0; }
 int PoolingOpenCLKernel::ReSize() { return 0; }
 int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  size_t im_dst_x, im_dst_y;
  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
    im_dst_x = outputs_[0]->Height();
    im_dst_y = outputs_[0]->Width() * CO4;
  } else {
    im_dst_y = outputs_[0]->Width();
    im_dst_x = outputs_[0]->Height() * CO4;
  }
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
 #else
  size_t img_dtype = CL_FLOAT;
 #endif
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
  return RET_OK;
 }

 int PoolingOpenCLKernel::InitBuffer() { return RET_OK; }

 int PoolingOpenCLKernel::ReSize() { return RET_OK; }

 int PoolingOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
@@ -110,12 +138,11 @@ int PoolingOpenCLKernel::Run() {
  std::vector<size_t> local_size;
  std::vector<size_t> global_size = InitGlobalSize();
  int max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())());
  local_size = GetLocalSize(global_size, max_work_group_size);
  global_size = GetGlobalSize(local_size, global_size);
  local_size = GetCommonLocalSize(global_size, max_work_group_size);
  global_size = GetCommonGlobalSize(local_size, global_size);

  // run opengl kernel
  ocl_runtime->RunKernel(kernel_, global_size, local_size, nullptr);

  return RET_OK;
 }

--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h
@@ -25,11 +25,11 @@

 namespace mindspore::kernel {

 class PoolingOpenCLKernel : public LiteKernel {
 class PoolingOpenCLKernel : public OpenCLKernel {
 public:
  explicit PoolingOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                               const std::vector<lite::tensor::Tensor *> &outputs)
      : LiteKernel(parameter, inputs, outputs) {
      : OpenCLKernel(parameter, inputs, outputs) {
    parameter_ = reinterpret_cast<PoolingParameter *>(parameter);
  }
  ~PoolingOpenCLKernel() override{};
@@ -38,10 +38,11 @@ class PoolingOpenCLKernel : public LiteKernel {
  int ReSize() override;
  int Run() override;
  int InitBuffer();
  int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;

 private:
  std::vector<size_t> InitGlobalSize() const;

  enum class MEM_TYPE { BUF, IMG } mem_type_{MEM_TYPE::IMG};
  PoolingParameter *parameter_;
  cl::Kernel kernel_;
 };
@@ -49,4 +50,3 @@ class PoolingOpenCLKernel : public LiteKernel {
 }  // namespace mindspore::kernel

 #endif  // MINDSPORE_LITE_SRC_BACKEND_OPENCL_POOLING_H_

--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -22,7 +22,7 @@
 namespace mindspore {
 namespace kernel {

 std::vector<size_t> GetGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
  std::vector<size_t> result(3, 1);
  for (int i = 0; i < 3; ++i) {
    result[i] = AlignByN(global[i], local[i]);
@@ -30,7 +30,7 @@ std::vector<size_t> GetGlobalSize(const std::vector<size_t> &local, const std::v
  return result;
 }

 std::vector<size_t> GetLocalSize(const std::vector<size_t> &global, int max_size) {
 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
  size_t wg_z = GetBiggestDividerWithPriority(global[2], 8);
  size_t wg_xy_size = max_size / wg_z;
  size_t wg_x = std::min(DivideRoundUp(global[0], 2), wg_xy_size);
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -75,10 +75,10 @@ T AlignByN(T number, N n) {
 }

 // GetGlobalSize
 std::vector<size_t> GetGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);
 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global);

 // GetLocalSize
 std::vector<size_t> GetLocalSize(const std::vector<size_t> &global, int max_size);
 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size);

 std::string CLErrorCode(cl_int error_code);

--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/max_pooling_tests.cc
@@ -43,33 +43,42 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {
  MS_LOG(INFO) << "ocl runtime";
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  ocl_runtime->Init();
  auto allocator = ocl_runtime->GetAllocator();

  MS_LOG(INFO) << "PoolingParameter";
  auto param = new PoolingParameter;
  InitParameter(param);

  // define tensor
  MS_LOG(INFO) << "define tensor";
  MS_LOG(INFO) << "define tensor1";
  std::vector<int> input_shape = {1, 16, 256, 192};
  std::vector<int> output_shape = {1, 8, 128, 192};
  auto data_type = kNumberTypeFloat32;
  auto tensorType = schema::NodeType_ValueNode;
  MS_LOG(INFO) << "define tensor2";
  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, schema::Format_NHWC4, tensorType);
  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, schema::Format_NHWC4, tensorType);
  MS_LOG(INFO) << "define input";
  std::vector<lite::tensor::Tensor *> inputs{input_tensor};
  std::vector<lite::tensor::Tensor *> outputs{output_tensor};

  // run
  MS_LOG(INFO) << "pooling_kernel";
  auto *pooling_kernel = new kernel::PoolingOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
  MS_LOG(INFO) << "pooling_kernel init";
  pooling_kernel->Init();

  std::vector<kernel::LiteKernel *> kernels{pooling_kernel};
  inputs[0]->MallocData(allocator);
  auto *pGraph = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  MS_LOG(INFO) << "pGraph init";
  pGraph->Init();

  // load data
  MS_LOG(INFO) << "load data";
  MS_LOG(INFO) << "load data1";
  std::string input_file = "maxpool_in.bin";
  std::string expect_file = "maxpool_out.bin";
  MS_LOG(INFO) << "load data2";
  LoadTestData(input_tensor->Data(), input_tensor->Size(), input_file);
  auto *input_data = reinterpret_cast<float *>(input_tensor->Data());
  printf("input[0:10]:");
@@ -81,6 +90,7 @@ TEST_F(TestMaxPoolingOpenCL, MaxPool_1_32_512_96) {
  pGraph->Run();

  MS_LOG(INFO) << "compare result";
  std::cout << "compare result" << std::endl;
  CompareOutput(output_tensor, expect_file);
 }

--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/utils_tests.cc
@@ -24,9 +24,14 @@ namespace mindspore {

 void LoadTestData(void *dst, size_t dst_size, const std::string &file_path) {
  if (file_path.empty()) {
    memset(dst, dst_size, dst_size);
    memset(dst, 0x00, dst_size);
  } else {
    memcpy(dst, reinterpret_cast<const void *>(dst_size), dst_size);
    auto src_data = reinterpret_cast<float *>(mindspore::lite::ReadFile(file_path.c_str(), &dst_size));
    if (src_data != nullptr) {
      memcpy(dst, src_data, dst_size);
    } else {
      MS_LOG(ERROR) << "read file empty.";
    }
  }
 }