!4138 change buffer to image2d for arithmetic

Merge pull request !4138 from liuchao/arith_image
5 years ago · a6dc949054
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_buffer.cl
@@ -26,26 +26,9 @@ __kernel void ElementDiv(__global float *input_a, __global float *input_b, __glo
  output[idx] = input_a[idx] * input_b[idx];
 }

 __kernel void BoardcastAdd(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
 __kernel void BoardcastArith(__global float *input_a, float weight, float bias, __global float *output,
                             const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + input_b;
 }

 __kernel void BoardcastSub(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] - input_b;
 }

 __kernel void BoardcastMul(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * input_b;
 }

 __kernel void BoardcastDiv(__global float *input_a, float input_b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * input_b;
  output[idx] = weight * input_a[idx] + bias;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fp32/arithmetic_image2d.cl
@@ -1,15 +1,65 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 __kernel void ElementAdd(__read_only image2d_t *input_a, __read_only image2d_t *input_b, __write_only image2d_t *output,
                         const int4 output_shape) {
 __kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  int Z = get_global_id(2);
  if (X >= output_shape.x || Y >= output_shape.y || Z >= output_shape.w) return;
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  if (idx >= n) return;
  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y * output_shape.w + Z));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y * output_shape.w + Z));
  src = a + b;
  write_imagef(output, (int2)(0, 0), src);
  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), a + b);
 }

 __kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), a - b);
 }

 __kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), a * b);
 }

 __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                         const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  float4 b = read_imagef(input_b, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), a / b);
 }

 __kernel void BoardcastArith(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
                             const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  float4 a = read_imagef(input_a, smp_none, (int2)(X, Y));
  write_imagef(output, (int2)(X, Y), weight * a + bias);
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -40,10 +40,10 @@ std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
 }

 void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() {
  global_size_ = InitGlobalSize();
  int max_work_group_size = runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*runtime_->Device())());
  local_size_ = GetCommonLocalSize(global_size_, max_work_group_size);
  global_size_ = GetCommonGlobalSize(local_size_, global_size_);
  size_t H = outputs_[0]->Batch() * outputs_[0]->Height();
  size_t W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
  local_size_ = {16, 16};
  global_size_ = {H, W};
 }

 void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
@@ -51,63 +51,75 @@ void ArithmeticOpenCLKernel::BufferGetWorkGroupSize() {
  global_size_ = {element_num};
 }

 int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t>* img_size) {
  size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM);
  int H = outputs_[0]->Batch() * outputs_[0]->Height();
  int W = outputs_[0]->Width() * CO4;
  size_t im_dst_x, im_dst_y;
  if (inputs_[0]->GetFormat() == schema::Format_NHWC4) {
    im_dst_x = W;
    im_dst_y = H;
  } else {
    im_dst_y = outputs_[0]->Batch() * outputs_[0]->Height() * CO4;
    im_dst_x = outputs_[0]->Width();
  }
 #ifdef ENABLE_FP16
  size_t img_dtype = CL_HALF_FLOAT;
 #else
  size_t img_dtype = CL_FLOAT;
 #endif
  img_size->clear();
  std::vector<size_t> vec{im_dst_x, im_dst_y, img_dtype};
  *img_size = vec;
  return 0;
 }

 int ArithmeticOpenCLKernel::Init() {
  runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  std::string element_name;
  std::string boardcast_name;
  std::string kernel_name;

  if (inputs_[1]->TensorType() == schema::NodeType_ValueNode && inputs_[1]->Data() != nullptr) {
    element_flag_ = false;
    kernel_name = "BoardcastArith";
  } else {
    element_flag_ = true;
    switch (opParameter->type_) {
      case PrimitiveType_Mul:
        kernel_name = "ElementMul";
        break;
      case PrimitiveType_Add:
        kernel_name = "ElementAdd";
        break;
      case PrimitiveType_Sub:
        kernel_name = "ElementSub";
        break;
      case PrimitiveType_Div:
        kernel_name = "ElementDiv";
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
        break;
    }
  }

  switch (opParameter->type_) {
    case PrimitiveType_Mul:
      element_name = "ElementMul";
      boardcast_name = "BoardcastMul";
      break;
    case PrimitiveType_Add:
      element_name = "ElementAdd";
      boardcast_name = "BoardcastAdd";
      break;
    case PrimitiveType_Sub:
      element_name = "ElementSub";
      boardcast_name = "BoardcastSub";
      break;
    case PrimitiveType_Div:
      element_name = "ElementDiv";
      boardcast_name = "BoardcastDiv";
      break;
    default:
      MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
      break;
  }

 #ifdef PROGRAM_WITH_IL
  runtime_->CreateKernelFromIL(kernel_(), kernel_name);
 #else
  std::string program_name = "Arithmetic";
  std::set<std::string> build_options;
  std::string source = arithmetic_buffer_source_fp32;
  std::string source = arithmetic_image2d_source_fp32;
  runtime_->LoadSource(program_name, source);

  if (element_flag_) {
    runtime_->BuildKernel(kernel_, program_name, element_name, build_options);
    MS_LOG(DEBUG) << element_name << " Init Done!";
  } else {
    runtime_->BuildKernel(kernel_, program_name, boardcast_name, build_options);
    MS_LOG(DEBUG) << boardcast_name << " Init Done!";
  }
  runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
  outputs_[0]->SetFormat(schema::Format_NHWC4);
  Image2dGetWorkGroupSize();
  return 0;
 }

 int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->Name() << " Running!";
  auto runtime_ = lite::opencl::OpenCLRuntime::GetInstance();
  BufferGetWorkGroupSize();

  int arg_idx = 0;
  uint32_t element_num = outputs_[0]->ElementsC4Num();
@@ -116,11 +128,34 @@ int ArithmeticOpenCLKernel::Run() {
  if (element_flag_) {
    runtime_->SetKernelArg(kernel_, arg_idx++, inputs_[1]->Data());
  } else {
    runtime_->SetKernelArg(kernel_, arg_idx++, static_cast<float *>(inputs_[1]->Data())[0]);
    float value = static_cast<float *>(inputs_[1]->Data())[0];
    switch (opParameter->type_) {
      case PrimitiveType_Mul:
        weight_ = value;
        break;
      case PrimitiveType_Add:
        bias_ = value;
        break;
      case PrimitiveType_Sub:
        bias_ = -1 * value;
        break;
      case PrimitiveType_Div:
        bias_ = 1 / value;
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << opParameter->type_;
        break;
    }
    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
    MS_LOG(DEBUG) << arg_idx-2 << " " << weight_;
    MS_LOG(DEBUG) << arg_idx-1 << " " << bias_;
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, outputs_[0]->Data());
  runtime_->SetKernelArg(kernel_, arg_idx++, element_num);

  int H = outputs_[0]->Batch() * outputs_[0]->Height();
  int W = outputs_[0]->Width() * UP_DIV(outputs_[0]->Channel(), C4NUM);
  cl_int2 output_shape{H, W};
  runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return 0;
 }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -24,15 +24,16 @@

 namespace mindspore::kernel {

 class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
 class ArithmeticOpenCLKernel : public OpenCLKernel {
 public:
  explicit ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::tensor::Tensor *> &inputs,
                                  const std::vector<lite::tensor::Tensor *> &outputs, const lite::Context *ctx)
      : ArithmeticCPUKernel(parameter, inputs, outputs, ctx) {}
      : OpenCLKernel(parameter, inputs, outputs) {}
  ~ArithmeticOpenCLKernel() override{};

  int Init() override;
  int Run() override;
  int GetImageSize(size_t idx, std::vector<size_t>* img_size) override;

 private:
  std::vector<size_t> InitGlobalSize() const;
@@ -42,6 +43,8 @@ class ArithmeticOpenCLKernel : public ArithmeticCPUKernel {
  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  float weight_{1.f};
  float bias_{.0f};

  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/arithmetic_tests.cc
@@ -61,13 +61,12 @@ void LogData(void *data, const int size, const std::string prefix) {
 }

 void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b) {
  std::cout << "TestCase" << std::endl;
  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
  auto allocator = ocl_runtime->GetAllocator();

  bool is_bias_add = shape_b.empty();
  auto tensorType = schema::NodeType_ValueNode;

  std::cout << "TestCase tensor" << std::endl;
  lite::tensor::Tensor *tensor_a =
    new lite::tensor::Tensor(kNumberTypeFloat32, shape_a, schema::Format_NHWC4, tensorType);
  lite::tensor::Tensor *tensor_b =
@@ -77,7 +76,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  int64_t element_num = tensor_a->ElementsC4Num();
  int64_t element_num_b = is_bias_add ? 1 : tensor_b->ElementsC4Num();

  std::cout << "TestCase new data" << std::endl;
  float *data_a = new float[element_num];
  float *data_b = new float[element_num_b];
  float *data_c_cpu = new float[element_num];
@@ -87,14 +85,12 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  InitData(data_b, element_num_b);
  memset(data_c_ocl, 0, sizeof(float) * element_num);

  std::cout << "TestCase run cpu" << std::endl;
  if (is_bias_add) {
    BoardcaseAdd(data_a, static_cast<float *>(data_b)[0], data_c_cpu, element_num);
  } else {
    ElementAdd(data_a, data_b, data_c_cpu, element_num);
  }

  std::cout << "TestCase set data" << std::endl;
  std::vector<lite::tensor::Tensor *> inputs = {tensor_a};
  if (!is_bias_add) {
    inputs.push_back(tensor_b);
@@ -114,9 +110,10 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    new kernel::ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(param), arithmetic_inputs, outputs, &ctx);
  arith_kernel->Init();

  tensor_a->MallocData(allocator);
  tensor_b->MallocData(allocator);
  std::vector<kernel::LiteKernel *> kernels{arith_kernel};
  auto *kernel = new kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
  std::cout << "TestCase Init" << std::endl;
  kernel->Init();

  memcpy(inputs[0]->Data(), data_a, sizeof(float) * element_num);
@@ -124,7 +121,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
    memcpy(inputs[1]->Data(), data_b, sizeof(float) * element_num_b);
  }

  std::cout << "TestCase Run" << std::endl;
  kernel->Run();

  memcpy(data_c_ocl, outputs[0]->Data(), sizeof(float) * element_num);
@@ -136,7 +132,6 @@ void TestCase(const std::vector<int> &shape_a, const std::vector<int> &shape_b)
  LogData(outputs[0]->Data(), 10, "OpenCL compute : ");
  bool cmp = DataCompare(data_c_cpu, data_c_ocl, element_num);
  MS_LOG(INFO) << "Compare " << (cmp ? "success!" : "failed!");
  std::cout << "TestCase End" << std::endl;

  // free
  delete[] data_a;
@@ -162,15 +157,15 @@ class TestArithmeticOpenCL : public mindspore::Common {
 };

 TEST_F(TestArithmeticOpenCL, AddElementwiseTest) {
  const std::vector<int> &shape_a = {1, 32, 32, 4};
  const std::vector<int> &shape_b = {1, 32, 32, 4};
  const std::vector<int> &shape_a = {1, 1024, 1024, 4};
  const std::vector<int> &shape_b = {1, 1024, 1024, 4};
  TestCase(shape_a, shape_b);
 }

 // TEST_F(TestOpenCLKernel, AddBoardcaseTest) {
 //  const std::vector<int> &shape_a = {1, 4, 128, 128};
 //  const std::vector<int> &shape_b = {};
 //  TestCase(shape_a, shape_b);
 //}
 TEST_F(TestArithmeticOpenCL, AddBoardcaseTest) {
  const std::vector<int> &shape_a = {1, 128, 128, 4};
  const std::vector<int> &shape_b = {};
  TestCase(shape_a, shape_b);
 }

 }  // namespace mindspore