@@ -54,7 +54,307 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_
  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
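// Element-wise binary kernels: each work-item reads one FLT4 from input_a and input_b at (X, Y),
// applies the operator, and writes the result to output.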
__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & as_int4(b)));
}
__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                            const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | as_int4(b)));
}
__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, b));
}
__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, b));
}
__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(a / b));
}
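// Floor-mod follows the identity a mod b = a - floor(a / b) * b.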
__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, b)) * b);
}
__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                           __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2));
}
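// Relational operators on vector types yield a per-component mask (all bits set for true, 0 for false);
// AS_FLT4 reinterprets that mask as FLT4 so it can be stored through WRITE_IMAGE.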
__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                               __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b));
}
__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b));
}
__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                              __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b));
}
__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                   __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b));
}
__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                 __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b));
}
__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                      __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b));
}
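// Broadcast kernels: the second operand is a single scalar float b applied to every element of input_a.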
__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b);
}
__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b);
}
__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b);
}
__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b));
}
__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & (int4)(b)));
}
__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                              const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | (int4)b));
}
__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b));
}
__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b));
}
__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b));
}
__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, (FLT4)b)) * (FLT)b);
}
__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                             const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2));
}
__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                 const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
@@ -63,7 +363,67 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias);
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b));
}
__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b));
}
__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b));
}
__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                     const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b));
}
__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                   const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b));
}
__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                        const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b));
}
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
@@ -94,9 +454,26 @@ __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, _
  output[idx] = input_a[idx] * input_b[idx];
}
__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
                                 const unsigned int n) {
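// Buffer variants of the broadcast kernels: one work-item per element, with the scalar operand b.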
__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + (FLT)b;
}
__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] - (FLT)b;
}
__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * (FLT)b;
}
__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = weight * input_a[idx] + bias;
  output[idx] = divide_no_check(input_a[idx], (FLT)b);
}
@@ -28,6 +28,7 @@
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
@@ -130,18 +131,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
      if (weight == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
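      // For fp16 models, each fp32 weight value is converted to float16 while the tensor is packed.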
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else {
@@ -162,18 +163,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
      if (weight == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else {
@@ -197,28 +198,69 @@ int ArithmeticOpenCLKernel::Init() {
  std::string kernel_name;
  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
  if (arithmetic_parameter->broadcasting_) {
    element_flag_ = false;
    kernel_name = "BoardcastArith";
    kernel_name = "Broadcast";
  } else {
    element_flag_ = true;
    switch (op_parameter_->type_) {
      case PrimitiveType_Mul:
        kernel_name = "ElementMul";
        break;
      case PrimitiveType_Add:
        kernel_name = "ElementAdd";
        break;
      case PrimitiveType_Sub:
        kernel_name = "ElementSub";
        break;
      case PrimitiveType_Div:
        kernel_name = "ElementDiv";
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
        break;
    }
    kernel_name = "Element";
  }
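  // Append the operator suffix to the "Element" / "Broadcast" prefix chosen above to form the kernel name.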
  switch (op_parameter_->type_) {
    case PrimitiveType_Mul:
      kernel_name += "Mul";
      break;
    case PrimitiveType_Add:
      kernel_name += "Add";
      break;
    case PrimitiveType_Sub:
      kernel_name += "Sub";
      break;
    case PrimitiveType_Div:
      kernel_name += "Div";
      break;
    case PrimitiveType_LogicalAnd:
      kernel_name += "And";
      break;
    case PrimitiveType_LogicalOr:
      kernel_name += "Or";
      break;
    case PrimitiveType_Maximum:
      kernel_name += "Max";
      break;
    case PrimitiveType_Minimum:
      kernel_name += "Min";
      break;
    case PrimitiveType_FloorDiv:
      kernel_name += "FloorDiv";
      break;
    case PrimitiveType_FloorMod:
      kernel_name += "FloorMod";
      break;
    case PrimitiveType_SquaredDifference:
      kernel_name += "SquaredDifference";
      break;
    case PrimitiveType_Equal:
      kernel_name += "Equal";
      break;
    case PrimitiveType_NotEqual:
      kernel_name += "NotEqual";
      break;
    case PrimitiveType_Less:
      kernel_name += "Less";
      break;
    case PrimitiveType_LessEqual:
      kernel_name += "LessEqual";
      break;
    case PrimitiveType_Greater:
      kernel_name += "Greater";
      break;
    case PrimitiveType_GreaterEqual:
      kernel_name += "GreaterEqual";
      break;
    default:
      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
      return RET_ERROR;
  }
  lite::STATUS error_code = RET_OK;
@@ -265,26 +307,8 @@ int ArithmeticOpenCLKernel::Run() {
    void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->MutableData() : weight_ptr_;
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  } else {
    float value = static_cast<float *>(in_tensors_[1]->MutableData())[0];
    switch (op_parameter_->type_) {
      case PrimitiveType_Mul:
        weight_ = value;
        break;
      case PrimitiveType_Add:
        bias_ = value;
        break;
      case PrimitiveType_Sub:
        bias_ = -1 * value;
        break;
      case PrimitiveType_Div:
        weight_ = 1 / value;
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
        break;
    }
    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
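    // The scalar operand is passed to the Broadcast* kernels as-is; each kernel applies its own operator.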
    float weight = static_cast<float *>(in_tensors_[1]->MutableData())[0];
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
@@ -345,4 +369,36 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCr
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
}  // namespace mindspore::kernel
@@ -44,8 +44,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  float weight_{1.f};
  float bias_{.0f};
  void *weight_ptr_{nullptr};
  std::vector<size_t> local_size_;
@@ -152,7 +152,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -160,20 +160,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -202,7 +202,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -210,20 +210,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -328,8 +328,8 @@ int ScaleOpenCLKernel::Run() {
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
  } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
    int16_t scale = static_cast<int16_t *>(in_tensors_[1]->MutableData())[0];
    int16_t offset = static_cast<int16_t *>(in_tensors_[2]->MutableData())[0];
    float16_t scale = static_cast<float16_t *>(in_tensors_[1]->MutableData())[0];
    float16_t offset = static_cast<float16_t *>(in_tensors_[2]->MutableData())[0];
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
  } else {
@@ -300,12 +300,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
  if (fp16_enable_) {
    // fp16 enable, kernel will use half and read_imageh and write_imageh.
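    // AS_FLT4 lets the comparison kernels bit-cast their integer comparison mask back to the FLT4 type.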
    build_options_str =
      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
      "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 "
      "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 ";
  } else {
    // fp16 not enable, kernel will use float and read_imagef and write_imagef.
    build_options_str =
      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
      "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 "
      "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 ";
  }