Arithmetic kernels: support two tensor inputs

5 years ago · 0c9e9e5d82
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
@@ -3,7 +3,7 @@
 __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;

 __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -12,24 +12,13 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a + b);
 }

 __kernel void ElementAddReLU_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a + b, (FLT4)(0.f)));
  FLT4 result = a + b;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -38,11 +27,13 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - b);
  FLT4 result = a - b;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -51,11 +42,13 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a * b);
  FLT4 result = a * b;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -64,11 +57,13 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
  FLT4 result = divide_no_check(a, b);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -77,11 +72,13 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & AS_UINT4(b)));
  FLT4 result = AS_FLT4(AS_UINT4(a) & AS_UINT4(b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                            const int2 output_shape) {
                            const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -90,11 +87,13 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | AS_UINT4(b)));
  FLT4 result = AS_FLT4(AS_UINT4(a) | AS_UINT4(b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -103,11 +102,13 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, b));
  FLT4 result = max(a, b);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
                             __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -116,11 +117,14 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, b));
  FLT4 result = min(a, b);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
                                  __write_only image2d_t output, const int2 output_shape, float act_min,
                                  float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -129,11 +133,14 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)));
  FLT4 result = floor(divide_no_check(a, b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
                                  __write_only image2d_t output, const int2 output_shape, float act_min,
                                  float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -142,11 +149,14 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, b)) * b);
  FLT4 result = floor(divide_no_check(a, b)) * b;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                           __write_only image2d_t output, const int2 output_shape) {
                                           __write_only image2d_t output, const int2 output_shape, float act_min,
                                           float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -155,11 +165,13 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2));
  FLT4 result = pown((a - b), (int4)2);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                               __write_only image2d_t output, const int2 output_shape) {
                               __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -168,11 +180,15 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a == b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a == b ? (FLT4)1.f : (FLT4).0f;
  // error?
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
                                  __write_only image2d_t output, const int2 output_shape, float act_min,
                                  float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -181,11 +197,13 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a != b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a != b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                              __write_only image2d_t output, const int2 output_shape) {
                              __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -194,11 +212,14 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a < b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a < b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                   __write_only image2d_t output, const int2 output_shape) {
                                   __write_only image2d_t output, const int2 output_shape, float act_min,
                                   float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -207,11 +228,13 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a <= b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a <= b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                 __write_only image2d_t output, const int2 output_shape) {
                                 __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -220,11 +243,14 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a > b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a > b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                      __write_only image2d_t output, const int2 output_shape) {
                                      __write_only image2d_t output, const int2 output_shape, float act_min,
                                      float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -233,58 +259,117 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a >= b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a >= b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 // Element-wise a + b over two image tensors with per-dimension broadcasting.
 // Work-item layout: dim 0 -> C4 slice, dim 1 -> W, dim 2 -> flattened N * H.
 // The int4 shapes are read as x = N, y = H, z = W, w = C4 slices.
 // An index that exceeds an input's extent along a dimension is pinned to that
 // input's last index, so an extent-1 dimension is reused for every output
 // position. The sum is clamped to [act_min, act_max] before it is stored.
 __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                     __write_only image2d_t output, const int4 a_shape, const int4 b_shape,
                                     const int4 output_shape, float act_min, float act_max) {
  const int c4_idx = get_global_id(0);  // C4
  const int w_idx = get_global_id(1);   // W
  const int nh_idx = get_global_id(2);  // N * H
  const int n_idx = nh_idx / output_shape.y;
  const int h_idx = nh_idx % output_shape.y;
  // Work-items that fall outside the output tensor have nothing to do.
  if (!(c4_idx < output_shape.w && w_idx < output_shape.z && nh_idx < output_shape.x * output_shape.y)) {
    return;
  }

  // Source texel of input_a: column = w * C4 + c4, row = n * H + h.
  const int2 a_pos = (int2)(min(w_idx, a_shape.z - 1) * a_shape.w + min(c4_idx, a_shape.w - 1),
                            min(n_idx, a_shape.x - 1) * a_shape.y + min(h_idx, a_shape.y - 1));
  // Source texel of input_b, addressed the same way with b's own shape.
  const int2 b_pos = (int2)(min(w_idx, b_shape.z - 1) * b_shape.w + min(c4_idx, b_shape.w - 1),
                            min(n_idx, b_shape.x - 1) * b_shape.y + min(h_idx, b_shape.y - 1));

  const FLT4 lhs = READ_IMAGE(input_a, smp_none, a_pos);
  const FLT4 rhs = READ_IMAGE(input_b, smp_none, b_pos);
  const FLT4 clamped = clamp(lhs + rhs, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(w_idx * output_shape.w + c4_idx, n_idx * output_shape.y + h_idx), clamped);
 }

 // Element-wise a - b over two image tensors with per-dimension broadcasting.
 // Work-item layout: dim 0 -> C4 slice, dim 1 -> W, dim 2 -> flattened N * H.
 // The int4 shapes are read as x = N, y = H, z = W, w = C4 slices.
 // An index that exceeds an input's extent along a dimension is pinned to that
 // input's last index. The difference is clamped to [act_min, act_max] before
 // it is stored.
 __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                     __write_only image2d_t output, const int4 a_shape, const int4 b_shape,
                                     const int4 output_shape, float act_min, float act_max) {
  const int c4_idx = get_global_id(0);  // C4
  const int w_idx = get_global_id(1);   // W
  const int nh_idx = get_global_id(2);  // N * H
  const int n_idx = nh_idx / output_shape.y;
  const int h_idx = nh_idx % output_shape.y;
  // Work-items that fall outside the output tensor have nothing to do.
  if (!(c4_idx < output_shape.w && w_idx < output_shape.z && nh_idx < output_shape.x * output_shape.y)) {
    return;
  }

  // Source texel of input_a: column = w * C4 + c4, row = n * H + h.
  const int2 a_pos = (int2)(min(w_idx, a_shape.z - 1) * a_shape.w + min(c4_idx, a_shape.w - 1),
                            min(n_idx, a_shape.x - 1) * a_shape.y + min(h_idx, a_shape.y - 1));
  // Source texel of input_b, addressed the same way with b's own shape.
  const int2 b_pos = (int2)(min(w_idx, b_shape.z - 1) * b_shape.w + min(c4_idx, b_shape.w - 1),
                            min(n_idx, b_shape.x - 1) * b_shape.y + min(h_idx, b_shape.y - 1));

  const FLT4 lhs = READ_IMAGE(input_a, smp_none, a_pos);
  const FLT4 rhs = READ_IMAGE(input_b, smp_none, b_pos);
  const FLT4 clamped = clamp(lhs - rhs, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(w_idx * output_shape.w + c4_idx, n_idx * output_shape.y + h_idx), clamped);
 }

 // Element-wise a * b over two image tensors with per-dimension broadcasting.
 // Work-item layout: dim 0 -> C4 slice, dim 1 -> W, dim 2 -> flattened N * H.
 // The int4 shapes are read as x = N, y = H, z = W, w = C4 slices.
 // An index that exceeds an input's extent along a dimension is pinned to that
 // input's last index. The product is clamped to [act_min, act_max] before it
 // is stored.
 __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                     __write_only image2d_t output, const int4 a_shape, const int4 b_shape,
                                     const int4 output_shape, float act_min, float act_max) {
  const int c4_idx = get_global_id(0);  // C4
  const int w_idx = get_global_id(1);   // W
  const int nh_idx = get_global_id(2);  // N * H
  const int n_idx = nh_idx / output_shape.y;
  const int h_idx = nh_idx % output_shape.y;
  // Work-items that fall outside the output tensor have nothing to do.
  if (!(c4_idx < output_shape.w && w_idx < output_shape.z && nh_idx < output_shape.x * output_shape.y)) {
    return;
  }

  // Source texel of input_a: column = w * C4 + c4, row = n * H + h.
  const int2 a_pos = (int2)(min(w_idx, a_shape.z - 1) * a_shape.w + min(c4_idx, a_shape.w - 1),
                            min(n_idx, a_shape.x - 1) * a_shape.y + min(h_idx, a_shape.y - 1));
  // Source texel of input_b, addressed the same way with b's own shape.
  const int2 b_pos = (int2)(min(w_idx, b_shape.z - 1) * b_shape.w + min(c4_idx, b_shape.w - 1),
                            min(n_idx, b_shape.x - 1) * b_shape.y + min(h_idx, b_shape.y - 1));

  const FLT4 lhs = READ_IMAGE(input_a, smp_none, a_pos);
  const FLT4 rhs = READ_IMAGE(input_b, smp_none, b_pos);
  const FLT4 clamped = clamp(lhs * rhs, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(w_idx * output_shape.w + c4_idx, n_idx * output_shape.y + h_idx), clamped);
 }

 // Element-wise a / b over two image tensors with per-dimension broadcasting.
 // Work-item layout: dim 0 -> C4 slice, dim 1 -> W, dim 2 -> flattened N * H.
 // The int4 shapes are read as x = N, y = H, z = W, w = C4 slices.
 // An index that exceeds an input's extent along a dimension is pinned to that
 // input's last index. The quotient is computed with the plain `/` operator
 // (no zero check on b) and clamped to [act_min, act_max] before it is stored.
 __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                     __write_only image2d_t output, const int4 a_shape, const int4 b_shape,
                                     const int4 output_shape, float act_min, float act_max) {
  const int c4_idx = get_global_id(0);  // C4
  const int w_idx = get_global_id(1);   // W
  const int nh_idx = get_global_id(2);  // N * H
  const int n_idx = nh_idx / output_shape.y;
  const int h_idx = nh_idx % output_shape.y;
  // Work-items that fall outside the output tensor have nothing to do.
  if (!(c4_idx < output_shape.w && w_idx < output_shape.z && nh_idx < output_shape.x * output_shape.y)) {
    return;
  }

  // Source texel of input_a: column = w * C4 + c4, row = n * H + h.
  const int2 a_pos = (int2)(min(w_idx, a_shape.z - 1) * a_shape.w + min(c4_idx, a_shape.w - 1),
                            min(n_idx, a_shape.x - 1) * a_shape.y + min(h_idx, a_shape.y - 1));
  // Source texel of input_b, addressed the same way with b's own shape.
  const int2 b_pos = (int2)(min(w_idx, b_shape.z - 1) * b_shape.w + min(c4_idx, b_shape.w - 1),
                            min(n_idx, b_shape.x - 1) * b_shape.y + min(h_idx, b_shape.y - 1));

  const FLT4 lhs = READ_IMAGE(input_a, smp_none, a_pos);
  const FLT4 rhs = READ_IMAGE(input_b, smp_none, b_pos);
  const FLT4 clamped = clamp(lhs / rhs, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(w_idx * output_shape.w + c4_idx, n_idx * output_shape.y + h_idx), clamped);
 }

 // Adds the scalar b to every component of each texel of input_a and stores
 // the result at the same coordinate in output. Work-items whose coordinate
 // lies outside output_shape (x, y) return without touching the image.
 __kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  const int col = get_global_id(0);
  const int row = get_global_id(1);
  if (!(col < output_shape.x && row < output_shape.y)) {
    return;
  }

  const int2 pos = (int2)(col, row);
  const FLT scalar = (FLT)b;
  const FLT4 texel = READ_IMAGE(input_a, smp_none, pos);
  WRITE_IMAGE(output, pos, texel + scalar);
 }

 // Subtracts the scalar b from every component of each texel of input_a and
 // stores the result at the same coordinate in output. Work-items whose
 // coordinate lies outside output_shape (x, y) return without touching the image.
 __kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  const int col = get_global_id(0);
  const int row = get_global_id(1);
  if (!(col < output_shape.x && row < output_shape.y)) {
    return;
  }

  const int2 pos = (int2)(col, row);
  const FLT scalar = (FLT)b;
  const FLT4 texel = READ_IMAGE(input_a, smp_none, pos);
  WRITE_IMAGE(output, pos, texel - scalar);
 }

 // Multiplies every component of each texel of input_a by the scalar b and
 // stores the result at the same coordinate in output. Work-items whose
 // coordinate lies outside output_shape (x, y) return without touching the image.
 __kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  const int col = get_global_id(0);
  const int row = get_global_id(1);
  if (!(col < output_shape.x && row < output_shape.y)) {
    return;
  }

  const int2 pos = (int2)(col, row);
  const FLT scalar = (FLT)b;
  const FLT4 texel = READ_IMAGE(input_a, smp_none, pos);
  WRITE_IMAGE(output, pos, texel * scalar);
 }

 // Divides every component of each texel of input_a by the scalar b (through
 // divide_no_check) and stores the result at the same coordinate in output.
 // Work-items whose coordinate lies outside output_shape (x, y) return without
 // touching the image.
 __kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  const int col = get_global_id(0);
  const int row = get_global_id(1);
  if (!(col < output_shape.x && row < output_shape.y)) {
    return;
  }

  const int2 pos = (int2)(col, row);
  const FLT4 texel = READ_IMAGE(input_a, smp_none, pos);
  WRITE_IMAGE(output, pos, divide_no_check(texel, (FLT)b));
 }
 __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
                               const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -292,11 +377,13 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b)));
  FLT4 result = AS_FLT4(AS_UINT4(a) & (UINT4)((FLT)b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                              const int2 output_shape) {
                              const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -304,11 +391,13 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b)));
  FLT4 result = AS_FLT4(AS_UINT4(a) | (UINT4)((FLT)b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
                               const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -316,11 +405,13 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b));
  FLT4 result = max(a, (FLT4)b);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
                               const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -328,11 +419,13 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b));
  FLT4 result = min(a, (FLT4)b);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
                                    const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -340,11 +433,13 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)));
  FLT4 result = floor(divide_no_check(a, (FLT4)b));
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
                                    const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -352,11 +447,13 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(divide_no_check(a, (FLT4)b)) * (FLT)b);
  FLT4 result = floor(divide_no_check(a, (FLT4)b)) * (FLT)b;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                             const int2 output_shape) {
                                             const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -364,11 +461,13 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2));
  FLT4 result = pown((a - (FLT4)b), (int4)2);
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                 const int2 output_shape) {
                                 const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -376,11 +475,13 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a == (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a == (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
                                    const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -388,11 +489,13 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a != (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a != (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
                                const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -400,11 +503,13 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a < (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a < (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                     const int2 output_shape) {
                                     const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -412,11 +517,13 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a <= (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                   const int2 output_shape) {
                                   const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -424,11 +531,13 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a > (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a > (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                        const int2 output_shape) {
                                        const int2 output_shape, float act_min, float act_max) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
@@ -436,7 +545,9 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b,
  }

  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f);
  FLT4 result = a >= (FLT4)b ? (FLT4)1.f : (FLT4).0f;
  result = clamp(result, (FLT)(act_min), (FLT)(act_max));
  WRITE_IMAGE(output, (int2)(X, Y), result);
 }

 __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -32,13 +32,7 @@ using mindspore::schema::PrimitiveType_Eltwise;

 namespace mindspore::kernel {

 // Destructor, pre-change form in this diff: if weight_ptr_ is set, hands it back to the
 // runtime's allocator via Free() and resets the member to nullptr.
 ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {
  if (weight_ptr_ != nullptr) {
    auto allocator = ocl_runtime_->GetAllocator();
    allocator->Free(weight_ptr_);
    weight_ptr_ = nullptr;
  }
 }
 // Destructor, post-change form in this diff: empty body.
 // NOTE(review): nothing visible here frees the entries of inputs_weight_ptrs_ — confirm who owns them.
 ArithmeticOpenCLKernel::~ArithmeticOpenCLKernel() {}

 std::vector<size_t> ArithmeticOpenCLKernel::InitGlobalSize() const {
  const size_t global_x = out_tensors_[0]->Width();
@@ -114,90 +108,83 @@ int ArithmeticOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_si
 }

 int ArithmeticOpenCLKernel::InitBuffer() {
  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
  if (!arithmetic_parameter->broadcasting_) {
    if (in_tensors_[1]->category() == lite::Tensor::Category::CONST && in_tensors_[1]->data_c() != nullptr) {
  auto fp16_enable = ocl_runtime_->GetFp16Enable();
  auto data_size = fp16_enable ? sizeof(float16_t) : sizeof(float);
  for (auto in_tensor_ : in_tensors_) {
    auto nhwc_shape = GetNHWCShape(in_tensor_->shape());
    inputs_nhwc_shapes_.push_back(nhwc_shape);
    if (in_tensor_->category() != lite::Tensor::Category::CONST || in_tensor_->data_c() == nullptr) {
      inputs_weight_ptrs_.push_back(nullptr);
    } else {
      auto allocator = ocl_runtime_->GetAllocator();
      std::vector<size_t> img_size;
      GetImageSize(0, &img_size);
      int pack_weight_size = in_tensors_[1]->ElementsC4Num();
      int plane = in_tensors_[1]->Height() * in_tensors_[1]->Width();
      int channel = in_tensors_[1]->Channel();
      int batch = in_tensors_[1]->Batch();

      if (in_tensors_[0]->GetFormat() == in_tensors_[1]->GetFormat()) {
        if (in_tensors_[0]->data_type() == in_tensors_[1]->data_type()) {
          weight_ptr_ =
            allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size);
        } else {
          MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                        << in_tensors_[0]->data_type();
      std::vector<size_t> img_size = GetImage2dShapeFromNHWC(nhwc_shape, op_format_);
      int pack_weight_size = img_size[0] * img_size[1] * C4NUM;
      int plane = nhwc_shape[1] * nhwc_shape[2];
      int channel = nhwc_shape[3];
      int batch = nhwc_shape[0];
      img_size.push_back(fp16_enable ? CL_HALF_FLOAT : CL_FLOAT);
      if (!fp16_enable) {
        float *weight = new (std::nothrow) float[pack_weight_size];
        if (weight == nullptr) {
          MS_LOG(ERROR) << "Malloc buffer failed!";
          return RET_ERROR;
        }
      } else if (in_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
        if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
          if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
            float *weight = new (std::nothrow) float[pack_weight_size];
            if (weight == nullptr) {
              MS_LOG(ERROR) << "Malloc buffer failed!";
              return RET_ERROR;
            }
        memset(weight, 0x00, pack_weight_size * data_size);
        if (op_format_ == schema::Format_NHWC4) {
          if (in_tensor_->data_type() == kNumberTypeFloat32) {
            std::function<float(float)> to_dtype = [](float x) -> float { return x; };
            PackNHWCToNC4HW4<float, float>(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype);
            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
            delete[] weight;
          } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
            float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
            if (weight == nullptr) {
              MS_LOG(ERROR) << "Malloc buffer failed!";
              return RET_ERROR;
            }
            std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
            PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype);
            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
            delete[] weight;
          } else {
            MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                          << in_tensors_[0]->data_type();
            return RET_ERROR;
            PackNHWCToNHWC4<float, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
            std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
            PackNHWCToNHWC4<float16_t, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          }
        } else if (op_format_ == schema::Format_NC4HW4) {
          if (in_tensor_->data_type() == kNumberTypeFloat32) {
            std::function<float(float)> to_dtype = [](float x) -> float { return x; };
            PackNHWCToNC4HW4<float, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
            std::function<float(float16_t)> to_dtype = [](float16_t x) -> float { return static_cast<float>(x); };
            PackNHWCToNC4HW4<float16_t, float>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          }
        } else {
          MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to "
                        << in_tensors_[0]->GetFormat();
        }
        if (batch * plane * channel == 1) {
          // scalar
          weight[3] = weight[2] = weight[1] = weight[0];
        }
        auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size);
        inputs_weight_ptrs_.push_back(weight_ptr_);
        delete[] weight;
      } else {
        float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
        if (weight == nullptr) {
          MS_LOG(ERROR) << "Malloc buffer failed!";
          return RET_ERROR;
        }
      } else if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
        if (in_tensors_[1]->GetFormat() == schema::Format_NHWC) {
          if (in_tensors_[0]->data_type() == kNumberTypeFloat32) {
            float *weight = new (std::nothrow) float[pack_weight_size];
            if (weight == nullptr) {
              MS_LOG(ERROR) << "Malloc buffer failed!";
              return RET_ERROR;
            }
            std::function<float(float)> to_dtype = [](float x) -> float { return x; };
            PackNHWCToNHWC4<float, float>(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype);
            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
            delete[] weight;
          } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
            float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
            if (weight == nullptr) {
              MS_LOG(ERROR) << "Malloc buffer failed!";
              return RET_ERROR;
            }
        memset(weight, 0x00, pack_weight_size * data_size);
        if (op_format_ == schema::Format_NHWC4) {
          if (in_tensor_->data_type() == kNumberTypeFloat32) {
            std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
            PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->data_c(), weight, batch, plane, channel, to_dtype);
            weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
            delete[] weight;
          } else {
            MS_LOG(ERROR) << "Unsupport data type transpose from " << in_tensors_[1]->data_type() << "to "
                          << in_tensors_[0]->data_type();
            return RET_ERROR;
            PackNHWCToNHWC4<float, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
            std::function<float16_t(float16_t)> to_dtype = [](float16_t x) -> float16_t { return x; };
            PackNHWCToNHWC4<float16_t, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          }
        } else if (op_format_ == schema::Format_NC4HW4) {
          if (in_tensor_->data_type() == kNumberTypeFloat32) {
            std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
            PackNHWCToNC4HW4<float, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          } else if (in_tensor_->data_type() == kNumberTypeFloat16) {
            std::function<float16_t(float16_t)> to_dtype = [](float16_t x) -> float16_t { return x; };
            PackNHWCToNC4HW4<float16_t, float16_t>(in_tensor_->data_c(), weight, batch, plane, channel, to_dtype);
          }
        } else {
          MS_LOG(ERROR) << "Unsupport format transpose from " << in_tensors_[1]->GetFormat() << "to "
                        << in_tensors_[0]->GetFormat();
          return RET_ERROR;
        }
        if (batch * plane * channel == 1) {
          // scalar
          weight[3] = weight[2] = weight[1] = weight[0];
        }
        auto weight_ptr_ = allocator->CreateImageFromHost(weight, pack_weight_size, img_size);
        inputs_weight_ptrs_.push_back(weight_ptr_);
        delete[] weight;
      }
    }
  }
@@ -211,7 +198,13 @@ int ArithmeticOpenCLKernel::Init() {

  if (arithmetic_parameter->broadcasting_) {
    element_flag_ = false;
    kernel_name = "Broadcast";
    if (op_format_ == schema::Format_NHWC4) {
      kernel_name = "BroadcastNHWC4";
    } else {
      kernel_name = "BroadcastNC4HW4";
      MS_LOG(ERROR) << "Don't support BroadcastNC4HW4 yet";
      return RET_ERROR;
    }
  } else {
    kernel_name = "Element";
  }
@@ -277,15 +270,14 @@ int ArithmeticOpenCLKernel::Init() {
    case schema::ActivationType_NO_ACTIVATION:
      break;
    case schema::ActivationType_RELU:
      if (op_parameter_->type_ == PrimitiveType_Add && element_flag_) {
        kernel_name += "ReLU";
      } else {
        MS_LOG(ERROR) << "Only support ElementAdd + ReLU";
        return RET_ERROR;
      }
      activation_min_ = 0.f;
      break;
    case schema::ActivationType_RELU6:
      activation_min_ = 0.f;
      activation_max_ = 6.f;
      break;
    default:
      MS_LOG(ERROR) << "Error activation type " << arithmetic_parameter->activation_type_;
      MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
      return RET_ERROR;
  }

@@ -328,47 +320,41 @@ int ArithmeticOpenCLKernel::Run() {
  MS_LOG(DEBUG) << this->name() << " Running!";

  int arg_idx = 0;
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
  if (element_flag_) {
    void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->data_c() : weight_ptr_;
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  auto input_0_ptr = inputs_weight_ptrs_[0] == nullptr ? in_tensors_[0]->data_c() : inputs_weight_ptrs_[0];
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_0_ptr);
  auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1];
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr);
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  if (!element_flag_) {
    cl_int4 input0_shape = {inputs_nhwc_shapes_[0][0], inputs_nhwc_shapes_[0][1], inputs_nhwc_shapes_[0][2],
                            UP_DIV(inputs_nhwc_shapes_[0][3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input0_shape);
    cl_int4 input1_shape = {inputs_nhwc_shapes_[1][0], inputs_nhwc_shapes_[1][1], inputs_nhwc_shapes_[1][2],
                            UP_DIV(inputs_nhwc_shapes_[1][3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input1_shape);
    auto out_shape = GetNHWCShape(out_tensors_[0]->shape());
    cl_int4 output_shape{out_shape[0], out_shape[1], out_shape[2], UP_DIV(out_shape[3], C4NUM)};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
    ocl_runtime_->RunKernel(kernel_,
                            {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]),
                             static_cast<size_t>(out_shape[1] * out_shape[0])},
                            {}, nullptr);
  } else {
    float weight = 0.f;
    if (in_tensors_[1]->data_type() == kNumberTypeFloat32) {
      weight = static_cast<float *>(in_tensors_[1]->data_c())[0];
    } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
      weight = static_cast<float>(static_cast<float16_t *>(in_tensors_[1]->data_c())[0]);
    } else {
      MS_LOG(ERROR) << "Unsupport data type " << in_tensors_[1]->data_type();
      return RET_ERROR;
    }
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, weight);
    cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_min_);
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, activation_max_);
    ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  }
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());

  cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
  return RET_OK;
 }

 kernel::LiteKernel *OpenCLBiasAddKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                               const lite::PrimitiveC *primitive);

 kernel::LiteKernel *OpenCLArithmeticKernelCreator(const std::vector<lite::Tensor *> &inputs,
                                                  const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
                                                  const lite::InnerContext *ctx, const kernel::KernelKey &desc,
                                                  const mindspore::lite::PrimitiveC *primitive) {
  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(opParameter);
  if (arithmetic_parameter->broadcasting_) {
    for (size_t i = 0; i < arithmetic_parameter->ndim_; i++) {
      if (arithmetic_parameter->in_shape1_[i] != 0 && arithmetic_parameter->in_shape1_[i] != 1) {
        return OpenCLBiasAddKernelCreator(inputs, outputs, opParameter, ctx, desc, primitive);
      }
    }
  }
  auto *kernel =
    new (std::nothrow) ArithmeticOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs, ctx);
  if (kernel == nullptr) {
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
@@ -42,7 +42,10 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {

  cl::Kernel kernel_;
  bool element_flag_{true};
  void *weight_ptr_{nullptr};
  float activation_min_{-FLT_MAX};
  float activation_max_{FLT_MAX};
  std::vector<std::vector<int>> inputs_nhwc_shapes_;
  std::vector<void *> inputs_weight_ptrs_;

  std::vector<size_t> local_size_;
  std::vector<size_t> global_size_;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -54,6 +54,12 @@ std::vector<size_t> ScaleOpenCLKernel::InitGlobalSize() const {

 void ScaleOpenCLKernel::Image2dGetWorkGroupSize() {
  local_size_ = {16, 16};
  if (out_tensors_[0]->shape().size() == 2) {
    size_t H = out_tensors_[0]->shape()[0];
    size_t W = UP_DIV(out_tensors_[0]->shape()[1], C4NUM);
    global_size_ = {W, H};
    return;
  }
  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
    size_t H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    size_t W = out_tensors_[0]->Width();
@@ -78,18 +84,23 @@ void ScaleOpenCLKernel::BufferGetWorkGroupSize() {

 int ScaleOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t im_dst_x, im_dst_y;
  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
    im_dst_x = out_tensors_[0]->Width();
    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
    im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
    im_dst_y = out_tensors_[0]->Batch();
    im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  if (out_tensors_[0]->shape().size() == 2) {
    im_dst_x = UP_DIV(out_tensors_[0]->shape()[1], C4NUM);
    im_dst_y = out_tensors_[0]->shape()[0];
  } else {
    MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
    return RET_ERROR;
    if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
      im_dst_x = out_tensors_[0]->Width();
      im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
      im_dst_x = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
      im_dst_y = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
    } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
      im_dst_y = out_tensors_[0]->Batch();
      im_dst_x = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    } else {
      MS_LOG(ERROR) << "Unsupport data format " << out_tensors_[0]->GetFormat();
      return RET_ERROR;
    }
  }

  size_t img_dtype = CL_FLOAT;
@@ -114,7 +125,7 @@ int ScaleOpenCLKernel::InitBuffer() {
    auto allocator = ocl_runtime_->GetAllocator();
    std::vector<size_t> img_size;
    GetImageSize(0, &img_size);
    if (in_tensors_[1]->shape().size() == 1 && axis_ == 3) {
    if (scale_C_flag_) {
      img_size[1] = 1;
      img_size[0] = UP_DIV(in_tensors_[1]->shape()[0], C4NUM);
      scale_ptr_ = allocator->CreateImageFromHost(in_tensors_[1]->data_c(), in_tensors_[1]->ElementsNum(), img_size);
@@ -256,8 +267,10 @@ int ScaleOpenCLKernel::Init() {
    if (scale_tensor->ElementsNum() == 1) {
      element_flag_ = false;
      kernel_name = "BoardcastScale";
    } else if (axis_ == 3 && scale_shape.size() == 1) {
    } else if (((in_shape.size() == 4 && axis_ == 3) || (in_shape.size() == 2 && axis_ == 1)) &&
               scale_shape.size() == 1) {
      element_flag_ = true;
      scale_C_flag_ = true;
      kernel_name = "Scale_C";
    }
  } else {
@@ -327,24 +340,9 @@ int ScaleOpenCLKernel::Run() {
    }
  }
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
  int H = 0;
  int W = 0;
  if (out_tensors_[0]->GetFormat() == schema::Format_NC4HW4) {
    H = out_tensors_[0]->Batch() * out_tensors_[0]->Height() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
    W = out_tensors_[0]->Width();
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
    H = out_tensors_[0]->Batch() * out_tensors_[0]->Height();
    W = out_tensors_[0]->Width() * UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else if (out_tensors_[0]->GetFormat() == schema::Format_NC4) {
    H = out_tensors_[0]->Batch();
    W = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
  } else {
    MS_LOG(ERROR) << "Error output type " << out_tensors_[0]->GetFormat();
    return RET_ERROR;
  }
  cl_int2 output_shape{W, H};
  cl_int2 output_shape{static_cast<int>(global_size_[0]), static_cast<int>(global_size_[1])};
  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, output_shape);
  if (element_flag_ && axis_ == 3) {
  if (element_flag_ && scale_C_flag_) {
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
  }
  ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr);
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
@@ -42,6 +42,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {

  cl::Kernel kernel_;
  bool element_flag_{true};
  bool scale_C_flag_{false};
  void *scale_ptr_{nullptr};
  void *offset_ptr_{nullptr};
  int axis_{0};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc
@@ -27,6 +27,8 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Nchw2Nhwc;
 using mindspore::schema::PrimitiveType_Nhwc2Nchw;
 using mindspore::schema::PrimitiveType_Transpose;

 namespace mindspore::kernel {
@@ -141,4 +143,8 @@ kernel::LiteKernel *OpenCLTransposeKernelCreator(const std::vector<lite::Tensor

 // Registrations of OpenCLTransposeKernelCreator under kGPU: one float32 and one float16 entry for
 // each of the Transpose, Nhwc2Nchw and Nchw2Nhwc primitive types.
 // NOTE(review): REG_KERNEL is defined outside this view — presumably it binds the creator to the
 // (arch, data type, primitive) key; confirm.
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Transpose, OpenCLTransposeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nhwc2Nchw, OpenCLTransposeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Nchw2Nhwc, OpenCLTransposeKernelCreator)
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -276,4 +276,41 @@ void PrintTensor(lite::Tensor *tensor, int num, const std::string &out_file) {
  }
  allocator->UnmapBuffer(origin_data);
 }

 // Expands a tensor shape of rank 1 to 4 into a four-element {n, h, w, c} vector.
 // Every slot starts at 1 and only the dimensions the rank supplies are overwritten:
 //   rank 1 -> c;  rank 2 -> n, c;  rank 3 -> n, h, c;  rank 4 -> n, h, w, c.
 // Any other rank leaves all four slots at 1.
 std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape) {
  std::vector<int> nhwc(4, 1);
  switch (tensor_shape.size()) {
    case 1:
      nhwc[3] = tensor_shape[0];
      break;
    case 2:
      nhwc[0] = tensor_shape[0];
      nhwc[3] = tensor_shape[1];
      break;
    case 3:
      // A rank-3 shape carries no width: it maps to n, h, c and w stays 1.
      nhwc[0] = tensor_shape[0];
      nhwc[1] = tensor_shape[1];
      nhwc[3] = tensor_shape[2];
      break;
    case 4:
      nhwc = tensor_shape;
      break;
    default:
      break;
  }
  return nhwc;
 }

 // Computes the {image_x, image_y} extent for a four-element NHWC shape.
 //   Format_NHWC4 : x = W * UP_DIV(C, C4NUM),  y = N * H
 //   Format_NC4HW4: x = W,                     y = N * H * UP_DIV(C, C4NUM)
 // A shape whose size is not 4, or any other format, yields {1, 1}.
 std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape, schema::Format format) {
  std::vector<size_t> image_shape = {1, 1};
  if (tensor_shape.size() == 4) {
    const int batch = tensor_shape[0];
    const int height = tensor_shape[1];
    const int width = tensor_shape[2];
    // Number of C4NUM-wide channel slices, rounded up.
    const auto channel_slices = UP_DIV(tensor_shape[3], C4NUM);
    if (format == schema::Format_NHWC4) {
      image_shape[0] = width * channel_slices;
      image_shape[1] = batch * height;
    } else if (format == schema::Format_NC4HW4) {
      image_shape[0] = width;
      image_shape[1] = batch * height * channel_slices;
    }
  }
  return image_shape;
 }
 }  // namespace mindspore::kernel
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -48,6 +48,10 @@ void Write2File(void *mem, const std::string &file_name, int size);

 void PrintTensor(lite::Tensor *tensor, int num = 10, const std::string &out_file = "");

 // Expands a tensor shape of rank 1 to 4 into {n, h, w, c}. Rank 1 supplies c; rank 2 supplies n and c;
 // rank 3 supplies n, h and c; rank 4 supplies all four. Anything not supplied, and every slot for
 // any other rank, is 1.
 std::vector<int> GetNHWCShape(const std::vector<int> &tensor_shape);

 // Returns {image_x, image_y} for a four-element NHWC shape.
 //   Format_NHWC4 : x = W * UP_DIV(C, C4NUM),  y = N * H
 //   Format_NC4HW4: x = W,                     y = N * H * UP_DIV(C, C4NUM)
 // A shape whose size is not 4, or any other format, yields {1, 1}.
 std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape, schema::Format format);

 template <class T1, class T2>
 void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
  int c4 = UP_DIV(channel, C4NUM);
--- a/mindspore/lite/test/run_test.sh
+++ b/mindspore/lite/test/run_test.sh
@@ -37,8 +37,7 @@ cp -fr $TEST_DATA_DIR/testPK ./data
 ./lite-test --gtest_filter="TestBatchnormOpenCLCI.Batchnormfp32CI*"
 ./lite-test --gtest_filter="TestAvgPoolingOpenCL*"
 ./lite-test --gtest_filter="TestConv2dTransposeOpenCL*"
 ./lite-test --gtest_filter="TestMatMulOpenCL.MatMul2D*"
 ./lite-test --gtest_filter="TestMatMulOpenCL.MatMul4D*"
 ./lite-test --gtest_filter="TestMatMulOpenCL*"
 ./lite-test --gtest_filter="TestMaxPoolingOpenCL*"
 ./lite-test --gtest_filter="TestReduceOpenCL*"
 ./lite-test --gtest_filter="TestReshapeOpenCL*"
@@ -46,3 +45,5 @@ cp -fr $TEST_DATA_DIR/testPK ./data
 ./lite-test --gtest_filter="TestTransposeOpenCL*"
 ./lite-test --gtest_filter="TestArithmeticOpenCL*"
 ./lite-test --gtest_filter="TestScaleOpenCL*"
 ./lite-test --gtest_filter="TestFullConnectionOpenCL*"
 ./lite-test --gtest_filter="TestResizeOpenCL*"