@@ -54,7 +54,307 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_
  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
}
__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
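// Element-wise binary kernels: each work-item reads one FLT4 from input_a and input_b at (X, Y),
// applies the operator, and writes the result to output.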
__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & as_int4(b)));
}
__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
                            const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | as_int4(b)));
}
__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, b));
}
__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                             __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, b));
}
__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(a / b));
}
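// Floor-mod follows the identity a mod b = a - floor(a / b) * b.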
__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, b)) * b);
}
__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                           __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2));
}
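// Relational operators on vector types yield a per-component mask (all bits set for true, 0 for false);
// AS_FLT4 reinterprets that mask as FLT4 so it can be stored through WRITE_IMAGE.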
__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                               __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b));
}
__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                  __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b));
}
__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                              __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b));
}
__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                   __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b));
}
__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                 __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b));
}
__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
                                      __write_only image2d_t output, const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b));
}
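// Broadcast kernels: the second operand is a single scalar float b applied to every element of input_a.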
__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b);
}
__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b);
}
__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b);
}
__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b));
}
__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & (int4)(b)));
}
__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                              const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | (int4)b));
}
__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b));
}
__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                               const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b));
}
__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b));
}
__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, (FLT4)b)) * (FLT)b);
}
__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                             const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2));
}
__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                 const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
@@ -63,7 +363,67 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias);
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b));
}
__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                    const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b));
}
__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b));
}
__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                     const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b));
}
__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                   const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b));
}
__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                        const int2 output_shape) {
  int X = get_global_id(0);
  int Y = get_global_id(1);
  if (X >= output_shape.x || Y >= output_shape.y) {
    return;
  }
  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b));
}
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
@@ -94,9 +454,26 @@ __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, _
  output[idx] = input_a[idx] * input_b[idx];
}
__kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output,
                                 const unsigned int n) {
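// Buffer variants of the broadcast kernels: one work-item per element, with the scalar operand b.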
__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] + (FLT)b;
}
__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] - (FLT)b;
}
__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = input_a[idx] * (FLT)b;
}
__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  output[idx] = weight * input_a[idx] + bias;
  output[idx] = divide_no_check(input_a[idx], (FLT)b);
}
@@ -28,6 +28,7 @@
using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::schema::PrimitiveType_Eltwise;
namespace mindspore::kernel {
@@ -130,18 +131,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
      if (weight == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
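      // For fp16 models, each fp32 weight value is converted to float16 while the tensor is packed.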
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else {
@@ -162,18 +163,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
      if (weight == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else {
@@ -197,28 +198,69 @@ int ArithmeticOpenCLKernel::Init() {
  std::string kernel_name;
  const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
  if (arithmetic_parameter->broadcasting_) {
    element_flag_ = false;
    kernel_name = "BoardcastArith";
    kernel_name = "Broadcast";
  } else {
    element_flag_ = true;
    switch (op_parameter_->type_) {
      case PrimitiveType_Mul:
        kernel_name = "ElementMul";
        break;
      case PrimitiveType_Add:
        kernel_name = "ElementAdd";
        break;
      case PrimitiveType_Sub:
        kernel_name = "ElementSub";
        break;
      case PrimitiveType_Div:
        kernel_name = "ElementDiv";
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
        break;
    }
    kernel_name = "Element";
  }
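  // Append the operator suffix to the "Element" / "Broadcast" prefix chosen above to form the kernel name.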
  switch (op_parameter_->type_) {
    case PrimitiveType_Mul:
      kernel_name += "Mul";
      break;
    case PrimitiveType_Add:
      kernel_name += "Add";
      break;
    case PrimitiveType_Sub:
      kernel_name += "Sub";
      break;
    case PrimitiveType_Div:
      kernel_name += "Div";
      break;
    case PrimitiveType_LogicalAnd:
      kernel_name += "And";
      break;
    case PrimitiveType_LogicalOr:
      kernel_name += "Or";
      break;
    case PrimitiveType_Maximum:
      kernel_name += "Max";
      break;
    case PrimitiveType_Minimum:
      kernel_name += "Min";
      break;
    case PrimitiveType_FloorDiv:
      kernel_name += "FloorDiv";
      break;
    case PrimitiveType_FloorMod:
      kernel_name += "FloorMod";
      break;
    case PrimitiveType_SquaredDifference:
      kernel_name += "SquaredDifference";
      break;
    case PrimitiveType_Equal:
      kernel_name += "Equal";
      break;
    case PrimitiveType_NotEqual:
      kernel_name += "NotEqual";
      break;
    case PrimitiveType_Less:
      kernel_name += "Less";
      break;
    case PrimitiveType_LessEqual:
      kernel_name += "LessEqual";
      break;
    case PrimitiveType_Greater:
      kernel_name += "Greater";
      break;
    case PrimitiveType_GreaterEqual:
      kernel_name += "GreaterEqual";
      break;
    default:
      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
      return RET_ERROR;
  }
  lite::STATUS error_code = RET_OK;
@@ -265,26 +307,8 @@ int ArithmeticOpenCLKernel::Run() {
    void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->MutableData() : weight_ptr_;
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  } else {
    float value = static_cast<float *>(in_tensors_[1]->MutableData())[0];
    switch (op_parameter_->type_) {
      case PrimitiveType_Mul:
        weight_ = value;
        break;
      case PrimitiveType_Add:
        bias_ = value;
        break;
      case PrimitiveType_Sub:
        bias_ = -1 * value;
        break;
      case PrimitiveType_Div:
        weight_ = 1 / value;
        break;
      default:
        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
        break;
    }
    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
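    // The scalar operand is passed to the Broadcast* kernels as-is; each kernel applies its own operator.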
    float weight = static_cast<float *>(in_tensors_[1]->MutableData())[0];
    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
  }
  runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
@@ -345,4 +369,36 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCr
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
}  // namespace mindspore::kernel
@@ -44,8 +44,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
  cl::Kernel kernel_;
  lite::opencl::OpenCLRuntime *runtime_;
  bool element_flag_{true};
  float weight_{1.f};
  float bias_{.0f};
  void *weight_ptr_{nullptr};
  std::vector<size_t> local_size_;
@@ -152,7 +152,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -160,20 +160,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -202,7 +202,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -210,20 +210,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
      std::function<int16_t(float)> to_dtype = Float32ToShort;
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -328,8 +328,8 @@ int ScaleOpenCLKernel::Run() {
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
  } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
    int16_t scale = static_cast<int16_t *>(in_tensors_[1]->MutableData())[0];
    int16_t offset = static_cast<int16_t *>(in_tensors_[2]->MutableData())[0];
    float16_t scale = static_cast<float16_t *>(in_tensors_[1]->MutableData())[0];
    float16_t offset = static_cast<float16_t *>(in_tensors_[2]->MutableData())[0];
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
    ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
  } else {
@@ -300,12 +300,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
  if (fp16_enable_) {
    // fp16 enable, kernel will use half and read_imageh and write_imageh.
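    // AS_FLT4 lets the comparison kernels bit-cast their integer comparison mask back to the FLT4 type.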
    build_options_str =
      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
      "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 "
      "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 ";
  } else {
    // fp16 not enable, kernel will use float and read_imagef and write_imagef.
    build_options_str =
      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
      "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 "
      "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 ";
  }