@@ -54,7 +54,307 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_
   WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, b));
 }
-__kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, float bias, __write_only image2d_t output,
+__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & as_int4(b)));
+}
+__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
+                            const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | as_int4(b)));
+}
+__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), max(a, b));
+}
+__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                             __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), min(a, b));
+}
+__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(a / b));
+}
+__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, b)) * b);
+}
+__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                           __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), pown((a - b), (int4)2));
+}
+__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                               __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == b));
+}
+__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                  __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != b));
+}
+__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                              __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < b));
+}
+__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                   __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= b));
+}
+__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                 __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > b));
+}
+__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
+                                      __write_only image2d_t output, const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  FLT4 b = READ_IMAGE(input_b, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= b));
+}
+__kernel void BroadcastAdd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a + (FLT)b);
+}
+__kernel void BroadcastSub_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - (FLT)b);
+}
+__kernel void BroadcastMul_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a * (FLT)b);
+}
+__kernel void BroadcastDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), divide_no_check(a, (FLT)b));
+}
+__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) & (int4)(b)));
+}
+__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                              const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(as_int4(a) | (int4)b));
+}
+__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), max(a, (FLT4)b));
+}
+__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                               const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), min(a, (FLT4)b));
+}
+__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), floor(a / (FLT4)b));
+}
+__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), a - floor(divide_no_check(a, (FLT4)b)) * (FLT)b);
+}
+__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                             const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), pown((a - (FLT4)b), (int4)2));
+}
+__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
                                  const int2 output_shape) {
   int X = get_global_id(0);
   int Y = get_global_id(1);
@@ -63,7 +63,67 @@ __kernel void BoardcastArith_IMG(__read_only image2d_t input_a, float weight, fl
   }
   FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
-  WRITE_IMAGE(output, (int2)(X, Y), ((FLT)weight) * a + (FLT)bias);
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a == (FLT4)b));
+}
+__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                    const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a != (FLT4)b));
+}
+__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a < (FLT4)b));
+}
+__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                     const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a <= (FLT4)b));
+}
+__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                   const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a > (FLT4)b));
+}
+__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
+                                        const int2 output_shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  if (X >= output_shape.x || Y >= output_shape.y) {
+    return;
+  }
+  FLT4 a = READ_IMAGE(input_a, smp_none, (int2)(X, Y));
+  WRITE_IMAGE(output, (int2)(X, Y), AS_FLT4(a >= (FLT4)b));
 }
 __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
| @@ -94,9 +454,26 @@ __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, _ | |||||
| output[idx] = input_a[idx] * input_b[idx]; | output[idx] = input_a[idx] * input_b[idx]; | ||||
| } | } | ||||
| __kernel void BoardcastArith_BUF(__global float *input_a, float weight, float bias, __global float *output, | |||||
| const unsigned int n) { | |||||
| __kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] + (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] - (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] * (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | int idx = get_global_id(0); | ||||
| if (idx >= n) return; | if (idx >= n) return; | ||||
| output[idx] = weight * input_a[idx] + bias; | |||||
| output[idx] = divide_no_check(input_a[idx], (FLT)b); | |||||
| } | } | ||||
@@ -28,6 +28,7 @@
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
+using mindspore::schema::PrimitiveType_Eltwise;
 namespace mindspore::kernel {
@@ -130,18 +131,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
         MS_LOG(ERROR) << "Malloc buffer failed!";
         return RET_ERROR;
       }
-      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
       PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
       weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
       delete[] weight;
     } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
       if (weight == nullptr) {
         MS_LOG(ERROR) << "Malloc buffer failed!";
         return RET_ERROR;
       }
-      std::function<int16_t(float)> to_dtype = Float32ToShort;
-      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
       weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
       delete[] weight;
     } else {
@@ -162,18 +163,18 @@ int ArithmeticOpenCLKernel::InitBuffer() {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
-      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-      int16_t *weight = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *weight = new (std::nothrow) float16_t[pack_weight_size];
      if (weight == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
-      std::function<int16_t(float)> to_dtype = Float32ToShort;
-      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
+      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), weight, batch, plane, channel, to_dtype);
      weight_ptr_ = allocator->CreateImageFromHost(weight, in_tensors_[1]->ElementsNum(), img_size);
      delete[] weight;
    } else {
@@ -197,28 +198,69 @@ int ArithmeticOpenCLKernel::Init() {
   std::string kernel_name;
   const ArithmeticParameter *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
   if (arithmetic_parameter->broadcasting_) {
     element_flag_ = false;
-    kernel_name = "BoardcastArith";
+    kernel_name = "Broadcast";
   } else {
-    element_flag_ = true;
-    switch (op_parameter_->type_) {
-      case PrimitiveType_Mul:
-        kernel_name = "ElementMul";
-        break;
-      case PrimitiveType_Add:
-        kernel_name = "ElementAdd";
-        break;
-      case PrimitiveType_Sub:
-        kernel_name = "ElementSub";
-        break;
-      case PrimitiveType_Div:
-        kernel_name = "ElementDiv";
-        break;
-      default:
-        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-        break;
-    }
+    kernel_name = "Element";
+  }
+  switch (op_parameter_->type_) {
+    case PrimitiveType_Mul:
+      kernel_name += "Mul";
+      break;
+    case PrimitiveType_Add:
+      kernel_name += "Add";
+      break;
+    case PrimitiveType_Sub:
+      kernel_name += "Sub";
+      break;
+    case PrimitiveType_Div:
+      kernel_name += "Div";
+      break;
+    case PrimitiveType_LogicalAnd:
+      kernel_name += "And";
+      break;
+    case PrimitiveType_LogicalOr:
+      kernel_name += "Or";
+      break;
+    case PrimitiveType_Maximum:
+      kernel_name += "Max";
+      break;
+    case PrimitiveType_Minimum:
+      kernel_name += "Min";
+      break;
+    case PrimitiveType_FloorDiv:
+      kernel_name += "FloorDiv";
+      break;
+    case PrimitiveType_FloorMod:
+      kernel_name += "FloorMod";
+      break;
+    case PrimitiveType_SquaredDifference:
+      kernel_name += "SquaredDifference";
+      break;
+    case PrimitiveType_Equal:
+      kernel_name += "Equal";
+      break;
+    case PrimitiveType_NotEqual:
+      kernel_name += "NotEqual";
+      break;
+    case PrimitiveType_Less:
+      kernel_name += "Less";
+      break;
+    case PrimitiveType_LessEqual:
+      kernel_name += "LessEqual";
+      break;
+    case PrimitiveType_Greater:
+      kernel_name += "Greater";
+      break;
+    case PrimitiveType_GreaterEqual:
+      kernel_name += "GreaterEqual";
+      break;
+    default:
+      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
+      return RET_ERROR;
   }
   lite::STATUS error_code = RET_OK;
@@ -265,26 +307,8 @@ int ArithmeticOpenCLKernel::Run() {
     void *weight = weight_ptr_ == nullptr ? in_tensors_[1]->MutableData() : weight_ptr_;
     runtime_->SetKernelArg(kernel_, arg_idx++, weight);
   } else {
-    float value = static_cast<float *>(in_tensors_[1]->MutableData())[0];
-    switch (op_parameter_->type_) {
-      case PrimitiveType_Mul:
-        weight_ = value;
-        break;
-      case PrimitiveType_Add:
-        bias_ = value;
-        break;
-      case PrimitiveType_Sub:
-        bias_ = -1 * value;
-        break;
-      case PrimitiveType_Div:
-        weight_ = 1 / value;
-        break;
-      default:
-        MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-        break;
-    }
-    runtime_->SetKernelArg(kernel_, arg_idx++, weight_);
-    runtime_->SetKernelArg(kernel_, arg_idx++, bias_);
+    float weight = static_cast<float *>(in_tensors_[1]->MutableData())[0];
+    runtime_->SetKernelArg(kernel_, arg_idx++, weight);
   }
   runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->MutableData());
@@ -345,4 +369,36 @@ REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Mul, OpenCLArithmeticKernelCr
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
 REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Mul, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Add, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Sub, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Div, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalAnd, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LogicalOr, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Maximum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Minimum, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorDiv, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_FloorMod, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SquaredDifference, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Equal, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_NotEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Less, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_LessEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Greater, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_GreaterEqual, OpenCLArithmeticKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Eltwise, OpenCLArithmeticKernelCreator)
 } // namespace mindspore::kernel
@@ -44,8 +44,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel {
   cl::Kernel kernel_;
   lite::opencl::OpenCLRuntime *runtime_;
   bool element_flag_{true};
-  float weight_{1.f};
-  float bias_{.0f};
   void *weight_ptr_{nullptr};
   std::vector<size_t> local_size_;
@@ -152,7 +152,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
-      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNC4HW4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNC4HW4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -160,20 +160,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
-      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
-      std::function<int16_t(float)> to_dtype = Float32ToShort;
-      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
-      PackNHWCToNC4HW4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
+      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
+      PackNHWCToNC4HW4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -202,7 +202,7 @@ int ScaleOpenCLKernel::InitBuffer() {
        delete[] scale;
        return RET_ERROR;
      }
-      std::function<float(float)> to_dtype = [](float x) -> float { return (float)x; };
+      std::function<float(float)> to_dtype = [](float x) -> float { return x; };
      PackNHWCToNHWC4<float, float>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
      PackNHWCToNHWC4<float, float>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
@@ -210,20 +210,20 @@ int ScaleOpenCLKernel::InitBuffer() {
      delete[] scale;
      delete[] offset;
    } else if (in_tensors_[0]->data_type() == kNumberTypeFloat16) {
-      int16_t *scale = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *scale = new (std::nothrow) float16_t[pack_weight_size];
      if (scale == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        return RET_ERROR;
      }
-      int16_t *offset = new (std::nothrow) int16_t[pack_weight_size];
+      float16_t *offset = new (std::nothrow) float16_t[pack_weight_size];
      if (offset == nullptr) {
        MS_LOG(ERROR) << "Malloc buffer failed!";
        delete[] scale;
        return RET_ERROR;
      }
-      std::function<int16_t(float)> to_dtype = Float32ToShort;
-      PackNHWCToNHWC4<float, int16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
-      PackNHWCToNHWC4<float, int16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
+      std::function<float16_t(float)> to_dtype = [](float x) -> float16_t { return static_cast<float16_t>(x); };
+      PackNHWCToNHWC4<float, float16_t>(in_tensors_[1]->MutableData(), scale, batch, plane, channel, to_dtype);
+      PackNHWCToNHWC4<float, float16_t>(in_tensors_[2]->MutableData(), offset, batch, plane, channel, to_dtype);
      scale_ptr_ = allocator->CreateImageFromHost(scale, in_tensors_[1]->ElementsNum(), img_size);
      offset_ptr_ = allocator->CreateImageFromHost(offset, in_tensors_[2]->ElementsNum(), img_size);
      delete[] scale;
@@ -328,8 +328,8 @@ int ScaleOpenCLKernel::Run() {
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
   } else if (in_tensors_[1]->data_type() == kNumberTypeFloat16) {
-    int16_t scale = static_cast<int16_t *>(in_tensors_[1]->MutableData())[0];
-    int16_t offset = static_cast<int16_t *>(in_tensors_[2]->MutableData())[0];
+    float16_t scale = static_cast<float16_t *>(in_tensors_[1]->MutableData())[0];
+    float16_t offset = static_cast<float16_t *>(in_tensors_[2]->MutableData())[0];
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(scale));
     ocl_runtime_->SetKernelArg(kernel_, arg_idx++, Float32ToShort(offset));
   } else {
@@ -300,12 +300,12 @@ int OpenCLRuntime::BuildKernel(cl::Kernel &kernel, const std::string &program_na
   if (fp16_enable_) {
     // fp16 enable, kernel will use half and read_imageh and write_imageh.
     build_options_str =
-      "-DFLT=half -DFLT4=half4 -DFLT16=half16 "
+      "-DFLT=half -DFLT4=half4 -DFLT16=half16 -DAS_FLT4=as_half4 "
      "-DWRITE_IMAGE=write_imageh -DREAD_IMAGE=read_imageh -DTO_FLT=convert_half -DTO_FLT4=convert_half4 ";
   } else {
     // fp16 not enable, kernel will use float and read_imagef and write_imagef.
     build_options_str =
-      "-DFLT=float -DFLT4=float4 -DFLT16=float16 "
+      "-DFLT=float -DFLT4=float4 -DFLT16=float16 -DAS_FLT4=as_float4 "
      "-DWRITE_IMAGE=write_imagef -DREAD_IMAGE=read_imagef -DTO_FLT=convert_float -DTO_FLT4=convert_float4 ";
   }