| @@ -17,22 +17,6 @@ __kernel void ArithmeticSelf_ElementAbs_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise absolute value for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementAbs_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = v.x >= 0 ? v.x : -v.x; | |||
|   v.y = v.y >= 0 ? v.y : -v.y; | |||
|   v.z = v.z >= 0 ? v.z : -v.z; | |||
|   v.w = v.w >= 0 ? v.w : -v.w; | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -49,22 +33,6 @@ __kernel void ArithmeticSelf_ElementCos_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise cosine for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementCos_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = cos(v.x); | |||
|   v.y = cos(v.y); | |||
|   v.z = cos(v.z); | |||
|   v.w = cos(v.w); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -81,22 +49,6 @@ __kernel void ArithmeticSelf_ElementSin_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise sine for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementSin_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = sin(v.x); | |||
|   v.y = sin(v.y); | |||
|   v.z = sin(v.z); | |||
|   v.w = sin(v.w); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -113,22 +65,6 @@ __kernel void ArithmeticSelf_ElementNeg_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise negation for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementNeg_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // Unary minus on the vector negates all four components at once. | |||
|   v = -v; | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -145,22 +81,6 @@ __kernel void ArithmeticSelf_ElementExp_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise natural exponential for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementExp_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = exp(v.x); | |||
|   v.y = exp(v.y); | |||
|   v.z = exp(v.z); | |||
|   v.w = exp(v.w); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -177,22 +97,6 @@ __kernel void ArithmeticSelf_ElementLog_NHWC4(__read_only image2d_t input0, __wr | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise natural logarithm for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Components that are not strictly positive are replaced by HUGE_VALF instead of calling log(). | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementLog_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // NOTE(review): the sentinel is +HUGE_VALF even for an input of 0 - confirm this is intended. | |||
|   v.x = v.x > 0 ? log(v.x) : HUGE_VALF; | |||
|   v.y = v.y > 0 ? log(v.y) : HUGE_VALF; | |||
|   v.z = v.z > 0 ? log(v.z) : HUGE_VALF; | |||
|   v.w = v.w > 0 ? log(v.w) : HUGE_VALF; | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -209,22 +113,6 @@ __kernel void ArithmeticSelf_ElementSquare_NHWC4(__read_only image2d_t input0, _ | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise square for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementSquare_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // Component-wise vector product squares all four lanes at once. | |||
|   v = v * v; | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -241,22 +129,6 @@ __kernel void ArithmeticSelf_ElementSqrt_NHWC4(__read_only image2d_t input0, __w | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise square root for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Negative components are replaced by HUGE_VALF instead of calling sqrt(); zero is passed to | |||
| // sqrt() so that sqrt(0) yields 0 rather than the HUGE_VALF sentinel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementSqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   int X = get_global_id(0);  // N*H | |||
|   int Y = get_global_id(1);  // W | |||
|   int Z = get_global_id(2);  // c/4 | |||
|   if (X >= output_shape.x * output_shape.y || Y >= output_shape.z || Z >= output_shape.w) { | |||
|     return; | |||
|   } | |||
|   FLT4 result = READ_IMAGE(input0, smp_none, (int2)((Y), (Z * output_shape.y + X))); | |||
|   // ">= 0" (not "> 0"): zero is inside the domain of sqrt and must map to 0. | |||
|   result.x = result.x >= 0 ? sqrt(result.x) : HUGE_VALF; | |||
|   result.y = result.y >= 0 ? sqrt(result.y) : HUGE_VALF; | |||
|   result.z = result.z >= 0 ? sqrt(result.z) : HUGE_VALF; | |||
|   result.w = result.w >= 0 ? sqrt(result.w) : HUGE_VALF; | |||
|   WRITE_IMAGE(output, (int2)((Y), (Z * output_shape.y + X)), result); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -273,22 +145,6 @@ __kernel void ArithmeticSelf_ElementRsqrt_NHWC4(__read_only image2d_t input0, __ | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise reciprocal square root for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Components that are not strictly positive are replaced by HUGE_VALF. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementRsqrt_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = v.x > 0 ? 1.0f / sqrt(v.x) : HUGE_VALF; | |||
|   v.y = v.y > 0 ? 1.0f / sqrt(v.y) : HUGE_VALF; | |||
|   v.z = v.z > 0 ? 1.0f / sqrt(v.z) : HUGE_VALF; | |||
|   v.w = v.w > 0 ? 1.0f / sqrt(v.w) : HUGE_VALF; | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -305,22 +161,6 @@ __kernel void ArithmeticSelf_ElementLogicalNot_NHWC4(__read_only image2d_t input | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise logical NOT for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // A component that is neither greater nor less than zero becomes 1, every other component becomes 0. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementLogicalNot_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   v.x = !(v.x > 0 || v.x < 0); | |||
|   v.y = !(v.y > 0 || v.y < 0); | |||
|   v.z = !(v.z > 0 || v.z < 0); | |||
|   v.w = !(v.w > 0 || v.w < 0); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -337,22 +177,6 @@ __kernel void ArithmeticSelf_ElementFloor_NHWC4(__read_only image2d_t input0, __ | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise floor for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementFloor_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // The vector overload of floor() rounds each of the four components down. | |||
|   v = floor(v); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -369,22 +193,6 @@ __kernel void ArithmeticSelf_ElementCeil_NHWC4(__read_only image2d_t input0, __w | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise ceiling for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementCeil_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // The vector overload of ceil() rounds each of the four components up. | |||
|   v = ceil(v); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| __kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
| int X = get_global_id(0); // N*H | |||
| @@ -401,18 +209,3 @@ __kernel void ArithmeticSelf_ElementRound_NHWC4(__read_only image2d_t input0, __ | |||
| WRITE_IMAGE(output, (int2)((Y)*output_shape.w + Z, (X)), result); | |||
| } | |||
| // Element-wise rounding for the NC4HW4 variant: each work-item rewrites one FLT4 texel. | |||
| // Ids 0/1/2 are bounded by output_shape.x * output_shape.y, output_shape.z and output_shape.w; | |||
| // the texel lives at column id1, row id2 * output_shape.y + id0 in both images. | |||
| __kernel void ArithmeticSelf_ElementRound_NC4HW4(__read_only image2d_t input0, __write_only image2d_t output, | |||
| int4 output_shape) { | |||
|   const int nh = get_global_id(0);  // N*H | |||
|   const int w = get_global_id(1);   // W | |||
|   const int c4 = get_global_id(2);  // c/4 | |||
|   const bool inside = nh < output_shape.x * output_shape.y && w < output_shape.z && c4 < output_shape.w; | |||
|   if (!inside) { | |||
|     return; | |||
|   } | |||
|   const int2 pos = (int2)(w, c4 * output_shape.y + nh); | |||
|   FLT4 v = READ_IMAGE(input0, smp_none, pos); | |||
|   // The vector overload of round() rounds each of the four components. | |||
|   v = round(v); | |||
|   WRITE_IMAGE(output, pos, v); | |||
| } | |||
| @@ -1,131 +1,48 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| #define C4NUM 4 | |||
| // Reads one sparse index from texel (0, 0) of input and writes a single FLT4 to output: | |||
| // all four lanes hold default_value except lane (index % C4NUM), which receives weight. | |||
| // The destination row is index / C4NUM; input_shape is not used by this kernel. | |||
| __kernel void SparseToDenseScalarDim0(__read_only image2d_t input, __write_only image2d_t output, float weight, | |||
| int2 input_shape, float default_value) { | |||
|   FLT4 sparse_index = READ_IMAGE(input, smp_zero, (int2)(0, 0)); | |||
|   FLT4 texel = {default_value, default_value, default_value, default_value}; | |||
|   int row = sparse_index.x / C4NUM; | |||
|   switch ((int)(sparse_index.x) % C4NUM) { | |||
|     case 0: | |||
|       texel.x = weight; | |||
|       break; | |||
|     case 1: | |||
|       texel.y = weight; | |||
|       break; | |||
|     case 2: | |||
|       texel.z = weight; | |||
|       break; | |||
|     default:  // every other remainder lands in the last lane | |||
|       texel.w = weight; | |||
|       break; | |||
|   } | |||
|   WRITE_IMAGE(output, (int2)(0, row), texel); | |||
| } | |||
| // Walks rows 0 .. input_shape.x - 1 of column 0 of input. The x lane of each texel is the | |||
| // destination row in output; the texel is written there with its x lane replaced by weight | |||
| // (the remaining lanes are forwarded as read). default_value is not used by this kernel. | |||
| __kernel void SparseToDenseScalarDim1(__read_only image2d_t input, __write_only image2d_t output, float weight, | |||
| int2 input_shape, float default_value) { | |||
|   const int count = input_shape.x; | |||
|   for (int row = 0; row < count; ++row) { | |||
|     FLT4 texel = READ_IMAGE(input, smp_zero, (int2)(0, row)); | |||
|     int dst_row = texel.x; | |||
|     texel.x = weight; | |||
|     WRITE_IMAGE(output, (int2)(0, dst_row), texel); | |||
|   } | |||
| } | |||
| // Walks rows 0 .. input_shape.x - 1 of column 0 of input. The x lane of texel i is the | |||
| // destination row in output; the texel is written there with its x lane replaced by weight[i] | |||
| // (the remaining lanes are forwarded as read). default_value is not used by this kernel. | |||
| __kernel void SparseToDenseVectorDim1(__read_only image2d_t input, __write_only image2d_t output, | |||
| __global float *weight, int2 input_shape, float default_value) { | |||
|   const int count = input_shape.x; | |||
|   for (int row = 0; row < count; ++row) { | |||
|     FLT4 texel = READ_IMAGE(input, smp_zero, (int2)(0, row)); | |||
|     int dst_row = texel.x; | |||
|     // The weight cursor advances once per row, so it always equals the row index. | |||
|     texel.x = weight[row]; | |||
|     WRITE_IMAGE(output, (int2)(0, dst_row), texel); | |||
|   } | |||
| } | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void SparseToDenseScalarDim2Shape2(__read_only image2d_t input, __write_only image2d_t output, float weight, | |||
| int2 input_shape, float default_value) { | |||
| FLT temp[8] = {default_value, default_value, default_value, default_value, | |||
| default_value, default_value, default_value, default_value}; | |||
| FLT result_temp[8] = {default_value, default_value, default_value, default_value, | |||
| default_value, default_value, default_value, default_value}; | |||
| int index = 0; // 0~4 | |||
| int X = 0; | |||
| FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0)); | |||
| int Y = (int)index_begin.x; // N | |||
| temp[index] = index_begin.y; // c/4 | |||
| for (int i = 1; i < input_shape.x && index < C4NUM; ++i) { | |||
| FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i)); | |||
| if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) { | |||
| index++; | |||
| if (index < C4NUM) { | |||
| temp[index] = index_input.y; | |||
| } | |||
| } else { | |||
| for (int j = 0; j <= index && index < C4NUM; ++j) { | |||
| int decimal = (int)temp[j] % C4NUM; | |||
| result_temp[decimal] = weight; | |||
| X = ((int)temp[0]) / C4NUM; | |||
| } | |||
| FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; | |||
| WRITE_IMAGE(output, (int2)(X, Y), result); | |||
| index = 0; | |||
| Y = (int)index_input.x; | |||
| temp[0] = index_input.y; | |||
| temp[1] = temp[2] = temp[3] = default_value; | |||
| result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value; | |||
| } | |||
| __kernel void SparseToDenseScalar(__read_only image2d_t input, __global float *output, float weight, int2 inputshape, | |||
| int4 outputshape, float default_value, int stride_w, int inshapeindex1_dim) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= inputshape.x || Y >= inputshape.y) { | |||
| return; | |||
| } | |||
| // judge the last element for input | |||
| X = ((int)temp[0]) / C4NUM; | |||
| for (int i = 0; i <= index && index < C4NUM; ++i) { | |||
| int decimal = (int)temp[i] % C4NUM; | |||
| result_temp[decimal] = weight; | |||
| FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X)); | |||
| int index = 0; | |||
| if (inshapeindex1_dim == 1) { | |||
| index = ((int)index_input.x) * stride_w; | |||
| } else if (inshapeindex1_dim == 2) { | |||
| index = ((int)index_input.x) * stride_w + ((int)index_input.y); | |||
| } else if (inshapeindex1_dim == 3) { | |||
| index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + ((int)index_input.z); | |||
| } else { | |||
| index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w + | |||
| ((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w; | |||
| } | |||
| FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; | |||
| WRITE_IMAGE(output, (int2)(X, Y), result); | |||
| output[index] = weight; | |||
| } | |||
| __kernel void SparseToDenseVectorDim2Shape2(__read_only image2d_t input, __write_only image2d_t output, | |||
| __global float *weight, int2 input_shape, float default_value) { | |||
| FLT temp[8] = {default_value, default_value, default_value, default_value, | |||
| default_value, default_value, default_value, default_value}; | |||
| FLT result_temp[8] = {default_value, default_value, default_value, default_value, | |||
| default_value, default_value, default_value, default_value}; | |||
| int index = 0; // 0~4 | |||
| int weight_index = 0; | |||
| int X = 0; | |||
| FLT4 index_begin = READ_IMAGE(input, smp_zero, (int2)(0, 0)); | |||
| int Y = (int)index_begin.x; // N | |||
| temp[index] = index_begin.y; // c/4 | |||
| for (int i = 1; i < input_shape.x && index < C4NUM; ++i) { | |||
| FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(0, i)); | |||
| if ((((int)temp[index]) / C4NUM == ((int)index_input.y) / C4NUM) && (Y == (int)index_input.x)) { | |||
| index++; | |||
| if (index < C4NUM) { | |||
| temp[index] = index_input.y; | |||
| } | |||
| } else { | |||
| for (int j = 0; j <= index && index < C4NUM; ++j) { | |||
| int decimal = (int)temp[j] % C4NUM; | |||
| result_temp[decimal] = weight[weight_index++]; | |||
| X = ((int)temp[0]) / C4NUM; | |||
| } | |||
| FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; | |||
| WRITE_IMAGE(output, (int2)(X, Y), result); | |||
| index = 0; | |||
| Y = (int)index_input.x; | |||
| temp[0] = index_input.y; | |||
| temp[1] = temp[2] = temp[3] = default_value; | |||
| result_temp[0] = result_temp[1] = result_temp[2] = result_temp[3] = default_value; | |||
| } | |||
| __kernel void SparseToDenseVector(__read_only image2d_t input, __global float *output, __global float *weight_vector, | |||
| int2 inputshape, int4 outputshape, float default_value, int stride_w, | |||
| int inshapeindex1_dim) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| if (X >= inputshape.x || Y >= inputshape.y) { | |||
| return; | |||
| } | |||
| // judge the last element for input | |||
| X = ((int)temp[0]) / C4NUM; | |||
| for (int i = 0; i <= index && index < C4NUM; ++i) { | |||
| int decimal = (int)temp[i] % C4NUM; | |||
| result_temp[decimal] = weight[weight_index++]; | |||
| FLT4 index_input = READ_IMAGE(input, smp_zero, (int2)(Y, X)); | |||
| int index = 0; | |||
| if (inshapeindex1_dim == 1) { | |||
| index = ((int)index_input.x) * stride_w; | |||
| } else if (inshapeindex1_dim == 2) { | |||
| index = ((int)index_input.x) * stride_w + (int)index_input.y; | |||
| } else if (inshapeindex1_dim == 3) { | |||
| index = ((int)index_input.x) * stride_w + ((int)index_input.y) * outputshape.w * C4NUM + (int)index_input.z; | |||
| } else { | |||
| index = ((int)index_input.x) * outputshape.y * stride_w + ((int)index_input.y) * stride_w + | |||
| ((int)index_input.z) * outputshape.w * C4NUM + (int)index_input.w; | |||
| } | |||
| FLT4 result = {result_temp[0], result_temp[1], result_temp[2], result_temp[3]}; | |||
| WRITE_IMAGE(output, (int2)(X, Y), result); | |||
| output[index] = weight_vector[X]; | |||
| } | |||
| @@ -87,24 +87,17 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme | |||
| } | |||
| } | |||
| int ArithmeticSelfOpenCLKernel::Init() { | |||
| int ArithmeticSelfOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) { | |||
| MS_LOG(ERROR) << " only support dim = 4 or 2 but your dim = " << in_tensors_[0]->shape().size(); | |||
| return RET_ERROR; | |||
| } | |||
| auto param = reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_); | |||
| std::string kernel_name = "ArithmeticSelf"; | |||
| GetKernelName(&kernel_name, param); | |||
| kernel_name += "_NHWC4"; | |||
| MS_LOG(DEBUG) << "execute kernel name : " << kernel_name; | |||
| std::set<std::string> build_options; | |||
| std::string source = arithmeticself_source; | |||
| std::string program_name = "ArithmeticSelf"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| return RET_OK; | |||
| } | |||
| return mindspore::lite::RET_OK; | |||
| // Binds the kernel argument that does not change between runs: output_shape_ at index 2. | |||
| // Indices 0 and 1 are not touched here. | |||
| void ArithmeticSelfOpenCLKernel::SetConstArgs() { | |||
|   int shape_arg_index = 2; | |||
|   ocl_runtime_->SetKernelArg(kernel_, shape_arg_index, output_shape_); | |||
| } | |||
| void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| @@ -121,11 +114,8 @@ void ArithmeticSelfGetWorkGroup(const std::vector<size_t> &global, std::vector<s | |||
| local->push_back(z); | |||
| } | |||
| int ArithmeticSelfOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| void ArithmeticSelfOpenCLKernel::SetGlobalLocal() { | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int4 output_shape_ = {}; | |||
| uint32_t OH = 1, OW = 1, OC = 1; | |||
| if (output_shape.size() == 4) { | |||
| output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; | |||
| @@ -142,49 +132,48 @@ int ArithmeticSelfOpenCLKernel::Run() { | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| ArithmeticSelfGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| // Builds the OpenCL kernel for this op and caches its launch configuration. | |||
| // The kernel name is "ArithmeticSelf", extended by GetKernelName() from the op parameter | |||
| // (presumably with the operation suffix - confirm against GetKernelName), then "_NHWC4". | |||
| // Returns RET_OK unconditionally; results of LoadSource/BuildKernel are not inspected. | |||
| int ArithmeticSelfOpenCLKernel::Prepare() { | |||
|   std::string kernel_name = "ArithmeticSelf"; | |||
|   GetKernelName(&kernel_name, reinterpret_cast<ArithmeticSelfParameter *>(this->op_parameter_)); | |||
|   kernel_name += "_NHWC4"; | |||
|   MS_LOG(DEBUG) << "execute kernel name : " << kernel_name; | |||
|  | |||
|   // Register the program source, then compile the selected kernel without extra build options. | |||
|   std::string program_name = "ArithmeticSelf"; | |||
|   std::string source = arithmeticself_source; | |||
|   ocl_runtime_->LoadSource(program_name, source); | |||
|   std::set<std::string> build_options; | |||
|   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
|  | |||
|   // Work sizes first, constant arguments second - same order as before. | |||
|   SetGlobalLocal(); | |||
|   SetConstArgs(); | |||
|   return RET_OK; | |||
| } | |||
| int ArithmeticSelfOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLArithmeticSelfKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ArithmeticSelfOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new ArithmeticSelfOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: ArithmeticSelf "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| return RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLArithmeticSelfKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Abs, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Ceil, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cos, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Exp, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Floor, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Log, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_LogicalNot, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Round, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Rsqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sin, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Neg, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Sqrt, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Square, OpenCLKernelCreator<ArithmeticSelfOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -32,13 +32,17 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel { | |||
| ~ArithmeticSelfOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Run() override; | |||
| private: | |||
| void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); | |||
| cl_int4 output_shape_ = {}; | |||
| cl::Kernel kernel_; | |||
| }; | |||
| @@ -30,16 +30,7 @@ using mindspore::schema::PrimitiveType_BatchNorm; | |||
| namespace mindspore::kernel { | |||
| // Loads the batch-normalization program source and builds the "Batch_normalization_NHWC4" kernel. | |||
| // Returns RET_OK unconditionally; results of LoadSource/BuildKernel are not inspected. | |||
| int BatchNormOpenCLKernel::Init() { | |||
|   std::string program_name = "Batch_normalization"; | |||
|   std::string source = batchnorm_source; | |||
|   ocl_runtime_->LoadSource(program_name, source); | |||
|  | |||
|   std::string kernel_name = "Batch_normalization_NHWC4"; | |||
|   std::set<std::string> build_options;  // no extra build options | |||
|   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
|  | |||
|   MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
|   return RET_OK; | |||
| } | |||
| // Specification check for BatchNorm: performs no validation and always reports RET_OK. | |||
| int BatchNormOpenCLKernel::CheckSpecs() { | |||
|   return RET_OK; | |||
| } | |||
| void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| @@ -55,13 +46,17 @@ void BatchNormGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t | |||
| local->push_back(z); | |||
| } | |||
| int BatchNormOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| void BatchNormOpenCLKernel::SetConstArgs() { | |||
| int arg_cn = 6; | |||
| auto param = reinterpret_cast<BatchNormParameter *>(this->op_parameter_); | |||
| auto input0_shape = in_tensors_[0]->shape(); | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int4 input_shape_ = {input0_shape[0], input0_shape[1], input0_shape[2], UP_DIV(input0_shape[3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_); | |||
| } | |||
| void BatchNormOpenCLKernel::SetGlobalLocal() { | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| uint32_t OH = output_shape[1]; | |||
| uint32_t OW = output_shape[2]; | |||
| uint32_t OC = UP_DIV(output_shape[3], C4NUM); | |||
| @@ -70,6 +65,25 @@ int BatchNormOpenCLKernel::Run() { | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| BatchNormGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| int BatchNormOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "Batch_normalization_NHWC4"; | |||
| std::set<std::string> build_options; | |||
| std::string source = batchnorm_source; | |||
| std::string program_name = "Batch_normalization"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| return RET_OK; | |||
| } | |||
| int BatchNormOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[1]->data_c()); // scale | |||
| @@ -77,32 +91,11 @@ int BatchNormOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->epsilon_); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLBatchnormKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) BatchNormOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new BatchnormOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Batchnorm "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLBatchnormKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchNorm, OpenCLKernelCreator<BatchNormOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -31,9 +31,12 @@ class BatchNormOpenCLKernel : public OpenCLKernel { | |||
| ~BatchNormOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| @@ -1,3 +1,4 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| @@ -42,18 +43,13 @@ int CastOpenCLKernel::GetKernelName(std::string *kernel_name, CastParameter *par | |||
| return RET_OK; | |||
| } | |||
| int CastOpenCLKernel::Init() { | |||
| auto param = reinterpret_cast<CastParameter *>(this->op_parameter_); | |||
| std::string kernel_name = "Cast"; | |||
| GetKernelName(&kernel_name, param); | |||
| kernel_name += "_NHWC4"; | |||
| std::set<std::string> build_options; | |||
| std::string source = cast_source; | |||
| std::string program_name = "cast"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| int CastOpenCLKernel::CheckSpecs() { return RET_OK; } | |||
| void CastOpenCLKernel::SetConstArgs() { | |||
| auto input_shape = in_tensors_[0]->shape(); | |||
| cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)}; | |||
| int arg_cn = 2; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_); | |||
| } | |||
| void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| @@ -70,11 +66,8 @@ void CastGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *lo | |||
| local->push_back(z); | |||
| } | |||
| int CastOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| void CastOpenCLKernel::SetGlobalLocal() { | |||
| auto input_shape = in_tensors_[0]->shape(); | |||
| cl_int4 input_shape_ = {input_shape[0], input_shape[1], input_shape[2], UP_DIV(input_shape[3], C4NUM)}; | |||
| uint32_t OH = input_shape[1]; | |||
| uint32_t OW = input_shape[2]; | |||
| uint32_t OC = UP_DIV(input_shape[3], C4NUM); | |||
| @@ -83,34 +76,35 @@ int CastOpenCLKernel::Run() { | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| CastGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| int CastOpenCLKernel::Prepare() { | |||
| auto param = reinterpret_cast<CastParameter *>(this->op_parameter_); | |||
| std::string kernel_name = "Cast"; | |||
| GetKernelName(&kernel_name, param); | |||
| kernel_name += "_NHWC4"; | |||
| std::set<std::string> build_options; | |||
| std::string source = cast_source; | |||
| std::string program_name = "cast"; | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| SetConstArgs(); | |||
| SetGlobalLocal(); | |||
| return RET_OK; | |||
| } | |||
| int CastOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape_); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLCastKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) CastOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new CastOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Cast "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLCastKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLCastKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Cast, OpenCLKernelCreator<CastOpenCLKernel>); | |||
| } // namespace mindspore::kernel | |||
| @@ -31,8 +31,11 @@ class CastOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~CastOpenCLKernel() override = default; | |||
| int Prepare() override; | |||
| int Init() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Run() override; | |||
| @@ -49,7 +49,21 @@ int ConcatOpenCLKernel::RunAxis0() { | |||
| return RET_OK; | |||
| } | |||
| int ConcatOpenCLKernel::Init() { | |||
| void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 2, max_y = 8; | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||
| local->clear(); | |||
| local->push_back(x); | |||
| local->push_back(y); | |||
| local->push_back(z); | |||
| } | |||
| int ConcatOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_[0]->shape().size() != 4) { | |||
| MS_LOG(ERROR) << " only support dim = 4 "; | |||
| return RET_ERROR; | |||
| @@ -65,6 +79,32 @@ int ConcatOpenCLKernel::Init() { | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void ConcatOpenCLKernel::SetConstArgs() { | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; | |||
| int arg_cn = 2 * in_tensors_.size() + 1; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_); | |||
| } | |||
| void ConcatOpenCLKernel::SetGlobalLocal() { | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| uint32_t OH = output_shape[0] * output_shape[1]; | |||
| uint32_t OW = output_shape[2]; | |||
| uint32_t OC = output_shape[3]; | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| ConcatGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| int ConcatOpenCLKernel::Prepare() { | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| std::string kernel_name = "Concat"; | |||
| if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) { | |||
| kernel_name += std::to_string(in_tensors_.size()) + "inputaxis" + std::to_string(param->axis_); | |||
| @@ -80,38 +120,17 @@ int ConcatOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| return RET_OK; | |||
| } | |||
| void ConcatGetWorkGroup(const std::vector<size_t> &global, std::vector<size_t> *local, int max_size) { | |||
| const int max_divider = 8; | |||
| const int max_x = 2, max_y = 8; | |||
| int x = std::min(GetMaxDivisorStrategy1(global[0], max_divider), max_x); | |||
| int yz = max_size / x; | |||
| int y = std::min(std::min(GetMaxDivisorStrategy1(global[1], max_divider), yz), max_y); | |||
| int z = std::min(yz / y, static_cast<int>(UP_DIV(global[2], 2))); | |||
| local->clear(); | |||
| local->push_back(x); | |||
| local->push_back(y); | |||
| local->push_back(z); | |||
| } | |||
| int ConcatOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = reinterpret_cast<ConcatParameter *>(this->op_parameter_); | |||
| if (param->axis_ == 0) { | |||
| return RunAxis0(); | |||
| } | |||
| auto output_shape = out_tensors_[0]->shape(); | |||
| cl_int4 output_shape_ = {output_shape[0], output_shape[1], output_shape[2], UP_DIV(output_shape[3], C4NUM)}; | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| uint32_t OH = output_shape_.s[0] * output_shape_.s[1]; | |||
| uint32_t OW = output_shape_.s[2]; | |||
| uint32_t OC = output_shape_.s[3]; | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| ConcatGetWorkGroup(global, &local, max_global[0]); | |||
| if (in_tensors_.size() == 2 || in_tensors_.size() == 3 || in_tensors_.size() == 4 || in_tensors_.size() == 6) { | |||
| int arg_cn = 0; | |||
| for (int i = 0; i < in_tensors_.size(); ++i) { | |||
| @@ -123,35 +142,14 @@ int ConcatOpenCLKernel::Run() { | |||
| UP_DIV(in_tensors_[i]->shape()[3], C4NUM)}; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, temp); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_); | |||
| } else { | |||
| MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLConcatKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ConcatOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new ConcatOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: Concat "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLConcatKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLConcatKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Concat, OpenCLKernelCreator<ConcatOpenCLKernel>); | |||
| } // namespace mindspore::kernel | |||
| @@ -31,8 +31,11 @@ class ConcatOpenCLKernel : public OpenCLKernel { | |||
| ~ConcatOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Run() override; | |||
| private: | |||
| @@ -62,7 +62,11 @@ int FillOpenCLKernel::RunShape() { | |||
| return RET_OK; | |||
| } | |||
| int FillOpenCLKernel::Init() { | |||
| void FillOpenCLKernel::SetConstArgs() {} | |||
| void FillOpenCLKernel::SetGlobalLocal() {} | |||
| int FillOpenCLKernel::CheckSpecs() { | |||
| auto param = this->op_parameter_; | |||
| if (out_tensors_[0]->shape().size() > 4) { | |||
| @@ -76,6 +80,8 @@ int FillOpenCLKernel::Init() { | |||
| return RET_OK; | |||
| } | |||
| int FillOpenCLKernel::Prepare() { return RET_OK; } | |||
| int FillOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = this->op_parameter_; | |||
| @@ -88,28 +94,9 @@ int FillOpenCLKernel::Run() { | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *FillOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) FillOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new FillOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: fill "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, FillOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, FillOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, FillOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, FillOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Fill, OpenCLKernelCreator<FillOpenCLKernel>); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Shape, OpenCLKernelCreator<FillOpenCLKernel>); | |||
| } // namespace mindspore::kernel | |||
| @@ -32,7 +32,11 @@ class FillOpenCLKernel : public OpenCLKernel { | |||
| ~FillOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Run() override; | |||
| @@ -90,24 +90,64 @@ int SparseToDenseOpenCLKernel::InitWeights() { | |||
| return RET_OK; | |||
| } | |||
| int SparseToDenseOpenCLKernel::Init() { | |||
| int SparseToDenseOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_[0]->shape().size() > 4 || out_tensors_[0]->shape().size() > 4) { | |||
| MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size() << "outdim" | |||
| << out_tensors_[0]->shape().size(); | |||
| return RET_ERROR; | |||
| } | |||
| if (out_tensors_[0]->shape().size() > 2 || in_tensors_.size() < 3) { | |||
| MS_LOG(ERROR) << " only support dim <= 2 and in_tensors_.size >= 3"; | |||
| return RET_ERROR; | |||
| } | |||
| if ((in_tensors_[0]->shape()[1] > 3) && (input_dim_ == 2)) { | |||
| MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: " | |||
| << in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_; | |||
| if (input_dim_ == 2) { | |||
| if ((in_tensors_[0]->shape()[1] > 4)) { | |||
| MS_LOG(ERROR) << "in_tensors_indices shape[1] must be 1 2 or 3 && input_dim_=2 ,but your shapes is: " | |||
| << in_tensors_[0]->shape()[1] << "your input_dim_ is: " << input_dim_; | |||
| return ERROR; | |||
| } | |||
| } | |||
| if (inshapeindex1_dim > 4) { | |||
| MS_LOG(ERROR) << "Unsupported input_indices[1] > 4: "; | |||
| return ERROR; | |||
| } | |||
| input_dim_ = in_tensors_[0]->shape().size(); | |||
| weight_scalar_ = in_tensors_[2]->IsScalar(); | |||
| std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "ScalarDim" : "VectorDim") + | |||
| std::to_string(in_tensors_[0]->shape()[1] == 1 ? 1 : input_dim_); | |||
| if (input_dim_ == 2 && in_tensors_[0]->shape()[1] != 1) { | |||
| kernel_name += "Shape" + std::to_string(in_tensors_[0]->shape()[1]); | |||
| auto param = reinterpret_cast<SparseToDenseParameter *>(op_parameter_); | |||
| if (param->validate_indices_) { | |||
| MS_LOG(ERROR) << "Unspported unordered for in_tensors_indices"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| void SparseToDenseOpenCLKernel::SetConstArgs() { | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| Image2DInfo img_info(out_tensors_[0]); | |||
| size_t dtype = enable_fp16_ ? sizeof(cl_half) : sizeof(cl_float); | |||
| stride_w = img_info.RowPitch() / dtype; | |||
| cl_int2 input_shape = {n_ * h_, w_ * UP_DIV(c_, C4NUM)}; | |||
| auto out_shape_temp = out_tensors_[0]->shape(); | |||
| cl_int4 out_shape = {out_n_, out_h_, out_w_, UP_DIV(out_c_, C4NUM)}; | |||
| int arg_cn = 3; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, stride_w); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, inshapeindex1_dim); | |||
| } | |||
| void SparseToDenseOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> local = {1, 1}; | |||
| size_t OH = n_ * h_; | |||
| size_t OW = w_ * UP_DIV(c_, C4NUM); | |||
| std::vector<size_t> global = {OH, OW}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| int SparseToDenseOpenCLKernel::Prepare() { | |||
| input_dim_ = in_tensors_[0]->shape().size(); | |||
| inshapeindex1_dim = in_tensors_[0]->shape()[1]; | |||
| weight_scalar_ = in_tensors_[2]->IsScalar(); | |||
| std::string kernel_name = "SparseToDense" + std::string(weight_scalar_ ? "Scalar" : "Vector"); | |||
| std::set<std::string> build_options; | |||
| std::string source = sparse_to_dense_source; | |||
| std::string program_name = "SparseToDense"; | |||
| @@ -122,8 +162,10 @@ int SparseToDenseOpenCLKernel::Init() { | |||
| default_ = *reinterpret_cast<float *>(input_tensor3->data_c()); | |||
| } | |||
| } | |||
| InitWeights(); | |||
| InferShapeTo4D(); | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -131,73 +173,47 @@ int SparseToDenseOpenCLKernel::Init() { | |||
| int SparseToDenseOpenCLKernel::InferShapeTo4D() { | |||
| if (in_tensors_[0]->shape().size() <= 4) { | |||
| if (in_tensors_[0]->shape().size() == 1) { | |||
| N_ = in_tensors_[0]->shape()[0]; | |||
| n_ = in_tensors_[0]->shape()[0]; | |||
| } else if (in_tensors_[0]->shape().size() == 2) { | |||
| N_ = in_tensors_[0]->shape()[0]; | |||
| C_ = in_tensors_[0]->shape()[1]; | |||
| } else if (in_tensors_[0]->shape().size() == 3) { | |||
| N_ = in_tensors_[0]->shape()[0]; | |||
| W_ = in_tensors_[0]->shape()[1]; | |||
| C_ = in_tensors_[0]->shape()[2]; | |||
| n_ = in_tensors_[0]->shape()[0]; | |||
| c_ = in_tensors_[0]->shape()[1]; | |||
| } | |||
| } | |||
| if (out_tensors_[0]->shape().size() <= 4) { | |||
| if (out_tensors_[0]->shape().size() == 1) { | |||
| out_n_ = out_tensors_[0]->shape()[0]; | |||
| } else if (out_tensors_[0]->shape().size() == 2) { | |||
| out_n_ = out_tensors_[0]->shape()[0]; | |||
| out_c_ = out_tensors_[0]->shape()[1]; | |||
| } else if (out_tensors_[0]->shape().size() == 3) { | |||
| out_n_ = out_tensors_[0]->shape()[0]; | |||
| out_w_ = out_tensors_[0]->shape()[1]; | |||
| out_c_ = out_tensors_[0]->shape()[2]; | |||
| } else { | |||
| N_ = in_tensors_[0]->shape()[0]; | |||
| H_ = in_tensors_[0]->shape()[1]; | |||
| W_ = in_tensors_[0]->shape()[2]; | |||
| C_ = in_tensors_[0]->shape()[3]; | |||
| out_n_ = out_tensors_[0]->shape()[0]; | |||
| out_h_ = out_tensors_[0]->shape()[1]; | |||
| out_w_ = out_tensors_[0]->shape()[2]; | |||
| out_c_ = out_tensors_[0]->shape()[3]; | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Unsupported inputdim: " << in_tensors_[0]->shape().size(); | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int SparseToDenseOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| InferShapeTo4D(); | |||
| cl_int2 input_shape = {static_cast<cl_int>(N_ * H_), static_cast<cl_int>(W_ * UP_DIV(C_, C4NUM))}; | |||
| InitOutputToDefault(); | |||
| std::vector<size_t> local = {1, 1}; | |||
| std::vector<size_t> global = {1, 1}; | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||
| if (weight_scalar_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | |||
| if (!weight_scalar_) { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_, lite::opencl::MemType::BUF); | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_vector_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); | |||
| } | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, default_); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *SparseToDenseOpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| if (inputs.empty()) { | |||
| MS_LOG(ERROR) << "Input data size must be greater than 0, but your size is " << inputs.size(); | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto *kernel = new (std::nothrow) SparseToDenseOpenCLKernel(opParameter, inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << " new HswishOpenCLKernel failed "; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << " Init kernel failed, name: hswish "; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, SparseToDenseOpenCLKernelCreator); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>); | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SparseToDense, OpenCLKernelCreator<SparseToDenseOpenCLKernel>); | |||
| } // namespace mindspore::kernel | |||
| @@ -31,9 +31,12 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel { | |||
| ~SparseToDenseOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Prepare() override; | |||
| int Run() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int CheckSpecs() override; | |||
| private: | |||
| int InferShapeTo4D(); | |||
| @@ -47,12 +50,19 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel { | |||
| float weight_scalar_{0.f}; | |||
| void *weight_vector_{nullptr}; | |||
| int input_dim_{1}; | |||
| int inshapeindex1_dim{1}; | |||
| cl_int stride_w{1}; | |||
| std::vector<int32_t> output_shape_; | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| size_t W_{1}; | |||
| size_t C_{1}; | |||
| cl_int n_{1}; | |||
| cl_int h_{1}; | |||
| cl_int w_{1}; | |||
| cl_int c_{1}; | |||
| cl_int out_n_{1}; | |||
| cl_int out_h_{1}; | |||
| cl_int out_w_{1}; | |||
| cl_int out_c_{1}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -112,7 +112,7 @@ struct Image2DInfo { | |||
| size_t RowPitch() const { | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment(); | |||
| size_t row_pitch = (width + alignment - 1) / alignment * alignment * FLT4_size; | |||
| size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size; | |||
| return row_pitch; | |||
| } | |||
| @@ -98,8 +98,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfOpenCLFp16) { | |||
| } | |||
| arithmeticself_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| for (auto &input_tensor_ : inputs) { | |||
| input_tensor_->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel}; | |||
| @@ -186,8 +186,8 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) { | |||
| } | |||
| arithmeticself_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| for (auto &input_tensor_ : inputs) { | |||
| input_tensor_->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel}; | |||
| @@ -206,7 +206,6 @@ TEST_F(TestArithmeticSelfOpenCLCI, ArithmeticSelfRound) { | |||
| } | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| std::cout << sizeof(input_data1) / sizeof(float) << std::endl; | |||
| memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1)); | |||
| std::cout << "==================output data================" << std::endl; | |||
| @@ -280,8 +279,8 @@ TEST_F(TestArithmeticSelfOpenCLfp16, ArithmeticSelfdim2Fp16) { | |||
| } | |||
| arithmeticself_kernel->Init(); | |||
| // to do allocate memory for inputs and outputs | |||
| for (auto &input_tensor : inputs) { | |||
| input_tensor->MallocData(allocator); | |||
| for (auto &input_tensor_ : inputs) { | |||
| input_tensor_->MallocData(allocator); | |||
| } | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{arithmeticself_kernel}; | |||
| @@ -28,6 +28,78 @@ class TestSparseToDenseOpenCLCI : public mindspore::CommonTest { | |||
| TestSparseToDenseOpenCLCI() {} | |||
| }; | |||
| TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Shape3Vector) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| runtime->Init(); | |||
| auto allocator = runtime->GetAllocator(); | |||
| MS_LOG(INFO) << " init tensors "; | |||
| std::vector<int> input_shape1 = {6, 3}; | |||
| std::vector<int> input_shape2 = {3}; | |||
| std::vector<int> input_shape3 = {6}; | |||
| std::vector<int> input_shape4 = {1}; | |||
| float input_data1[] = {0, 0, 0, 0, 0, 1, 0, 0, 2, 0, 0, 3, 0, 0, 4, 0, 0, 5, 0, 0, 6}; | |||
| float input_data2[] = {6, 1, 10}; | |||
| float input_data3[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; | |||
| float input_data4[] = {0.0}; | |||
| float correctOutput[] = {1, 2, 3, 4, 5, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
| 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; | |||
| auto data_type = kNumberTypeFloat32; | |||
| std::vector<int> output_shape = {6, 1, 10}; | |||
| auto in_tensor1 = Tensor(data_type, input_shape1, Format_NHWC, lite::Tensor::VAR); | |||
| auto in_tensor2 = Tensor(data_type, input_shape2, Format_NHWC, lite::Tensor::CONST_TENSOR); | |||
| auto in_tensor3 = Tensor(data_type, input_shape3, Format_NHWC, lite::Tensor::CONST_TENSOR); | |||
| auto in_tensor4 = Tensor(data_type, input_shape4, Format_NHWC, lite::Tensor::CONST_SCALAR); | |||
| auto output_tensor = Tensor(data_type, output_shape, Format_NHWC, lite::Tensor::VAR); | |||
| // allocate memory for weights | |||
| in_tensor2.MallocData(); | |||
| in_tensor3.MallocData(); | |||
| in_tensor4.MallocData(); | |||
| std::vector<lite::Tensor *> inputs{&in_tensor1, &in_tensor2, &in_tensor3, &in_tensor4}; | |||
| std::vector<lite::Tensor *> outputs{&output_tensor}; | |||
| // initialize weights | |||
| memcpy(inputs[1]->data_c(), input_data2, sizeof(input_data2)); | |||
| memcpy(inputs[2]->data_c(), input_data3, sizeof(input_data3)); | |||
| memcpy(inputs[3]->data_c(), input_data4, sizeof(input_data4)); | |||
| MS_LOG(INFO) << " initialize tensors "; | |||
| auto param = reinterpret_cast<SparseToDenseParameter *>(malloc(sizeof(SparseToDenseParameter))); | |||
| if (param == nullptr) { | |||
| MS_LOG(INFO) << " new ActivationParameter failed "; | |||
| return; | |||
| } | |||
| auto *sparse_to_dense_kernel = | |||
| new (std::nothrow) kernel::SparseToDenseOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs); | |||
| if (sparse_to_dense_kernel == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::SparseToDenseOpenCLKernel failed "; | |||
| delete param; | |||
| return; | |||
| } | |||
| sparse_to_dense_kernel->Init(); | |||
| MS_LOG(INFO) << " initialize sub_graph "; | |||
| std::vector<kernel::LiteKernel *> kernels{sparse_to_dense_kernel}; | |||
| auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&in_tensor1}, outputs, kernels, kernels, kernels); | |||
| if (sub_graph == nullptr) { | |||
| MS_LOG(INFO) << " new kernel::SubGraphOpenCLKernel failed "; | |||
| delete param; | |||
| delete sparse_to_dense_kernel; | |||
| return; | |||
| } | |||
| // to do allocate memory for inputs | |||
| in_tensor1.MallocData(allocator); | |||
| sub_graph->Init(); | |||
| MS_LOG(INFO) << " initialize input data "; | |||
| memcpy(inputs[0]->data_c(), input_data1, sizeof(input_data1)); | |||
| std::cout << "==================output data================" << std::endl; | |||
| sub_graph->Run(); | |||
| auto *output_data_gpu = reinterpret_cast<float *>(output_tensor.data_c()); | |||
| ASSERT_EQ(0, CompareOutputData(output_data_gpu, correctOutput, output_tensor.ElementsNum(), 0.0001)); | |||
| delete sub_graph; | |||
| } | |||
| TEST_F(TestSparseToDenseOpenCLCI, Fp32Dim2Scalar) { | |||
| MS_LOG(INFO) << " begin test "; | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||