From: @wangdongxu6 Reviewed-by: @ddwsky Signed-off-by: @ddwskytags/v1.1.0
| @@ -42,6 +42,7 @@ if (SUPPORT_GPU) | |||||
| set(LITE_SRC | set(LITE_SRC | ||||
| ${LITE_SRC} | ${LITE_SRC} | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc | ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_fusion.cc | |||||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc | ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc | ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc | ||||
| ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc | ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc | ||||
| @@ -2,8 +2,8 @@ | |||||
| #define divide_no_check(a, b) (a / b) | #define divide_no_check(a, b) (a / b) | ||||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | ||||
| __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -17,8 +17,8 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -32,8 +32,8 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -47,8 +47,8 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -62,8 +62,8 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementLogicalAnd(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -77,8 +77,8 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementLogicalOr(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -92,7 +92,7 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __kernel void ElementMaximum(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -107,7 +107,7 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __kernel void ElementMinimum(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -122,9 +122,8 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementFloorDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -138,9 +137,8 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementFloorMod(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -154,9 +152,9 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementSquaredDifference(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -170,8 +168,8 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -186,9 +184,8 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementNotEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -202,8 +199,8 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementLess(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -217,9 +214,8 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementLessEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -233,8 +229,8 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void ElementGreater(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -248,9 +244,9 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void ElementGreaterEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int2 output_shape, float act_min, | |||||
| float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -264,9 +260,9 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| __kernel void BroadcastNHWC4Add(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| int X = get_global_id(0); // C4 | int X = get_global_id(0); // C4 | ||||
| int Y = get_global_id(1); // W | int Y = get_global_id(1); // W | ||||
| int Z = get_global_id(2); // H | int Z = get_global_id(2); // H | ||||
| @@ -293,9 +289,9 @@ __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only i | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | ||||
| } | } | ||||
| __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| __kernel void BroadcastNHWC4Sub(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| int X = get_global_id(0); // C4 | int X = get_global_id(0); // C4 | ||||
| int Y = get_global_id(1); // W | int Y = get_global_id(1); // W | ||||
| int Z = get_global_id(2); // H | int Z = get_global_id(2); // H | ||||
| @@ -322,9 +318,9 @@ __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only i | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | ||||
| } | } | ||||
| __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| __kernel void BroadcastNHWC4Mul(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| int X = get_global_id(0); // C4 | int X = get_global_id(0); // C4 | ||||
| int Y = get_global_id(1); // W | int Y = get_global_id(1); // W | ||||
| int Z = get_global_id(2); // H | int Z = get_global_id(2); // H | ||||
| @@ -351,9 +347,9 @@ __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only i | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | ||||
| } | } | ||||
| __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| __kernel void BroadcastNHWC4Div(__read_only image2d_t input_a, __read_only image2d_t input_b, | |||||
| __write_only image2d_t output, const int4 a_shape, const int4 b_shape, | |||||
| const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { | |||||
| int X = get_global_id(0); // C4 | int X = get_global_id(0); // C4 | ||||
| int Y = get_global_id(1); // W | int Y = get_global_id(1); // W | ||||
| int Z = get_global_id(2); // H | int Z = get_global_id(2); // H | ||||
| @@ -380,8 +376,8 @@ __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only i | |||||
| WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); | ||||
| } | } | ||||
| __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastLogicalAnd(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -394,8 +390,8 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastLogicalOr(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -408,7 +404,7 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| __kernel void BroadcastMaximum(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | const int2 output_shape, float act_min, float act_max) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -422,7 +418,7 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| __kernel void BroadcastMinimum(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | const int2 output_shape, float act_min, float act_max) { | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -436,8 +432,8 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastFloorDiv(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -450,8 +446,8 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastFloorMod(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -464,8 +460,8 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastSquaredDifference(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -478,8 +474,8 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -492,8 +488,8 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastNotEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -506,8 +502,8 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastLess(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -520,8 +516,8 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastLessEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -534,8 +530,8 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastGreater(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -548,8 +544,8 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri | |||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| __kernel void BroadcastGreaterEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, | |||||
| const int2 output_shape, float act_min, float act_max) { | |||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| if (X >= output_shape.x || Y >= output_shape.y) { | if (X >= output_shape.x || Y >= output_shape.y) { | ||||
| @@ -561,55 +557,3 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, | |||||
| result = clamp(result, (FLT)(act_min), (FLT)(act_max)); | result = clamp(result, (FLT)(act_min), (FLT)(act_max)); | ||||
| WRITE_IMAGE(output, (int2)(X, Y), result); | WRITE_IMAGE(output, (int2)(X, Y), result); | ||||
| } | } | ||||
| __kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, | |||||
| const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] + input_b[idx]; | |||||
| } | |||||
| __kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output, | |||||
| const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] - input_b[idx]; | |||||
| } | |||||
| __kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output, | |||||
| const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] * input_b[idx]; | |||||
| } | |||||
| __kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output, | |||||
| const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] * input_b[idx]; | |||||
| } | |||||
| __kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] + (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] - (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = input_a[idx] * (FLT)b; | |||||
| } | |||||
| __kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { | |||||
| int idx = get_global_id(0); | |||||
| if (idx >= n) return; | |||||
| output[idx] = divide_no_check(input_a[idx], (FLT)b); | |||||
| } | |||||
| @@ -6,51 +6,59 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | |||||
| #define MAX_IMAGE2D_SIZE 65535 | #define MAX_IMAGE2D_SIZE 65535 | ||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | ||||
| #define ActType_Relu 1 | |||||
| #define ActType_Relu6 3 | |||||
| #define DEFINE_ARGS \ | |||||
| const int N = input_shape.x; \ | |||||
| const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ | |||||
| const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ | |||||
| const int KH = kernel_stride.x, KW = kernel_stride.y; \ | |||||
| const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ | |||||
| const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ | |||||
| const int dilationH = dilation.x, dilationW = dilation.y; \ | |||||
| \ | |||||
| const int n_oh = get_global_id(0); \ | |||||
| const int ow = get_global_id(1) * BlockW; \ | |||||
| const int co_slice = get_global_id(2) * BlockC; \ | |||||
| const int OH_SLICES = UP_DIV(OH, BlockH); \ | |||||
| const int n = n_oh / OH_SLICES; \ | |||||
| const int oh = (n_oh % OH_SLICES) * BlockH; \ | |||||
| if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ | |||||
| return; \ | |||||
| #define DEFINE_ARGS \ | |||||
| int N = input_shape.x; \ | |||||
| int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ | |||||
| int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ | |||||
| int KH = kernel_stride.x, KW = kernel_stride.y; \ | |||||
| int strideH = kernel_stride.z, strideW = kernel_stride.w; \ | |||||
| int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ | |||||
| int dilationH = dilation.x, dilationW = dilation.y; \ | |||||
| \ | |||||
| int n_oh = get_global_id(0); \ | |||||
| int ow = get_global_id(1) * BlockW; \ | |||||
| int co_slice = get_global_id(2) * BlockC; \ | |||||
| int OH_SLICES = UP_DIV(OH, BlockH); \ | |||||
| int n = n_oh / OH_SLICES; \ | |||||
| int oh = (n_oh % OH_SLICES) * BlockH; \ | |||||
| if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ | |||||
| return; \ | |||||
| } | } | ||||
// In-place hyperbolic tangent on a FLT4: data = (e^x - e^-x) / (e^x + e^-x).
// The caller must declare FLT4 temporaries `exp0` and `exp1` in the enclosing
// scope before invoking this macro (see the activation branches below).
// NOTE(review): exp(data) can overflow for large |data|, especially in fp16
// builds where FLT4 is half4 — confirm expected input range, or consider the
// built-in tanh() if precision/range issues show up.
#define DO_TANH(data) \
  exp0 = exp(data); \
  exp1 = exp(-data); \
  data = (exp0 - exp1) / (exp0 + exp1);
// In-place LeakyReLU on a FLT4, one lane at a time: negative lanes are scaled
// by `alpha`, non-negative lanes pass through unchanged. `alpha` is taken from
// the caller's scope (the kernels below receive it as a float parameter).
#define DO_LEAKY_RELU(data) \
  if (data.x < 0) data.x *= alpha; \
  if (data.y < 0) data.y *= alpha; \
  if (data.z < 0) data.z *= alpha; \
  if (data.w < 0) data.w *= alpha;
| __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | ||||
| __global FLT4 *bias, const int4 input_shape, const int4 output_shape, | |||||
| const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { | |||||
| const int BlockH = 1; | |||||
| const int BlockW = 1; | |||||
| const int BlockC = 1; | |||||
| __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, | |||||
| int2 dilation, int act_type, float alpha) { | |||||
| int BlockH = 1; | |||||
| int BlockW = 1; | |||||
| int BlockC = 1; | |||||
| DEFINE_ARGS; | DEFINE_ARGS; | ||||
| const int oh0 = oh + 0; | |||||
| const int n_oh0 = n * OH + oh0; | |||||
| const int ow0 = ow + 0; | |||||
| const int co_slice0 = co_slice + 0; | |||||
| int oh0 = oh + 0; | |||||
| int n_oh0 = n * OH + oh0; | |||||
| int ow0 = ow + 0; | |||||
| int co_slice0 = co_slice + 0; | |||||
| FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | ||||
| for (int kh = 0; kh < KH; ++kh) { | for (int kh = 0; kh < KH; ++kh) { | ||||
| const int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| for (int kw = 0; kw < KW; ++kw) { | for (int kw = 0; kw < KW; ++kw) { | ||||
| const int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int x_idx0 = iw0 * CI_SLICES; | int x_idx0 = iw0 * CI_SLICES; | ||||
| for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | ||||
| @@ -71,10 +79,17 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h0_w0_c0 += bias[co_slice0]; | out_h0_w0_c0 += bias[co_slice0]; | ||||
| } | } | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0, exp1; | |||||
| DO_TANH(out_h0_w0_c0); | |||||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||||
| DO_LEAKY_RELU(out_h0_w0_c0); | |||||
| } else if (act_type == ActivationType_SIGMOID) { | |||||
| out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); | |||||
| } | } | ||||
| if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | ||||
| @@ -85,19 +100,19 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t | |||||
| } | } | ||||
| __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | ||||
| __global FLT4 *bias, const int4 input_shape, const int4 output_shape, | |||||
| const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { | |||||
| const int BlockH = 2; | |||||
| const int BlockW = 1; | |||||
| const int BlockC = 1; | |||||
| __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, | |||||
| int2 dilation, int act_type, float alpha) { | |||||
| int BlockH = 2; | |||||
| int BlockW = 1; | |||||
| int BlockC = 1; | |||||
| DEFINE_ARGS; | DEFINE_ARGS; | ||||
| const int oh0 = oh + 0; | |||||
| const int oh1 = oh + 1; | |||||
| const int n_oh0 = n * OH + oh0; | |||||
| const int n_oh1 = n * OH + oh1; | |||||
| const int ow0 = ow + 0; | |||||
| const int co_slice0 = co_slice + 0; | |||||
| int oh0 = oh + 0; | |||||
| int oh1 = oh + 1; | |||||
| int n_oh0 = n * OH + oh0; | |||||
| int n_oh1 = n * OH + oh1; | |||||
| int ow0 = ow + 0; | |||||
| int co_slice0 = co_slice + 0; | |||||
| FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| @@ -105,15 +120,15 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t | |||||
| __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | ||||
| for (int kh = 0; kh < KH; ++kh) { | for (int kh = 0; kh < KH; ++kh) { | ||||
| const int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| // no need to check oh1, finally write out will check (oh1 < OH) | // no need to check oh1, finally write out will check (oh1 < OH) | ||||
| const int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| // check ih0 and ih1 | // check ih0 and ih1 | ||||
| const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| for (int kw = 0; kw < KW; ++kw) { | for (int kw = 0; kw < KW; ++kw) { | ||||
| const int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int x_idx0 = iw0 * CI_SLICES; | int x_idx0 = iw0 * CI_SLICES; | ||||
| for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | ||||
| @@ -140,12 +155,22 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h1_w0_c0 += bias[co_slice0]; | out_h1_w0_c0 += bias[co_slice0]; | ||||
| } | } | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | ||||
| out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0, exp1; | |||||
| DO_TANH(out_h0_w0_c0); | |||||
| DO_TANH(out_h1_w0_c0); | |||||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||||
| DO_LEAKY_RELU(out_h0_w0_c0); | |||||
| DO_LEAKY_RELU(out_h1_w0_c0); | |||||
| } else if (act_type == ActivationType_SIGMOID) { | |||||
| out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); | |||||
| out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); | |||||
| } | } | ||||
| if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | ||||
| @@ -162,20 +187,20 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t | |||||
| } | } | ||||
| __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | ||||
| __global FLT4 *bias, const int4 input_shape, const int4 output_shape, | |||||
| const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { | |||||
| const int BlockH = 2; | |||||
| const int BlockW = 1; | |||||
| const int BlockC = 2; | |||||
| __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, | |||||
| int2 dilation, int act_type, float alpha) { | |||||
| int BlockH = 2; | |||||
| int BlockW = 1; | |||||
| int BlockC = 2; | |||||
| DEFINE_ARGS; | DEFINE_ARGS; | ||||
| const int oh0 = oh + 0; | |||||
| const int oh1 = oh + 1; | |||||
| const int n_oh0 = n * OH + oh0; | |||||
| const int n_oh1 = n * OH + oh1; | |||||
| const int ow0 = ow + 0; | |||||
| const int co_slice0 = co_slice + 0; | |||||
| const int co_slice1 = co_slice + 1; | |||||
| int oh0 = oh + 0; | |||||
| int oh1 = oh + 1; | |||||
| int n_oh0 = n * OH + oh0; | |||||
| int n_oh1 = n * OH + oh1; | |||||
| int ow0 = ow + 0; | |||||
| int co_slice0 = co_slice + 0; | |||||
| int co_slice1 = co_slice + 1; | |||||
| FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| @@ -185,15 +210,15 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t | |||||
| __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | ||||
| for (int kh = 0; kh < KH; ++kh) { | for (int kh = 0; kh < KH; ++kh) { | ||||
| const int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| // no need to check oh1, finally write out will check (oh1 < OH) | // no need to check oh1, finally write out will check (oh1 < OH) | ||||
| const int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| // check ih0 and ih1 | // check ih0 and ih1 | ||||
| const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| for (int kw = 0; kw < KW; ++kw) { | for (int kw = 0; kw < KW; ++kw) { | ||||
| const int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int x_idx0 = iw0 * CI_SLICES; | int x_idx0 = iw0 * CI_SLICES; | ||||
| for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { | ||||
| @@ -231,16 +256,32 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h1_w0_c1 += bias[co_slice1]; | out_h1_w0_c1 += bias[co_slice1]; | ||||
| } | } | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | ||||
| out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | ||||
| out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); | out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); | ||||
| out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); | out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0, exp1; | |||||
| DO_TANH(out_h0_w0_c0); | |||||
| DO_TANH(out_h1_w0_c0); | |||||
| DO_TANH(out_h0_w0_c1); | |||||
| DO_TANH(out_h1_w0_c1); | |||||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||||
| DO_LEAKY_RELU(out_h0_w0_c0); | |||||
| DO_LEAKY_RELU(out_h1_w0_c0); | |||||
| DO_LEAKY_RELU(out_h0_w0_c1); | |||||
| DO_LEAKY_RELU(out_h1_w0_c1); | |||||
| } else if (act_type == ActivationType_SIGMOID) { | |||||
| out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); | |||||
| out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); | |||||
| out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); | |||||
| out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); | |||||
| } | } | ||||
| if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | ||||
| @@ -269,21 +310,21 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t | |||||
| } | } | ||||
| __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, | ||||
| __global FLT4 *bias, const int4 input_shape, const int4 output_shape, | |||||
| const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { | |||||
| const int BlockH = 2; | |||||
| const int BlockW = 2; | |||||
| const int BlockC = 2; | |||||
| __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, | |||||
| int2 dilation, int act_type, float alpha) { | |||||
| int BlockH = 2; | |||||
| int BlockW = 2; | |||||
| int BlockC = 2; | |||||
| DEFINE_ARGS; | DEFINE_ARGS; | ||||
| const int oh0 = oh + 0; | |||||
| const int oh1 = oh + 1; | |||||
| const int n_oh0 = n * OH + oh0; | |||||
| const int n_oh1 = n * OH + oh1; | |||||
| const int ow0 = ow + 0; | |||||
| const int ow1 = ow + 1; | |||||
| const int co_slice0 = co_slice + 0; | |||||
| const int co_slice1 = co_slice + 1; | |||||
| int oh0 = oh + 0; | |||||
| int oh1 = oh + 1; | |||||
| int n_oh0 = n * OH + oh0; | |||||
| int n_oh1 = n * OH + oh1; | |||||
| int ow0 = ow + 0; | |||||
| int ow1 = ow + 1; | |||||
| int co_slice0 = co_slice + 0; | |||||
| int co_slice1 = co_slice + 1; | |||||
| FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); | ||||
| @@ -297,15 +338,15 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t | |||||
| __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; | ||||
| for (int kh = 0; kh < KH; ++kh) { | for (int kh = 0; kh < KH; ++kh) { | ||||
| const int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| int ih0 = kh * dilationH + oh0 * strideH - padTop; | |||||
| // no need to check oh1, finally write out will check (oh1 < OH) | // no need to check oh1, finally write out will check (oh1 < OH) | ||||
| const int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| int ih1 = kh * dilationH + oh1 * strideH - padTop; | |||||
| // check ih0 and ih1 | // check ih0 and ih1 | ||||
| const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; | |||||
| int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; | |||||
| for (int kw = 0; kw < KW; ++kw) { | for (int kw = 0; kw < KW; ++kw) { | ||||
| const int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int iw0 = kw * dilationW + ow0 * strideW - padLeft; | |||||
| int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; | int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; | ||||
| int x_idx0 = iw0 * CI_SLICES; | int x_idx0 = iw0 * CI_SLICES; | ||||
| int x_idx1 = iw1 * CI_SLICES; | int x_idx1 = iw1 * CI_SLICES; | ||||
| @@ -368,7 +409,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h1_w1_c1 += bias[co_slice1]; | out_h1_w1_c1 += bias[co_slice1]; | ||||
| } | } | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); | ||||
| out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); | out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); | ||||
| out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); | ||||
| @@ -377,7 +418,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); | out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); | ||||
| out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); | out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); | ||||
| out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); | out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| @@ -386,6 +427,34 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t | |||||
| out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0, exp1; | |||||
| DO_TANH(out_h0_w0_c0); | |||||
| DO_TANH(out_h0_w1_c0); | |||||
| DO_TANH(out_h1_w0_c0); | |||||
| DO_TANH(out_h1_w1_c0); | |||||
| DO_TANH(out_h0_w0_c1); | |||||
| DO_TANH(out_h0_w1_c1); | |||||
| DO_TANH(out_h1_w0_c1); | |||||
| DO_TANH(out_h1_w1_c1); | |||||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||||
| DO_LEAKY_RELU(out_h0_w0_c0); | |||||
| DO_LEAKY_RELU(out_h0_w1_c0); | |||||
| DO_LEAKY_RELU(out_h1_w0_c0); | |||||
| DO_LEAKY_RELU(out_h1_w1_c0); | |||||
| DO_LEAKY_RELU(out_h0_w0_c1); | |||||
| DO_LEAKY_RELU(out_h0_w1_c1); | |||||
| DO_LEAKY_RELU(out_h1_w0_c1); | |||||
| DO_LEAKY_RELU(out_h1_w1_c1); | |||||
| } else if (act_type == ActivationType_SIGMOID) { | |||||
| out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); | |||||
| out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0)); | |||||
| out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); | |||||
| out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0)); | |||||
| out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); | |||||
| out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1)); | |||||
| out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); | |||||
| out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); | |||||
| } | } | ||||
| if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { | ||||
| @@ -2,7 +2,7 @@ | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, | ||||
| __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, | __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, | ||||
| int2 stride, int2 padding, int4 src_size, int4 dst_size) { | |||||
| int2 stride, int2 padding, int4 src_size, int4 dst_size, int act_type) { | |||||
| int dst_h = get_global_id(0); | int dst_h = get_global_id(0); | ||||
| int rem_h = dst_h % stride.x; | int rem_h = dst_h % stride.x; | ||||
| int ceil_h = dst_h / stride.x; | int ceil_h = dst_h / stride.x; | ||||
| @@ -70,6 +70,19 @@ __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_onl | |||||
| r1 += bias_val; | r1 += bias_val; | ||||
| r2 += bias_val; | r2 += bias_val; | ||||
| r3 += bias_val; | r3 += bias_val; | ||||
| if (act_type == ActivationType_RELU) { | |||||
| r0 = max(r0, (FLT4)(0.0f)); | |||||
| r1 = max(r1, (FLT4)(0.0f)); | |||||
| r2 = max(r2, (FLT4)(0.0f)); | |||||
| r3 = max(r3, (FLT4)(0.0f)); | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| r0 = clamp(r0, (FLT4)(0.0f), (FLT4)(6.0f)); | |||||
| r1 = clamp(r1, (FLT4)(0.0f), (FLT4)(6.0f)); | |||||
| r2 = clamp(r2, (FLT4)(0.0f), (FLT4)(6.0f)); | |||||
| r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f)); | |||||
| } | |||||
| WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); | WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); | ||||
| if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { | if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { | ||||
| WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); | WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); | ||||
| @@ -2,9 +2,9 @@ | |||||
| #define C4NUM 4 | #define C4NUM 4 | ||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | ||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||||
| __read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min, | |||||
| float act_max) { | |||||
| __kernel void FullConnection(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | |||||
| __read_only image2d_t bias, int4 in_shape, int2 out_shape, int act_type) { | |||||
| int gidx = get_global_id(0); // CO4 | int gidx = get_global_id(0); // CO4 | ||||
| int gidz = get_global_id(2); // N | int gidz = get_global_id(2); // N | ||||
| int lidx = get_local_id(0); | int lidx = get_local_id(0); | ||||
| @@ -34,7 +34,15 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only ima | |||||
| result += temp[lidx][2]; | result += temp[lidx][2]; | ||||
| result += temp[lidx][3]; | result += temp[lidx][3]; | ||||
| result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); | ||||
| result = clamp(result, (FLT)(act_min), (FLT)(act_max)); | |||||
| if (act_type == ActivationType_RELU) { | |||||
| result = max(result, (FLT4)(0.0f)); | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| result = clamp(result, (FLT4)(0.0f), (FLT4)(6.0f)); | |||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0 = exp(result); | |||||
| FLT4 exp1 = exp(-result); | |||||
| result = (exp0 - exp1) / (exp0 + exp1); | |||||
| } | |||||
| WRITE_IMAGE(output, (int2)(gidx, gidz), result); | WRITE_IMAGE(output, (int2)(gidx, gidz), result); | ||||
| } | } | ||||
| } | } | ||||
| @@ -1,10 +1,6 @@ | |||||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | #pragma OPENCL EXTENSION cl_khr_fp16 : enable | ||||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | ||||
| #define ActType_No 0 | |||||
| #define ActType_Relu 1 | |||||
| #define ActType_Sigmod 2 | |||||
| #define ActType_Relu6 3 | |||||
| #define C4NUM 4 | #define C4NUM 4 | ||||
| __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, | __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, | ||||
| @@ -19,9 +15,9 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale | |||||
| FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); | FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); | ||||
| FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); | FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); | ||||
| FLT4 out = in * s + o; | FLT4 out = in * s + o; | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out = max(out, (FLT4)(0.0f)); | out = max(out, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(X, Y), out); | WRITE_IMAGE(output, (int2)(X, Y), out); | ||||
| @@ -37,9 +33,9 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float | |||||
| FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); | FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); | ||||
| FLT4 out = in * (FLT)scale + (FLT)offset; | FLT4 out = in * (FLT)scale + (FLT)offset; | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out = max(out, (FLT4)(0.0f)); | out = max(out, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(X, Y), out); | WRITE_IMAGE(output, (int2)(X, Y), out); | ||||
| @@ -57,9 +53,9 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca | |||||
| FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); | FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); | ||||
| FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); | FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); | ||||
| FLT4 out = in * s + o; | FLT4 out = in * s + o; | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out = max(out, (FLT4)(0.0f)); | out = max(out, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(X, Y), out); | WRITE_IMAGE(output, (int2)(X, Y), out); | ||||
| @@ -94,9 +90,9 @@ __kernel void Scale_H_IMG(__read_only image2d_t input, __read_only image2d_t sca | |||||
| o_real = o.w; | o_real = o.w; | ||||
| } | } | ||||
| FLT4 out = in * s_real + o_real; | FLT4 out = in * s_real + o_real; | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| out = max(out, (FLT4)(0.0f)); | out = max(out, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(X, Y), out); | WRITE_IMAGE(output, (int2)(X, Y), out); | ||||
| @@ -4,9 +4,6 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | |||||
| #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) | ||||
| #define ActType_Relu 1 | |||||
| #define ActType_Relu6 3 | |||||
| constant FLT Bt[36] = { | constant FLT Bt[36] = { | ||||
| 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, | 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, | ||||
| 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, | 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, | ||||
| @@ -17,8 +14,8 @@ constant FLT Bt[36] = { | |||||
| }; | }; | ||||
| __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, | __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, | ||||
| const int4 input_shape, // N H W CI_SLICES | |||||
| const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES | |||||
| int4 input_shape, // N H W CI_SLICES | |||||
| int4 output_shape) { // N 36 H/4*W/4 CI_SLICES | |||||
| #define PAD 1 | #define PAD 1 | ||||
| int tile_xy = get_global_id(0); | int tile_xy = get_global_id(0); | ||||
| int row = get_global_id(1); | int row = get_global_id(1); | ||||
| @@ -63,8 +60,8 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_ | |||||
| } | } | ||||
| __kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | __kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, | ||||
| const int4 input_shape, // N 36 H/4*W/4 CI_SLICES | |||||
| const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES | |||||
| int4 input_shape, // N 36 H/4*W/4 CI_SLICES | |||||
| int4 output_shape) { // N 36 H/4*W/4 CO_SLICES | |||||
| #define H 36 | #define H 36 | ||||
| int w = get_global_id(0) * 2; | int w = get_global_id(0) * 2; | ||||
| int h = get_global_id(1); | int h = get_global_id(1); | ||||
| @@ -134,9 +131,9 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000 | |||||
| 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; | 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; | ||||
| __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, | __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, | ||||
| const int4 input_shape, // N 36 H/4*W/4 CO_SLICES | |||||
| const int4 output_shape, // N H W CO_SLICES | |||||
| const int act_type) { | |||||
| int4 input_shape, // N 36 H/4*W/4 CO_SLICES | |||||
| int4 output_shape, // N H W CO_SLICES | |||||
| int act_type, float alpha) { | |||||
| int tile_xy = get_global_id(0); | int tile_xy = get_global_id(0); | ||||
| int row = get_global_id(1); | int row = get_global_id(1); | ||||
| int slice = get_global_id(2); | int slice = get_global_id(2); | ||||
| @@ -175,10 +172,21 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_ | |||||
| acc += bias[slice]; | acc += bias[slice]; | ||||
| } | } | ||||
| if (act_type == ActType_Relu) { | |||||
| if (act_type == ActivationType_RELU) { | |||||
| acc = max(acc, (FLT4)(0.0f)); | acc = max(acc, (FLT4)(0.0f)); | ||||
| } else if (act_type == ActType_Relu6) { | |||||
| } else if (act_type == ActivationType_RELU6) { | |||||
| acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); | acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); | ||||
| } else if (act_type == ActivationType_TANH) { | |||||
| FLT4 exp0 = exp(acc); | |||||
| FLT4 exp1 = exp(-acc); | |||||
| acc = (exp0 - exp1) / (exp0 + exp1); | |||||
| } else if (act_type == ActivationType_LEAKY_RELU) { | |||||
| if (acc.x < 0) acc.x *= alpha; | |||||
| if (acc.y < 0) acc.y *= alpha; | |||||
| if (acc.z < 0) acc.z *= alpha; | |||||
| if (acc.w < 0) acc.w *= alpha; | |||||
| } else if (act_type == ActivationType_SIGMOID) { | |||||
| acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc)); | |||||
| } | } | ||||
| WRITE_IMAGE(output, (int2)(x_idx, oh), acc); | WRITE_IMAGE(output, (int2)(x_idx, oh), acc); | ||||
| @@ -31,93 +31,46 @@ using mindspore::lite::KernelRegistrar; | |||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::lite::opencl::MemType; | using mindspore::lite::opencl::MemType; | ||||
| using mindspore::schema::ActivationType_NO_ACTIVATION; | |||||
| using mindspore::schema::ActivationType_RELU; | |||||
| using mindspore::schema::ActivationType_RELU6; | |||||
| using mindspore::schema::PrimitiveType_Eltwise; | using mindspore::schema::PrimitiveType_Eltwise; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| std::set<schema::PrimitiveType> SupportedOpenCLArithmetics = {PrimitiveType_Mul, | |||||
| PrimitiveType_Add, | |||||
| PrimitiveType_Sub, | |||||
| PrimitiveType_Div, | |||||
| PrimitiveType_LogicalAnd, | |||||
| PrimitiveType_LogicalOr, | |||||
| PrimitiveType_Maximum, | |||||
| PrimitiveType_Minimum, | |||||
| PrimitiveType_FloorDiv, | |||||
| PrimitiveType_FloorMod, | |||||
| PrimitiveType_SquaredDifference, | |||||
| PrimitiveType_Equal, | |||||
| PrimitiveType_NotEqual, | |||||
| PrimitiveType_Less, | |||||
| PrimitiveType_LessEqual, | |||||
| PrimitiveType_Greater, | |||||
| PrimitiveType_GreaterEqual, | |||||
| PrimitiveType_Eltwise}; | |||||
| int ArithmeticOpenCLKernel::CheckSpecs() { | int ArithmeticOpenCLKernel::CheckSpecs() { | ||||
| auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||||
| if (arithmetic_parameter->broadcasting_) { | |||||
| element_flag_ = false; | |||||
| kernel_name_ = "BroadcastNHWC4"; | |||||
| if (out_tensors_[0]->shape()[0] > 1) { | |||||
| MS_LOG(ERROR) << "Broadcasting don't support N > 1"; | |||||
| return RET_ERROR; | |||||
| } | |||||
| } else { | |||||
| kernel_name_ = "Element"; | |||||
| auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||||
| if (param->broadcasting_ && out_tensors_[0]->shape()[0] > 1) { | |||||
| MS_LOG(ERROR) << "Broadcasting don't support N > 1"; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| switch (op_parameter_->type_) { | |||||
| case PrimitiveType_Mul: | |||||
| kernel_name_ += "Mul"; | |||||
| break; | |||||
| case PrimitiveType_Add: | |||||
| kernel_name_ += "Add"; | |||||
| break; | |||||
| case PrimitiveType_Sub: | |||||
| kernel_name_ += "Sub"; | |||||
| break; | |||||
| case PrimitiveType_Div: | |||||
| kernel_name_ += "Div"; | |||||
| break; | |||||
| case PrimitiveType_LogicalAnd: | |||||
| kernel_name_ += "And"; | |||||
| break; | |||||
| case PrimitiveType_LogicalOr: | |||||
| kernel_name_ += "Or"; | |||||
| break; | |||||
| case PrimitiveType_Maximum: | |||||
| kernel_name_ += "Max"; | |||||
| break; | |||||
| case PrimitiveType_Minimum: | |||||
| kernel_name_ += "Min"; | |||||
| break; | |||||
| case PrimitiveType_FloorDiv: | |||||
| kernel_name_ += "FloorDiv"; | |||||
| break; | |||||
| case PrimitiveType_FloorMod: | |||||
| kernel_name_ += "FloorMod"; | |||||
| break; | |||||
| case PrimitiveType_SquaredDifference: | |||||
| kernel_name_ += "SquaredDifference"; | |||||
| break; | |||||
| case PrimitiveType_Equal: | |||||
| kernel_name_ += "Equal"; | |||||
| break; | |||||
| case PrimitiveType_NotEqual: | |||||
| kernel_name_ += "NotEqual"; | |||||
| break; | |||||
| case PrimitiveType_Less: | |||||
| kernel_name_ += "Less"; | |||||
| break; | |||||
| case PrimitiveType_LessEqual: | |||||
| kernel_name_ += "LessEqual"; | |||||
| break; | |||||
| case PrimitiveType_Greater: | |||||
| kernel_name_ += "Greater"; | |||||
| break; | |||||
| case PrimitiveType_GreaterEqual: | |||||
| kernel_name_ += "GreaterEqual"; | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_; | |||||
| return RET_ERROR; | |||||
| if (SupportedOpenCLArithmetics.count(static_cast<schema::PrimitiveType>(op_parameter_->type_)) == 0) { | |||||
| MS_LOG(ERROR) << "UnSupported Operator: " << schema::EnumNamesPrimitiveType()[op_parameter_->type_]; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| switch (arithmetic_parameter->activation_type_) { | |||||
| case schema::ActivationType_NO_ACTIVATION: | |||||
| break; | |||||
| case schema::ActivationType_RELU: | |||||
| activation_min_ = 0.f; | |||||
| break; | |||||
| case schema::ActivationType_RELU6: | |||||
| activation_min_ = 0.f; | |||||
| activation_max_ = 6.f; | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_; | |||||
| return RET_ERROR; | |||||
| if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU || | |||||
| param->activation_type_ == ActivationType_RELU6)) { | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -240,11 +193,18 @@ int ArithmeticOpenCLKernel::Prepare() { | |||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_); | kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_); | ||||
| #else | #else | ||||
| if (out_mem_type_ == MemType::IMG) { | |||||
| kernel_name_ += "_IMG"; | |||||
| } else { | |||||
| kernel_name_ += "_BUF"; | |||||
| auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_); | |||||
| element_flag_ = !param->broadcasting_; | |||||
| kernel_name_ = param->broadcasting_ ? "BroadcastNHWC4" : "Element"; | |||||
| kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_]; | |||||
| if (param->activation_type_ == ActivationType_RELU) { | |||||
| activation_min_ = 0.f; | |||||
| } else if (param->activation_type_ == ActivationType_RELU6) { | |||||
| activation_min_ = 0.f; | |||||
| activation_max_ = 6.f; | |||||
| } | } | ||||
| std::string program_name = "Arithmetic"; | std::string program_name = "Arithmetic"; | ||||
| std::string source = arithmetic_source; | std::string source = arithmetic_source; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| @@ -18,12 +18,15 @@ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ | #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ | ||||
| #include <vector> | #include <vector> | ||||
| #include <set> | |||||
| #include <string> | #include <string> | ||||
| #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" | #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" | ||||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | #include "src/runtime/kernel/opencl/opencl_kernel.h" | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| extern std::set<schema::PrimitiveType> SupportedOpenCLArithmetics; | |||||
| class ArithmeticOpenCLKernel : public OpenCLKernel { | class ArithmeticOpenCLKernel : public OpenCLKernel { | ||||
| public: | public: | ||||
| ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| @@ -23,6 +23,7 @@ | |||||
| #include "src/runtime/kernel/opencl/utils.h" | #include "src/runtime/kernel/opencl/utils.h" | ||||
| #include "src/kernel_registry.h" | #include "src/kernel_registry.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| #include "schema/ops_generated.h" | |||||
| #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" | #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" | ||||
| #include "src/runtime/kernel/opencl/cl/winograd.cl.inc" | #include "src/runtime/kernel/opencl/cl/winograd.cl.inc" | ||||
| @@ -30,6 +31,11 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::schema::ActivationType_LEAKY_RELU; | |||||
| using mindspore::schema::ActivationType_RELU; | |||||
| using mindspore::schema::ActivationType_RELU6; | |||||
| using mindspore::schema::ActivationType_SIGMOID; | |||||
| using mindspore::schema::ActivationType_TANH; | |||||
| using mindspore::schema::PrimitiveType_Conv2D; | using mindspore::schema::PrimitiveType_Conv2D; | ||||
| using mindspore::schema::PrimitiveType_FullConnection; | using mindspore::schema::PrimitiveType_FullConnection; | ||||
| @@ -59,6 +65,10 @@ int Conv2DOpenCLKernel::CheckSpecs() { | |||||
| MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; | MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (param_->act_type_ != ActType_No && param_->act_type_ != ActType_Relu && param_->act_type_ != ActType_Relu6) { | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param_->act_type_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -72,9 +82,16 @@ int Conv2DOpenCLKernel::Prepare() { | |||||
| IH_ = input_shape[1]; | IH_ = input_shape[1]; | ||||
| IW_ = input_shape[2]; | IW_ = input_shape[2]; | ||||
| CI_ = input_shape[3]; | CI_ = input_shape[3]; | ||||
| OH_ = output_shape[1]; | |||||
| OW_ = output_shape[2]; | |||||
| CO_ = output_shape[3]; | |||||
| // for fusion Conv2D and Reshape(N11C->NC) | |||||
| if (output_shape.size() == 2) { | |||||
| OH_ = 1; | |||||
| OW_ = 1; | |||||
| CO_ = output_shape[1]; | |||||
| } else { // output_shape.size()==4 | |||||
| OH_ = output_shape[1]; | |||||
| OW_ = output_shape[2]; | |||||
| CO_ = output_shape[3]; | |||||
| } | |||||
| CI_SLICES_ = UP_DIV(CI_, C4NUM); | CI_SLICES_ = UP_DIV(CI_, C4NUM); | ||||
| CO_SLICES_ = UP_DIV(CO_, C4NUM); | CO_SLICES_ = UP_DIV(CO_, C4NUM); | ||||
| KH_ = param_->kernel_h_; | KH_ = param_->kernel_h_; | ||||
| @@ -91,7 +108,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||||
| if (use_winograd_) { | if (use_winograd_) { | ||||
| MS_LOG(DEBUG) << "use winograd"; | MS_LOG(DEBUG) << "use winograd"; | ||||
| std::string program_name = "winograd"; | std::string program_name = "winograd"; | ||||
| ocl_runtime_->LoadSource(program_name, winograd_source); | |||||
| ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); | |||||
| ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); | ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); | ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); | ||||
| ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); | ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); | ||||
| @@ -100,7 +117,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||||
| std::string program_name = "conv2d"; | std::string program_name = "conv2d"; | ||||
| std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + | std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + | ||||
| std::to_string(block_size_.C); | std::to_string(block_size_.C); | ||||
| ocl_runtime_->LoadSource(program_name, conv2d_source); | |||||
| ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ||||
| } | } | ||||
| @@ -347,13 +364,6 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { | |||||
| } | } | ||||
| void Conv2DOpenCLKernel::SetConstArgs() { | void Conv2DOpenCLKernel::SetConstArgs() { | ||||
| auto param = reinterpret_cast<ConvParameter *>(op_parameter_); | |||||
| cl_int act_type = 0; | |||||
| if (param->act_type_ == ActType_Relu) { | |||||
| act_type = 1; | |||||
| } else if (param->act_type_ == ActType_Relu6) { | |||||
| act_type = 3; | |||||
| } | |||||
| cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; | cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; | ||||
| cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; | cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; | ||||
| @@ -380,12 +390,13 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); | ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); | ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, act_type); | |||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast<cl_int>(param_->act_type_)); | |||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast<cl_float>(alpha_)); | |||||
| } else { | } else { | ||||
| arg_cn = 2; | arg_cn = 2; | ||||
| cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; | |||||
| cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; | |||||
| cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; | |||||
| cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_}; | |||||
| cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_}; | |||||
| cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_}; | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | ||||
| @@ -393,7 +404,8 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast<cl_int>(param_->act_type_)); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(alpha_)); | |||||
| } | } | ||||
| } | } | ||||
| @@ -44,6 +44,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| int Tune() override; | int Tune() override; | ||||
| // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU | |||||
| float alpha_{0.0f}; | |||||
| private: | private: | ||||
| void SetBlockSize(); | void SetBlockSize(); | ||||
| int InitFilter(); | int InitFilter(); | ||||
| @@ -22,11 +22,14 @@ | |||||
| #ifndef PROGRAM_WITH_IL | #ifndef PROGRAM_WITH_IL | ||||
| #include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc" | #include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc" | ||||
| #endif | #endif | ||||
| #include "src/runtime/kernel/opencl/utils.h" | |||||
| using mindspore::kernel::KERNEL_ARCH::kGPU; | using mindspore::kernel::KERNEL_ARCH::kGPU; | ||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::schema::ActivationType_RELU; | |||||
| using mindspore::schema::ActivationType_RELU6; | |||||
| using mindspore::schema::PrimitiveType_DeConv2D; | using mindspore::schema::PrimitiveType_DeConv2D; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| @@ -38,6 +41,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() { | |||||
| MS_LOG(ERROR) << "only support kernel - stride == 2 * pad"; | MS_LOG(ERROR) << "only support kernel - stride == 2 * pad"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) { | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -47,7 +54,7 @@ int Conv2dTransposeOpenCLKernel::Prepare() { | |||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | ||||
| #else | #else | ||||
| std::string source = conv2d_transpose_source; | |||||
| std::string source = GetActDefines() + conv2d_transpose_source; | |||||
| std::string program_name = "conv2d_transpose"; | std::string program_name = "conv2d_transpose"; | ||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ||||
| @@ -102,6 +109,7 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); | ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size); | ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); | ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, static_cast<cl_int>(param->act_type_)); | |||||
| } | } | ||||
| int Conv2dTransposeOpenCLKernel::InitWeights() { | int Conv2dTransposeOpenCLKernel::InitWeights() { | ||||
| @@ -29,48 +29,38 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::RET_ERROR; | using mindspore::lite::RET_ERROR; | ||||
| using mindspore::lite::RET_OK; | using mindspore::lite::RET_OK; | ||||
| using mindspore::schema::ActivationType_RELU; | |||||
| using mindspore::schema::ActivationType_RELU6; | |||||
| using mindspore::schema::ActivationType_TANH; | |||||
| using mindspore::schema::PrimitiveType_FullConnection; | using mindspore::schema::PrimitiveType_FullConnection; | ||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int FullConnectionOpenCLKernel::Init() { | |||||
| // deleted soon | |||||
| return CheckSpecs(); | |||||
| } | |||||
| int FullConnectionOpenCLKernel::CheckSpecs() { | int FullConnectionOpenCLKernel::CheckSpecs() { | ||||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | ||||
| transposeA = param->a_transpose_; | |||||
| if (transposeA) { | |||||
| if (param->a_transpose_) { | |||||
| MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet."; | MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| transposeB = param->b_transpose_; | |||||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||||
| if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) || | if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) || | ||||
| (out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) { | (out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) { | ||||
| MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4"; | MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| switch (param->act_type_) { | |||||
| case ActType_No: | |||||
| break; | |||||
| case ActType_Relu: | |||||
| activation_min_ = 0.f; | |||||
| break; | |||||
| case ActType_Relu6: | |||||
| activation_min_ = 0.f; | |||||
| activation_max_ = 6.f; | |||||
| break; | |||||
| default: | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; | |||||
| return RET_ERROR; | |||||
| if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) { | |||||
| MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_; | |||||
| return RET_ERROR; | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| int FullConnectionOpenCLKernel::Prepare() { | int FullConnectionOpenCLKernel::Prepare() { | ||||
| std::string kernel_name = "FullConnection_NHWC4"; | |||||
| auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| transposeA = param->a_transpose_; | |||||
| transposeB = param->b_transpose_; | |||||
| enable_fp16_ = ocl_runtime_->GetFp16Enable(); | |||||
| std::string kernel_name = "FullConnection"; | |||||
| inShape = GpuTensorInfo(in_tensors_[0]); | inShape = GpuTensorInfo(in_tensors_[0]); | ||||
| outShape = GpuTensorInfo(out_tensors_[0]); | outShape = GpuTensorInfo(out_tensors_[0]); | ||||
| #ifdef PROGRAM_WITH_IL | #ifdef PROGRAM_WITH_IL | ||||
| @@ -78,7 +68,7 @@ int FullConnectionOpenCLKernel::Prepare() { | |||||
| #else | #else | ||||
| std::string source = fullconnection_source; | std::string source = fullconnection_source; | ||||
| std::string program_name = "FullConnection"; | std::string program_name = "FullConnection"; | ||||
| ocl_runtime_->LoadSource(program_name, source); | |||||
| ocl_runtime_->LoadSource(program_name, GetActDefines() + source); | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ||||
| #endif | #endif | ||||
| auto ret = InitWeights(); | auto ret = InitWeights(); | ||||
| @@ -200,11 +190,11 @@ void FullConnectionOpenCLKernel::SetConstArgs() { | |||||
| static_cast<int>(inShape.C)}; | static_cast<int>(inShape.C)}; | ||||
| cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)}; | cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)}; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); | ||||
| auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_)); | |||||
| } | } | ||||
| int FullConnectionOpenCLKernel::Run() { | int FullConnectionOpenCLKernel::Run() { | ||||
| @@ -37,7 +37,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { | |||||
| int InitWeights() override; | int InitWeights() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Init() override; | |||||
| int Tune() override { return lite::RET_OK; } | int Tune() override { return lite::RET_OK; } | ||||
| private: | private: | ||||
| @@ -46,8 +45,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { | |||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| bool transposeA{false}; | bool transposeA{false}; | ||||
| bool transposeB{true}; | bool transposeB{true}; | ||||
| float activation_min_{-FLT_MAX}; | |||||
| float activation_max_{FLT_MAX}; | |||||
| GpuTensorInfo inShape = GpuTensorInfo(nullptr); | GpuTensorInfo inShape = GpuTensorInfo(nullptr); | ||||
| GpuTensorInfo outShape = GpuTensorInfo(nullptr); | GpuTensorInfo outShape = GpuTensorInfo(nullptr); | ||||
| }; | }; | ||||
| @@ -35,6 +35,15 @@ using mindspore::schema::PrimitiveType_Scale; | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| int ScaleOpenCLKernel::CheckSpecs() { | |||||
| auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | |||||
| if (param->activation_type_ != ActType_No && param->activation_type_ != ActType_Relu && | |||||
| param->activation_type_ != ActType_Relu6) { | |||||
| return RET_ERROR; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| ScaleOpenCLKernel::~ScaleOpenCLKernel() { | ScaleOpenCLKernel::~ScaleOpenCLKernel() { | ||||
| auto allocator = ocl_runtime_->GetAllocator(); | auto allocator = ocl_runtime_->GetAllocator(); | ||||
| if (scale_ptr_ != nullptr) { | if (scale_ptr_ != nullptr) { | ||||
| @@ -185,7 +194,7 @@ int ScaleOpenCLKernel::Init() { | |||||
| kernel_name += "_BUF"; | kernel_name += "_BUF"; | ||||
| } | } | ||||
| std::string program_name = "Scale"; | std::string program_name = "Scale"; | ||||
| std::string source = scale_source; | |||||
| std::string source = GetActDefines() + scale_source; | |||||
| ocl_runtime_->LoadSource(program_name, source); | ocl_runtime_->LoadSource(program_name, source); | ||||
| error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | ||||
| #endif | #endif | ||||
| @@ -202,13 +211,6 @@ int ScaleOpenCLKernel::Init() { | |||||
| int ScaleOpenCLKernel::Run() { | int ScaleOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | MS_LOG(DEBUG) << this->name() << " Running!"; | ||||
| auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_); | ||||
| cl_int act_type = 0; | |||||
| if (param->activation_type_ == ActType_Relu) { | |||||
| act_type = 1; | |||||
| } else if (param->activation_type_ == ActType_Relu6) { | |||||
| act_type = 3; | |||||
| } | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| if (weight_vector_flag_) { | if (weight_vector_flag_) { | ||||
| @@ -242,7 +244,7 @@ int ScaleOpenCLKernel::Run() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); | ||||
| } | } | ||||
| } | } | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_); | ocl_runtime_->RunKernel(kernel_, global_size_, local_size_); | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -30,6 +30,7 @@ class ScaleOpenCLKernel : public OpenCLKernel { | |||||
| : OpenCLKernel(parameter, inputs, outputs) {} | : OpenCLKernel(parameter, inputs, outputs) {} | ||||
| ~ScaleOpenCLKernel() override; | ~ScaleOpenCLKernel() override; | ||||
| int CheckSpecs() override; | |||||
| int Init() override; | int Init() override; | ||||
| int Run() override; | int Run() override; | ||||
| int InitWeights() override; | int InitWeights() override; | ||||
| @@ -0,0 +1,23 @@ | |||||
| /** | |||||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #include "src/runtime/kernel/opencl/opencl_subgraph.h" | |||||
| namespace mindspore::kernel { | |||||
| void OpenCLSubGraph::Fusion() {} | |||||
| } // namespace mindspore::kernel | |||||
| @@ -222,6 +222,7 @@ class OpenCLKernel : public LiteKernel { | |||||
| lite::opencl::MemType GetMemType() { return out_mem_type_; } | lite::opencl::MemType GetMemType() { return out_mem_type_; } | ||||
| void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | ||||
| OpParameter *GetParameter() { return op_parameter_; } | |||||
| virtual std::vector<BaseTuningParameter> GenerateTuningParam() { | virtual std::vector<BaseTuningParameter> GenerateTuningParam() { | ||||
| size_t ndim = global_size_.size(); | size_t ndim = global_size_.size(); | ||||
| @@ -352,7 +352,7 @@ int OpenCLSubGraph::Prepare() { | |||||
| MS_LOG(ERROR) << "Create OpenCLExecutor fail"; | MS_LOG(ERROR) << "Create OpenCLExecutor fail"; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| Fusion(); | |||||
| auto ret = Init(); | auto ret = Init(); | ||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "OpenCL subgraph init fail"; | MS_LOG(ERROR) << "OpenCL subgraph init fail"; | ||||
| @@ -78,6 +78,9 @@ class OpenCLSubGraph : public SubGraphKernel { | |||||
| std::set<LiteKernel *> nodes_set_; | std::set<LiteKernel *> nodes_set_; | ||||
| lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | ||||
| lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr}; | lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr}; | ||||
| private: | |||||
| void Fusion(); | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -25,6 +25,11 @@ | |||||
| using mindspore::lite::KernelRegistrar; | using mindspore::lite::KernelRegistrar; | ||||
| using mindspore::lite::opencl::MemType; | using mindspore::lite::opencl::MemType; | ||||
| using mindspore::schema::ActivationType_LEAKY_RELU; | |||||
| using mindspore::schema::ActivationType_RELU; | |||||
| using mindspore::schema::ActivationType_RELU6; | |||||
| using mindspore::schema::ActivationType_SIGMOID; | |||||
| using mindspore::schema::ActivationType_TANH; | |||||
| namespace mindspore::lite { | namespace mindspore::lite { | ||||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | ||||
| @@ -40,6 +45,15 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| std::string GetActDefines() { | |||||
| static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) + | |||||
| "\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) + | |||||
| "\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) + | |||||
| "\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) + | |||||
| "\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n"; | |||||
| return act_defines; | |||||
| } | |||||
| int GetUpPow2(int n) { | int GetUpPow2(int n) { | ||||
| int i = 0; | int i = 0; | ||||
| int j = 0; | int j = 0; | ||||
| @@ -34,6 +34,8 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con | |||||
| namespace mindspore::kernel { | namespace mindspore::kernel { | ||||
| std::string GetActDefines(); | |||||
| int GetUpPow2(int n); | int GetUpPow2(int n); | ||||
| int GetMaxDivisor(int x, int divisor); | int GetMaxDivisor(int x, int divisor); | ||||
| @@ -84,6 +84,7 @@ if (SUPPORT_GPU) | |||||
| ${KERNEL_OP_SRC} | ${KERNEL_OP_SRC} | ||||
| ${GPU_KERNEL_OP_SRC} | ${GPU_KERNEL_OP_SRC} | ||||
| ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc | ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc | ||||
| ${LITE_DIR}/src/runtime/kernel/opencl/opencl_fusion.cc | |||||
| ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc | ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc | ||||
| ) | ) | ||||
| endif() | endif() | ||||