From 5be7fe3cae16b4ea755230c67924278f8a76be97 Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Sat, 28 Nov 2020 19:41:05 +0800 Subject: [PATCH] add opencl_fusion --- mindspore/lite/src/CMakeLists.txt | 1 + .../runtime/kernel/opencl/cl/arithmetic.cl | 196 +++++--------- .../src/runtime/kernel/opencl/cl/conv2d.cl | 251 +++++++++++------- .../kernel/opencl/cl/conv2d_transpose.cl | 15 +- .../kernel/opencl/cl/fullconnection.cl | 16 +- .../src/runtime/kernel/opencl/cl/scale.cl | 20 +- .../src/runtime/kernel/opencl/cl/winograd.cl | 32 ++- .../kernel/opencl/kernel/arithmetic.cc | 128 +++------ .../runtime/kernel/opencl/kernel/arithmetic.h | 3 + .../runtime/kernel/opencl/kernel/conv2d.cc | 46 ++-- .../src/runtime/kernel/opencl/kernel/conv2d.h | 3 + .../kernel/opencl/kernel/conv2d_transpose.cc | 10 +- .../kernel/opencl/kernel/fullconnection.cc | 42 ++- .../kernel/opencl/kernel/fullconnection.h | 3 - .../src/runtime/kernel/opencl/kernel/scale.cc | 20 +- .../src/runtime/kernel/opencl/kernel/scale.h | 1 + .../runtime/kernel/opencl/opencl_fusion.cc | 23 ++ .../src/runtime/kernel/opencl/opencl_kernel.h | 1 + .../runtime/kernel/opencl/opencl_subgraph.cc | 2 +- .../runtime/kernel/opencl/opencl_subgraph.h | 3 + .../lite/src/runtime/kernel/opencl/utils.cc | 14 + .../lite/src/runtime/kernel/opencl/utils.h | 2 + mindspore/lite/test/CMakeLists.txt | 1 + 23 files changed, 446 insertions(+), 387 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index b0bccd135f..b4cbd79109 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -42,6 +42,7 @@ if (SUPPORT_GPU) set(LITE_SRC ${LITE_SRC} ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_fusion.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index ae3cb51dc1..c8f50ffe01 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -2,8 +2,8 @@ #define divide_no_check(a, b) (a / b) __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -17,8 +17,8 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= 
output_shape.x || Y >= output_shape.y) { @@ -32,8 +32,8 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -47,8 +47,8 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -62,8 +62,8 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLogicalAnd(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -77,8 +77,8 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLogicalOr(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -92,7 +92,7 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, +__kernel void ElementMaximum(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -107,7 +107,7 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, +__kernel void ElementMinimum(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -122,9 +122,8 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), 
result); } -__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementFloorDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -138,9 +137,8 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementFloorMod(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -154,9 +152,9 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementSquaredDifference(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -170,8 +168,8 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -186,9 +184,8 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementNotEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -202,8 +199,8 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLess(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= 
output_shape.x || Y >= output_shape.y) { @@ -217,9 +214,8 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementLessEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -233,8 +229,8 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementGreater(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -248,9 +244,9 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementGreaterEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -264,9 +260,9 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Add(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -293,9 +289,9 @@ __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Sub(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -322,9 +318,9 @@ __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, 
(int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Mul(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -351,9 +347,9 @@ __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Div(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -380,8 +376,8 @@ __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLogicalAnd(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -394,8 +390,8 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLogicalOr(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -408,7 +404,7 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, +__kernel void BroadcastMaximum(__read_only image2d_t input_a, float b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -422,7 +418,7 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, +__kernel void BroadcastMinimum(__read_only image2d_t input_a, float b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -436,8 +432,8 @@ __kernel void 
BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastFloorDiv(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -450,8 +446,8 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastFloorMod(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -464,8 +460,8 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastSquaredDifference(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -478,8 +474,8 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -492,8 +488,8 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastNotEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -506,8 +502,8 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLess(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -520,8 +516,8 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ 
WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLessEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -534,8 +530,8 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastGreater(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -548,8 +544,8 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastGreaterEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -561,55 +557,3 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(X, Y), result); } - -__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] + input_b[idx]; -} - -__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - input_b[idx]; -} - -__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b[idx]; -} - -__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b[idx]; -} - -__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] + (FLT)b; -} - -__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - (FLT)b; -} - -__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * (FLT)b; -} - -__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = 
get_global_id(0); - if (idx >= n) return; - output[idx] = divide_no_check(input_a[idx], (FLT)b); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl index a89146d904..f603ef5cf1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl @@ -6,51 +6,59 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP #define MAX_IMAGE2D_SIZE 65535 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_Relu 1 -#define ActType_Relu6 3 - -#define DEFINE_ARGS \ - const int N = input_shape.x; \ - const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ - const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ - const int KH = kernel_stride.x, KW = kernel_stride.y; \ - const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ - const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ - const int dilationH = dilation.x, dilationW = dilation.y; \ - \ - const int n_oh = get_global_id(0); \ - const int ow = get_global_id(1) * BlockW; \ - const int co_slice = get_global_id(2) * BlockC; \ - const int OH_SLICES = UP_DIV(OH, BlockH); \ - const int n = n_oh / OH_SLICES; \ - const int oh = (n_oh % OH_SLICES) * BlockH; \ - if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ - return; \ +#define DEFINE_ARGS \ + int N = input_shape.x; \ + int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ + int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ + int KH = kernel_stride.x, KW = kernel_stride.y; \ + int strideH = kernel_stride.z, strideW = kernel_stride.w; \ + int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ + int dilationH = dilation.x, dilationW = dilation.y; \ + \ + int n_oh = get_global_id(0); \ + int ow = get_global_id(1) * BlockW; \ + int co_slice = get_global_id(2) * BlockC; \ + int OH_SLICES = UP_DIV(OH, BlockH); \ + int n = n_oh / OH_SLICES; \ + int oh = (n_oh % OH_SLICES) * BlockH; \ + if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ + return; \ } +#define DO_TANH(data) \ + exp0 = exp(data); \ + exp1 = exp(-data); \ + data = (exp0 - exp1) / (exp0 + exp1); + +#define DO_LEAKY_RELU(data) \ + if (data.x < 0) data.x *= alpha; \ + if (data.y < 0) data.y *= alpha; \ + if (data.z < 0) data.z *= alpha; \ + if (data.w < 0) data.w *= alpha; + __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 1; - const int BlockW = 1; - const int BlockC = 1; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 1; + int BlockW = 1; + int BlockC = 1; DEFINE_ARGS; - const int oh0 = oh + 0; - const int n_oh0 = n * OH + oh0; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; + int oh0 = oh + 0; + int n_oh0 = n * OH + oh0; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; - const int y_idx0 = (ih0 >= 0 && 
ih0 < IH) ? n * IH + ih0 : -1; + int ih0 = kh * dilationH + oh0 * strideH - padTop; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -71,10 +79,17 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t out_h0_w0_c0 += bias[co_slice0]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -85,19 +100,19 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 1; - const int BlockC = 1; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 1; + int BlockC = 1; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -105,15 +120,15 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? 
n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -140,12 +155,22 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t out_h1_w0_c0 += bias[co_slice0]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h1_w0_c0); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -162,20 +187,20 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 1; - const int BlockC = 2; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 1; + int BlockC = 2; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; - const int co_slice1 = co_slice + 1; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; + int co_slice1 = co_slice + 1; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -185,15 +210,15 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? 
n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -231,16 +256,32 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t out_h1_w0_c1 += bias[co_slice1]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h1_w0_c0); + DO_TANH(out_h0_w0_c1); + DO_TANH(out_h1_w0_c1); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + DO_LEAKY_RELU(out_h0_w0_c1); + DO_LEAKY_RELU(out_h1_w0_c1); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); + out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); + out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -269,21 +310,21 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 2; - const int BlockC = 2; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 2; + int BlockC = 2; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int ow1 = ow + 1; - const int co_slice0 = co_slice + 0; - const int co_slice1 = co_slice + 1; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int ow1 = ow + 1; + int co_slice0 = co_slice + 0; + int co_slice1 = co_slice + 1; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -297,15 +338,15 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? 
n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; int x_idx0 = iw0 * CI_SLICES; int x_idx1 = iw1 * CI_SLICES; @@ -368,7 +409,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h1_w1_c1 += bias[co_slice1]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); @@ -377,7 +418,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); @@ -386,6 +427,34 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h0_w1_c0); + DO_TANH(out_h1_w0_c0); + DO_TANH(out_h1_w1_c0); + DO_TANH(out_h0_w0_c1); + DO_TANH(out_h0_w1_c1); + DO_TANH(out_h1_w0_c1); + DO_TANH(out_h1_w1_c1); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h0_w1_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + DO_LEAKY_RELU(out_h1_w1_c0); + DO_LEAKY_RELU(out_h0_w0_c1); + DO_LEAKY_RELU(out_h0_w1_c1); + DO_LEAKY_RELU(out_h1_w0_c1); + DO_LEAKY_RELU(out_h1_w1_c1); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); + out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0)); + out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); + out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1)); + out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); + out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl index 591243c719..2c2afd7fc7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl @@ -2,7 +2,7 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, - int2 stride, int2 padding, int4 src_size, int4 dst_size) { + int2 stride, int2 padding, 
int4 src_size, int4 dst_size, int act_type) { int dst_h = get_global_id(0); int rem_h = dst_h % stride.x; int ceil_h = dst_h / stride.x; @@ -70,6 +70,19 @@ __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_onl r1 += bias_val; r2 += bias_val; r3 += bias_val; + + if (act_type == ActivationType_RELU) { + r0 = max(r0, (FLT4)(0.0f)); + r1 = max(r1, (FLT4)(0.0f)); + r2 = max(r2, (FLT4)(0.0f)); + r3 = max(r3, (FLT4)(0.0f)); + } else if (act_type == ActivationType_RELU6) { + r0 = clamp(r0, (FLT4)(0.0f), (FLT4)(6.0f)); + r1 = clamp(r1, (FLT4)(0.0f), (FLT4)(6.0f)); + r2 = clamp(r2, (FLT4)(0.0f), (FLT4)(6.0f)); + r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f)); + } + WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl index 8b0cb16095..d09f93cfde 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl @@ -2,9 +2,9 @@ #define C4NUM 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - __read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min, - float act_max) { + +__kernel void FullConnection(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, + __read_only image2d_t bias, int4 in_shape, int2 out_shape, int act_type) { int gidx = get_global_id(0); // CO4 int gidz = get_global_id(2); // N int lidx = get_local_id(0); @@ -34,7 +34,15 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only ima result += temp[lidx][2]; result += temp[lidx][3]; result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); - result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + if (act_type == ActivationType_RELU) { + result = max(result, (FLT4)(0.0f)); + } else if (act_type == ActivationType_RELU6) { + result = clamp(result, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0 = exp(result); + FLT4 exp1 = exp(-result); + result = (exp0 - exp1) / (exp0 + exp1); + } WRITE_IMAGE(output, (int2)(gidx, gidz), result); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl index d47ddcdf0c..6fbdc4ff38 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl @@ -1,10 +1,6 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -#define ActType_No 0 -#define ActType_Relu 1 -#define ActType_Sigmod 2 -#define ActType_Relu6 3 #define C4NUM 4 __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, @@ -19,9 +15,9 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); FLT4 out = in * s + o; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if 
(act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -37,9 +33,9 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); FLT4 out = in * (FLT)scale + (FLT)offset; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -57,9 +53,9 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); FLT4 out = in * s + o; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -94,9 +90,9 @@ __kernel void Scale_H_IMG(__read_only image2d_t input, __read_only image2d_t sca o_real = o.w; } FLT4 out = in * s_real + o_real; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl index fad2a98e96..295265a6ad 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl @@ -4,9 +4,6 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_Relu 1 -#define ActType_Relu6 3 - constant FLT Bt[36] = { 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, @@ -17,8 +14,8 @@ constant FLT Bt[36] = { }; __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, // N H W CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES + int4 input_shape, // N H W CI_SLICES + int4 output_shape) { // N 36 H/4*W/4 CI_SLICES #define PAD 1 int tile_xy = get_global_id(0); int row = get_global_id(1); @@ -63,8 +60,8 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_ } __kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - const int4 input_shape, // N 36 H/4*W/4 CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES + int4 input_shape, // N 36 H/4*W/4 CI_SLICES + int4 output_shape) { // N 36 H/4*W/4 CO_SLICES #define H 36 int w = get_global_id(0) * 2; int h = get_global_id(1); @@ -134,9 +131,9 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, - const int4 input_shape, // N 36 H/4*W/4 CO_SLICES - const int4 output_shape, 
// N H W CO_SLICES
-                              const int act_type) {
+                              int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
+                              int4 output_shape,  // N H W CO_SLICES
+                              int act_type, float alpha) {
   int tile_xy = get_global_id(0);
   int row = get_global_id(1);
   int slice = get_global_id(2);
@@ -175,10 +172,21 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
     acc += bias[slice];
   }
 
-  if (act_type == ActType_Relu) {
+  if (act_type == ActivationType_RELU) {
     acc = max(acc, (FLT4)(0.0f));
-  } else if (act_type == ActType_Relu6) {
+  } else if (act_type == ActivationType_RELU6) {
     acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0 = exp(acc);
+    FLT4 exp1 = exp(-acc);
+    acc = (exp0 - exp1) / (exp0 + exp1);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    if (acc.x < 0) acc.x *= alpha;
+    if (acc.y < 0) acc.y *= alpha;
+    if (acc.z < 0) acc.z *= alpha;
+    if (acc.w < 0) acc.w *= alpha;
+  } else if (act_type == ActivationType_SIGMOID) {
+    acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
   }
 
   WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
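[Editor's note] For reference, the activation branches added to conv2d.cl and winograd.cl above all implement the same elementwise functions. A minimal scalar C++ sketch follows (illustration only, not part of the patch; ActType here is a stand-in for the schema-generated ActivationType enum, and the kernels apply the same math to each FLT4 component):

  #include <algorithm>
  #include <cmath>

  enum class ActType { None, Relu, Relu6, Tanh, LeakyRelu, Sigmoid };

  float ApplyAct(float x, ActType t, float alpha) {
    switch (t) {
      case ActType::Relu:      return std::max(x, 0.0f);
      case ActType::Relu6:     return std::min(std::max(x, 0.0f), 6.0f);
      case ActType::Tanh:      return (std::exp(x) - std::exp(-x)) / (std::exp(x) + std::exp(-x));
      case ActType::LeakyRelu: return x < 0.0f ? x * alpha : x;  // alpha comes from the fused PReLU weight
      case ActType::Sigmoid:   return 1.0f / (1.0f + std::exp(-x));
      default:                 return x;  // no activation
    }
  }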
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index c8aa34908e..1dccb1acf6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -31,93 +31,46 @@ using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_NO_ACTIVATION;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
 using mindspore::schema::PrimitiveType_Eltwise;
 
 namespace mindspore::kernel {
 
+std::set<schema::PrimitiveType> SupportedOpenCLArithmetics = {PrimitiveType_Mul,
+                                                              PrimitiveType_Add,
+                                                              PrimitiveType_Sub,
+                                                              PrimitiveType_Div,
+                                                              PrimitiveType_LogicalAnd,
+                                                              PrimitiveType_LogicalOr,
+                                                              PrimitiveType_Maximum,
+                                                              PrimitiveType_Minimum,
+                                                              PrimitiveType_FloorDiv,
+                                                              PrimitiveType_FloorMod,
+                                                              PrimitiveType_SquaredDifference,
+                                                              PrimitiveType_Equal,
+                                                              PrimitiveType_NotEqual,
+                                                              PrimitiveType_Less,
+                                                              PrimitiveType_LessEqual,
+                                                              PrimitiveType_Greater,
+                                                              PrimitiveType_GreaterEqual,
+                                                              PrimitiveType_Eltwise};
+
 int ArithmeticOpenCLKernel::CheckSpecs() {
-  auto *arithmetic_parameter = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
-  if (arithmetic_parameter->broadcasting_) {
-    element_flag_ = false;
-    kernel_name_ = "BroadcastNHWC4";
-    if (out_tensors_[0]->shape()[0] > 1) {
-      MS_LOG(ERROR) << "Broadcasting don't support N > 1";
-      return RET_ERROR;
-    }
-  } else {
-    kernel_name_ = "Element";
+  auto *param = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
+  if (param->broadcasting_ && out_tensors_[0]->shape()[0] > 1) {
+    MS_LOG(ERROR) << "Broadcasting doesn't support N > 1";
+    return RET_ERROR;
   }
-
-  switch (op_parameter_->type_) {
-    case PrimitiveType_Mul:
-      kernel_name_ += "Mul";
-      break;
-    case PrimitiveType_Add:
-      kernel_name_ += "Add";
-      break;
-    case PrimitiveType_Sub:
-      kernel_name_ += "Sub";
-      break;
-    case PrimitiveType_Div:
-      kernel_name_ += "Div";
-      break;
-    case PrimitiveType_LogicalAnd:
-      kernel_name_ += "And";
-      break;
-    case PrimitiveType_LogicalOr:
-      kernel_name_ += "Or";
-      break;
-    case PrimitiveType_Maximum:
-      kernel_name_ += "Max";
-      break;
-    case PrimitiveType_Minimum:
-      kernel_name_ += "Min";
-      break;
-    case PrimitiveType_FloorDiv:
-      kernel_name_ += "FloorDiv";
-      break;
-    case PrimitiveType_FloorMod:
-      kernel_name_ += "FloorMod";
-      break;
-    case PrimitiveType_SquaredDifference:
-      kernel_name_ += "SquaredDifference";
-      break;
-    case PrimitiveType_Equal:
-      kernel_name_ += "Equal";
-      break;
-    case PrimitiveType_NotEqual:
-      kernel_name_ += "NotEqual";
-      break;
-    case PrimitiveType_Less:
-      kernel_name_ += "Less";
-      break;
-    case PrimitiveType_LessEqual:
-      kernel_name_ += "LessEqual";
-      break;
-    case PrimitiveType_Greater:
-      kernel_name_ += "Greater";
-      break;
-    case PrimitiveType_GreaterEqual:
-      kernel_name_ += "GreaterEqual";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-      return RET_ERROR;
+  if (SupportedOpenCLArithmetics.count(static_cast<schema::PrimitiveType>(op_parameter_->type_)) == 0) {
+    MS_LOG(ERROR) << "Unsupported operator: " << schema::EnumNamesPrimitiveType()[op_parameter_->type_];
+    return RET_ERROR;
   }
-
-  switch (arithmetic_parameter->activation_type_) {
-    case schema::ActivationType_NO_ACTIVATION:
-      break;
-    case schema::ActivationType_RELU:
-      activation_min_ = 0.f;
-      break;
-    case schema::ActivationType_RELU6:
-      activation_min_ = 0.f;
-      activation_max_ = 6.f;
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
-      return RET_ERROR;
+  if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU ||
+        param->activation_type_ == ActivationType_RELU6)) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_;
+    return RET_ERROR;
   }
   return RET_OK;
 }
@@ -240,11 +193,18 @@ int ArithmeticOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_);
 #else
-  if (out_mem_type_ == MemType::IMG) {
-    kernel_name_ += "_IMG";
-  } else {
-    kernel_name_ += "_BUF";
+
+  auto *param = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
+  element_flag_ = !param->broadcasting_;
+  kernel_name_ = param->broadcasting_ ? "BroadcastNHWC4" : "Element";
+  kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_];
+  if (param->activation_type_ == ActivationType_RELU) {
+    activation_min_ = 0.f;
+  } else if (param->activation_type_ == ActivationType_RELU6) {
+    activation_min_ = 0.f;
+    activation_max_ = 6.f;
   }
+
   std::string program_name = "Arithmetic";
   std::string source = arithmetic_source;
   ocl_runtime_->LoadSource(program_name, source);
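[Editor's note] With the _IMG/_BUF suffixes and the per-operator switch gone, the kernel name is now composed directly from the schema's operator-name table. A minimal sketch of the scheme (illustration only; the table lookup via EnumNamesPrimitiveType() is the flatbuffers-generated one used in the hunk above):

  #include <string>

  // Name = ("Element" | "BroadcastNHWC4") + schema operator name, which must
  // match the renamed kernels in arithmetic.cl, e.g. "ElementAdd" or
  // "BroadcastNHWC4Mul".
  std::string ArithmeticKernelName(bool broadcasting, const char *schema_op_name) {
    std::string name = broadcasting ? "BroadcastNHWC4" : "Element";
    return name + schema_op_name;  // schema_op_name from EnumNamesPrimitiveType()[type]
  }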
"BroadcastNHWC4" : "Element"; + kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_]; + if (param->activation_type_ == ActivationType_RELU) { + activation_min_ = 0.f; + } else if (param->activation_type_ == ActivationType_RELU6) { + activation_min_ = 0.f; + activation_max_ = 6.f; } + std::string program_name = "Arithmetic"; std::string source = arithmetic_source; ocl_runtime_->LoadSource(program_name, source); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 7e49f8562f..c4d9f504da 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -18,12 +18,15 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ #include +#include #include #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" #include "src/runtime/kernel/opencl/opencl_kernel.h" namespace mindspore::kernel { +extern std::set SupportedOpenCLArithmetics; + class ArithmeticOpenCLKernel : public OpenCLKernel { public: ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index 549b420081..6d81fc9d5b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -23,6 +23,7 @@ #include "src/runtime/kernel/opencl/utils.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "schema/ops_generated.h" #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" #include "src/runtime/kernel/opencl/cl/winograd.cl.inc" @@ -30,6 +31,11 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::schema::ActivationType_LEAKY_RELU; +using mindspore::schema::ActivationType_RELU; +using mindspore::schema::ActivationType_RELU6; +using mindspore::schema::ActivationType_SIGMOID; +using mindspore::schema::ActivationType_TANH; using mindspore::schema::PrimitiveType_Conv2D; using mindspore::schema::PrimitiveType_FullConnection; @@ -59,6 +65,10 @@ int Conv2DOpenCLKernel::CheckSpecs() { MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; return RET_ERROR; } + if (param_->act_type_ != ActType_No && param_->act_type_ != ActType_Relu && param_->act_type_ != ActType_Relu6) { + MS_LOG(ERROR) << "Unsupported activation type " << param_->act_type_; + return RET_ERROR; + } return RET_OK; } @@ -72,9 +82,16 @@ int Conv2DOpenCLKernel::Prepare() { IH_ = input_shape[1]; IW_ = input_shape[2]; CI_ = input_shape[3]; - OH_ = output_shape[1]; - OW_ = output_shape[2]; - CO_ = output_shape[3]; + // for fusion Conv2D and Reshape(N11C->NC) + if (output_shape.size() == 2) { + OH_ = 1; + OW_ = 1; + CO_ = output_shape[1]; + } else { // output_shape.size()==4 + OH_ = output_shape[1]; + OW_ = output_shape[2]; + CO_ = output_shape[3]; + } CI_SLICES_ = UP_DIV(CI_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM); KH_ = param_->kernel_h_; @@ -91,7 +108,7 @@ int Conv2DOpenCLKernel::Prepare() { if (use_winograd_) { MS_LOG(DEBUG) << "use winograd"; std::string program_name = "winograd"; - ocl_runtime_->LoadSource(program_name, winograd_source); + ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, 
"Winograd4x4To36"); ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); @@ -100,7 +117,7 @@ int Conv2DOpenCLKernel::Prepare() { std::string program_name = "conv2d"; std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + std::to_string(block_size_.C); - ocl_runtime_->LoadSource(program_name, conv2d_source); + ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); } @@ -347,13 +364,6 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { } void Conv2DOpenCLKernel::SetConstArgs() { - auto param = reinterpret_cast(op_parameter_); - cl_int act_type = 0; - if (param->act_type_ == ActType_Relu) { - act_type = 1; - } else if (param->act_type_ == ActType_Relu6) { - act_type = 3; - } cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; @@ -380,12 +390,13 @@ void Conv2DOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, act_type); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast(param_->act_type_)); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast(alpha_)); } else { arg_cn = 2; - cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; - cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; - cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; + cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_}; + cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_}; + cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_}; ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); @@ -393,7 +404,8 @@ void Conv2DOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); - ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast(param_->act_type_)); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast(alpha_)); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h index 95bef1d513..bdc2fcdddf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h @@ -44,6 +44,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel { int Run() override; int Tune() override; + // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU + float alpha_{0.0f}; + private: void SetBlockSize(); int InitFilter(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 60bd36d7ea..953d3e4c08 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ 
@@ -22,11 +22,14 @@
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc"
 #endif
+#include "src/runtime/kernel/opencl/utils.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
 using mindspore::schema::PrimitiveType_DeConv2D;
 
 namespace mindspore::kernel {
@@ -38,6 +41,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "only support kernel - stride == 2 * pad";
     return RET_ERROR;
   }
+  if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -47,7 +54,7 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::string source = conv2d_transpose_source;
+  std::string source = GetActDefines() + conv2d_transpose_source;
   std::string program_name = "conv2d_transpose";
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
@@ -102,6 +109,7 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, static_cast<cl_int>(param->act_type_));
 }
 
 int Conv2dTransposeOpenCLKernel::InitWeights() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index 7367ce7e4e..a288e660fb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -29,48 +29,38 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::ActivationType_TANH;
 using mindspore::schema::PrimitiveType_FullConnection;
 
 namespace mindspore::kernel {
 
-int FullConnectionOpenCLKernel::Init() {
-  // deleted soon
-  return CheckSpecs();
-}
-
 int FullConnectionOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  transposeA = param->a_transpose_;
-  if (transposeA) {
+  if (param->a_transpose_) {
     MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet.";
     return RET_ERROR;
   }
-  transposeB = param->b_transpose_;
-  enable_fp16_ = ocl_runtime_->GetFp16Enable();
   if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) ||
       (out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) {
     MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4";
     return RET_ERROR;
   }
-  switch (param->act_type_) {
-    case ActType_No:
-      break;
-    case ActType_Relu:
-      activation_min_ = 0.f;
-      break;
-    case ActType_Relu6:
-      activation_min_ = 0.f;
-      activation_max_ = 6.f;
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
-      return RET_ERROR;
+  if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+    return RET_ERROR;
   }
   return RET_OK;
 }
 
 int FullConnectionOpenCLKernel::Prepare() {
-  std::string kernel_name = "FullConnection_NHWC4";
+  auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  transposeA = param->a_transpose_;
+  transposeB = param->b_transpose_;
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+
+  std::string kernel_name = "FullConnection";
   inShape = GpuTensorInfo(in_tensors_[0]);
   outShape = GpuTensorInfo(out_tensors_[0]);
 #ifdef PROGRAM_WITH_IL
@@ -78,7 +68,7 @@ int FullConnectionOpenCLKernel::Prepare() {
 #else
   std::string source = fullconnection_source;
   std::string program_name = "FullConnection";
-  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->LoadSource(program_name, GetActDefines() + source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   auto ret = InitWeights();
@@ -200,11 +190,11 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
                        static_cast<int>(inShape.C)};
   cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)};
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
+  auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
 }
 
 int FullConnectionOpenCLKernel::Run() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
index d507eecf35..275ce3f12a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -37,7 +37,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   int InitWeights() override;
   void SetConstArgs() override;
   void SetGlobalLocal() override;
-  int Init() override;
   int Tune() override { return lite::RET_OK; }
 
  private:
@@ -46,8 +45,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};
-  float activation_min_{-FLT_MAX};
-  float activation_max_{FLT_MAX};
   GpuTensorInfo inShape = GpuTensorInfo(nullptr);
   GpuTensorInfo outShape = GpuTensorInfo(nullptr);
 };
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index e79c367b85..89d892a04d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -35,6 +35,15 @@ using mindspore::schema::PrimitiveType_Scale;
 
 namespace mindspore::kernel {
 
+int ScaleOpenCLKernel::CheckSpecs() {
+  auto *param = reinterpret_cast<ScaleParameter *>(op_parameter_);
+  if (param->activation_type_ != ActType_No && param->activation_type_ != ActType_Relu &&
+      param->activation_type_ != ActType_Relu6) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 ScaleOpenCLKernel::~ScaleOpenCLKernel() {
   auto allocator = ocl_runtime_->GetAllocator();
   if (scale_ptr_ != nullptr) {
@@ -185,7 +194,7 @@ int ScaleOpenCLKernel::Init() {
     kernel_name += "_BUF";
   }
   std::string program_name = "Scale";
-  std::string source = scale_source;
+  std::string source = GetActDefines() + scale_source;
   ocl_runtime_->LoadSource(program_name, source);
   error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
@@ -202,13 +211,6 @@ int ScaleOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   auto *param = reinterpret_cast<ScaleParameter *>(op_parameter_);
-  cl_int act_type = 0;
-  if (param->activation_type_ == ActType_Relu) {
-    act_type = 1;
-  } else if (param->activation_type_ == ActType_Relu6) {
-    act_type = 3;
-  }
-
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   if (weight_vector_flag_) {
@@ -242,7 +244,7 @@ int ScaleOpenCLKernel::Run() {
       ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
     }
   }
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
   ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
index e214fabb37..958d2489dc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
@@ -30,6 +30,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~ScaleOpenCLKernel() override;
 
+  int CheckSpecs() override;
   int Init() override;
   int Run() override;
   int InitWeights() override;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
new file mode 100644
index 0000000000..0488a92f59
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/opencl/opencl_subgraph.h"
+
+namespace mindspore::kernel {
+
+void OpenCLSubGraph::Fusion() {}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index e79f689d27..481968f1e3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -222,6 +222,7 @@ class OpenCLKernel : public LiteKernel {
 
   lite::opencl::MemType GetMemType() { return out_mem_type_; }
   void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
+  OpParameter *GetParameter() { return op_parameter_; }
 
   virtual std::vector<BaseTuningParameter> GenerateTuningParam() {
     size_t ndim = global_size_.size();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 8668e14790..095fc45b47 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -352,7 +352,7 @@ int OpenCLSubGraph::Prepare() {
     MS_LOG(ERROR) << "Create OpenCLExecutor fail";
     return RET_ERROR;
   }
-
+  Fusion();
   auto ret = Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "OpenCL subgraph init fail";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index d14e5dbd12..88a7f3781b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -78,6 +78,9 @@ class OpenCLSubGraph : public SubGraphKernel {
   std::set<LiteKernel *> nodes_set_;
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
   lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
+
+ private:
+  void Fusion();
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index 0d8f313b35..185461c6db 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -25,6 +25,11 @@
 
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_LEAKY_RELU;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::ActivationType_SIGMOID;
+using mindspore::schema::ActivationType_TANH;
 
 namespace mindspore::lite {
 kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -40,6 +45,15 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
 
 namespace mindspore::kernel {
 
+std::string GetActDefines() {
+  static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) +
+                                   "\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) +
+                                   "\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) +
+                                   "\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) +
+                                   "\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n";
+  return act_defines;
+}
+
 int GetUpPow2(int n) {
   int i = 0;
   int j = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index 5e71f8bb1d..0971b88913 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -34,6 +34,8 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
 
 namespace mindspore::kernel {
 
+std::string GetActDefines();
+
 int GetUpPow2(int n);
 
 int GetMaxDivisor(int x, int divisor);
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 571c0cd046..82b989a1f8 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -84,6 +84,7 @@ if (SUPPORT_GPU)
         ${KERNEL_OP_SRC}
         ${GPU_KERNEL_OP_SRC}
         ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc
+        ${LITE_DIR}/src/runtime/kernel/opencl/opencl_fusion.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc
     )
 endif()
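
[Editor's note -- illustration, not part of the original patch]
The recurring "GetActDefines() + <kernel>_source" pattern above works because the
schema::ActivationType values are prepended to every OpenCL program as #define lines,
so a .cl kernel can compare the act_type argument set in SetConstArgs() against the
same numeric values the C++ side uses (RELU is 1 and RELU6 is 3, which is why the
hand-written act_type remapping could be deleted from conv2d.cc and scale.cc). Below
is a minimal sketch of how a kernel built this way might branch on act_type; the
helper name DoActivation and the FLT4 fallback are assumptions made for this example,
not code from the patch.

// Illustrative sketch only -- NOT from the patch.
#ifndef FLT4
#define FLT4 float4  // the runtime normally defines FLT4 (half4 when fp16 is enabled)
#endif

// act_type arrives via SetKernelArg(..., static_cast<cl_int>(param_->act_type_));
// alpha would be the scalar PReLU weight that a Conv2D+PReLU fusion stores in alpha_.
FLT4 DoActivation(FLT4 v, int act_type, float alpha) {
  if (act_type == ActivationType_RELU) {
    return max(v, (FLT4)(0.0f));                            // relu(x) = max(x, 0)
  } else if (act_type == ActivationType_RELU6) {
    return clamp(v, (FLT4)(0.0f), (FLT4)(6.0f));            // relu6(x) = min(max(x, 0), 6)
  } else if (act_type == ActivationType_LEAKY_RELU) {
    return select(v * (FLT4)(alpha), v, v > (FLT4)(0.0f));  // keep x where x > 0, else alpha * x
  }
  return v;  // ActivationType_NO_ACTIVATION
}

OpenCLSubGraph::Fusion() itself is still an empty stub in this patch; per the comment
added in conv2d.h, a follow-up is expected to fold a scalar-weight PReLU into Conv2D by
setting act_type_ to ActivationType_LEAKY_RELU and filling in alpha_.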