Browse Source

!9184 【MSLITE】opencl support runtime fusion

From: @wangdongxu6
Reviewed-by: @ddwsky
Signed-off-by: @ddwsky
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
e313ab5313
23 changed files with 446 additions and 387 deletions
  1. +1
    -0
      mindspore/lite/src/CMakeLists.txt
  2. +70
    -126
      mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl
  3. +160
    -91
      mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
  4. +14
    -1
      mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl
  5. +12
    -4
      mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl
  6. +8
    -12
      mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl
  7. +20
    -12
      mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
  8. +44
    -84
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
  9. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h
  10. +29
    -17
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
  11. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
  12. +9
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc
  13. +16
    -26
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
  14. +0
    -3
      mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
  15. +11
    -9
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
  16. +1
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
  17. +23
    -0
      mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
  18. +1
    -0
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  19. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
  20. +3
    -0
      mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
  21. +14
    -0
      mindspore/lite/src/runtime/kernel/opencl/utils.cc
  22. +2
    -0
      mindspore/lite/src/runtime/kernel/opencl/utils.h
  23. +1
    -0
      mindspore/lite/test/CMakeLists.txt

+ 1
- 0
mindspore/lite/src/CMakeLists.txt View File

@@ -42,6 +42,7 @@ if (SUPPORT_GPU)
set(LITE_SRC set(LITE_SRC
${LITE_SRC} ${LITE_SRC}
${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_fusion.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc
${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc


+ 70
- 126
mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl View File

@@ -2,8 +2,8 @@
#define divide_no_check(a, b) (a / b) #define divide_no_check(a, b) (a / b)
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;


__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -17,8 +17,8 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -32,8 +32,8 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -47,8 +47,8 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -62,8 +62,8 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementLogicalAnd(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -77,8 +77,8 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void ElementLogicalOr(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -92,7 +92,7 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__kernel void ElementMaximum(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
@@ -107,7 +107,7 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__kernel void ElementMinimum(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
@@ -122,9 +122,8 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementFloorDiv(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -138,9 +137,8 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementFloorMod(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -154,9 +152,9 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementSquaredDifference(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -170,8 +168,8 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -186,9 +184,8 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementNotEqual(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -202,8 +199,8 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementLess(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -217,9 +214,8 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementLessEqual(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -233,8 +229,8 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
__kernel void ElementGreater(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -248,9 +244,9 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
__kernel void ElementGreaterEqual(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int2 output_shape, float act_min,
float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -264,9 +260,9 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
__kernel void BroadcastNHWC4Add(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
int X = get_global_id(0); // C4 int X = get_global_id(0); // C4
int Y = get_global_id(1); // W int Y = get_global_id(1); // W
int Z = get_global_id(2); // H int Z = get_global_id(2); // H
@@ -293,9 +289,9 @@ __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only i
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result);
} }


__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
__kernel void BroadcastNHWC4Sub(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
int X = get_global_id(0); // C4 int X = get_global_id(0); // C4
int Y = get_global_id(1); // W int Y = get_global_id(1); // W
int Z = get_global_id(2); // H int Z = get_global_id(2); // H
@@ -322,9 +318,9 @@ __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only i
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result);
} }


__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
__kernel void BroadcastNHWC4Mul(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
int X = get_global_id(0); // C4 int X = get_global_id(0); // C4
int Y = get_global_id(1); // W int Y = get_global_id(1); // W
int Z = get_global_id(2); // H int Z = get_global_id(2); // H
@@ -351,9 +347,9 @@ __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only i
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result);
} }


__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
__kernel void BroadcastNHWC4Div(__read_only image2d_t input_a, __read_only image2d_t input_b,
__write_only image2d_t output, const int4 a_shape, const int4 b_shape,
const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) {
int X = get_global_id(0); // C4 int X = get_global_id(0); // C4
int Y = get_global_id(1); // W int Y = get_global_id(1); // W
int Z = get_global_id(2); // H int Z = get_global_id(2); // H
@@ -380,8 +376,8 @@ __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only i
WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result);
} }


__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastLogicalAnd(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -394,8 +390,8 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastLogicalOr(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -408,7 +404,7 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
__kernel void BroadcastMaximum(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) { const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
@@ -422,7 +418,7 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
__kernel void BroadcastMinimum(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) { const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
@@ -436,8 +432,8 @@ __kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastFloorDiv(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -450,8 +446,8 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastFloorMod(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -464,8 +460,8 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastSquaredDifference(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -478,8 +474,8 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -492,8 +488,8 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastNotEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -506,8 +502,8 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastLess(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -520,8 +516,8 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastLessEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -534,8 +530,8 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastGreater(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -548,8 +544,8 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }


__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
__kernel void BroadcastGreaterEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output,
const int2 output_shape, float act_min, float act_max) {
int X = get_global_id(0); int X = get_global_id(0);
int Y = get_global_id(1); int Y = get_global_id(1);
if (X >= output_shape.x || Y >= output_shape.y) { if (X >= output_shape.x || Y >= output_shape.y) {
@@ -561,55 +557,3 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b,
result = clamp(result, (FLT)(act_min), (FLT)(act_max)); result = clamp(result, (FLT)(act_min), (FLT)(act_max));
WRITE_IMAGE(output, (int2)(X, Y), result); WRITE_IMAGE(output, (int2)(X, Y), result);
} }

// Element-wise addition over flat float buffers: output[i] = input_a[i] + input_b[i] for i in [0, n).
__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output,
                             const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] + input_b[gid];
  }
}

// Element-wise subtraction over flat float buffers: output[i] = input_a[i] - input_b[i] for i in [0, n).
__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output,
                             const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] - input_b[gid];
  }
}

// Element-wise multiplication over flat float buffers: output[i] = input_a[i] * input_b[i] for i in [0, n).
__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output,
                             const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] * input_b[gid];
  }
}

// Element-wise division over flat float buffers: output[i] = input_a[i] / input_b[i] for i in [0, n).
// Uses the file's divide_no_check macro (no zero-divisor guard), consistent with BroadcastDiv_BUF.
__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output,
                             const unsigned int n) {
  int idx = get_global_id(0);
  if (idx >= n) return;
  // Fix: the original body multiplied (copy-paste from ElementMul_BUF) instead of dividing.
  output[idx] = divide_no_check(input_a[idx], input_b[idx]);
}

// Broadcast-add a scalar to a flat float buffer: output[i] = input_a[i] + b for i in [0, n).
__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] + (FLT)b;
  }
}

// Broadcast-subtract a scalar from a flat float buffer: output[i] = input_a[i] - b for i in [0, n).
__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] - (FLT)b;
  }
}

// Broadcast-multiply a flat float buffer by a scalar: output[i] = input_a[i] * b for i in [0, n).
__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = input_a[gid] * (FLT)b;
  }
}

// Broadcast-divide a flat float buffer by a scalar via divide_no_check (no zero guard):
// output[i] = input_a[i] / b for i in [0, n).
__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) {
  const int gid = get_global_id(0);
  if (gid < n) {
    output[gid] = divide_no_check(input_a[gid], (FLT)b);
  }
}

+ 160
- 91
mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl View File

@@ -6,51 +6,59 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP
#define MAX_IMAGE2D_SIZE 65535 #define MAX_IMAGE2D_SIZE 65535
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))


#define ActType_Relu 1
#define ActType_Relu6 3

#define DEFINE_ARGS \
const int N = input_shape.x; \
const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \
const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \
const int KH = kernel_stride.x, KW = kernel_stride.y; \
const int strideH = kernel_stride.z, strideW = kernel_stride.w; \
const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \
const int dilationH = dilation.x, dilationW = dilation.y; \
\
const int n_oh = get_global_id(0); \
const int ow = get_global_id(1) * BlockW; \
const int co_slice = get_global_id(2) * BlockC; \
const int OH_SLICES = UP_DIV(OH, BlockH); \
const int n = n_oh / OH_SLICES; \
const int oh = (n_oh % OH_SLICES) * BlockH; \
if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \
return; \
#define DEFINE_ARGS \
int N = input_shape.x; \
int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \
int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \
int KH = kernel_stride.x, KW = kernel_stride.y; \
int strideH = kernel_stride.z, strideW = kernel_stride.w; \
int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \
int dilationH = dilation.x, dilationW = dilation.y; \
\
int n_oh = get_global_id(0); \
int ow = get_global_id(1) * BlockW; \
int co_slice = get_global_id(2) * BlockC; \
int OH_SLICES = UP_DIV(OH, BlockH); \
int n = n_oh / OH_SLICES; \
int oh = (n_oh % OH_SLICES) * BlockH; \
if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \
return; \
} }


#define DO_TANH(data) \
exp0 = exp(data); \
exp1 = exp(-data); \
data = (exp0 - exp1) / (exp0 + exp1);

#define DO_LEAKY_RELU(data) \
if (data.x < 0) data.x *= alpha; \
if (data.y < 0) data.y *= alpha; \
if (data.z < 0) data.z *= alpha; \
if (data.w < 0) data.w *= alpha;

__kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 1;
const int BlockW = 1;
const int BlockC = 1;
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
int BlockH = 1;
int BlockW = 1;
int BlockC = 1;
DEFINE_ARGS; DEFINE_ARGS;


const int oh0 = oh + 0;
const int n_oh0 = n * OH + oh0;
const int ow0 = ow + 0;
const int co_slice0 = co_slice + 0;
int oh0 = oh + 0;
int n_oh0 = n * OH + oh0;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;


FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);


__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;


for (int kh = 0; kh < KH; ++kh) { for (int kh = 0; kh < KH; ++kh) {
const int ih0 = kh * dilationH + oh0 * strideH - padTop;
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int ih0 = kh * dilationH + oh0 * strideH - padTop;
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;


for (int kw = 0; kw < KW; ++kw) { for (int kw = 0; kw < KW; ++kw) {
const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES; int x_idx0 = iw0 * CI_SLICES;


for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
@@ -71,10 +79,17 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
out_h0_w0_c0 += bias[co_slice0]; out_h0_w0_c0 += bias[co_slice0];
} }


if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
} }


if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
@@ -85,19 +100,19 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
} }


__kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 1;
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
int BlockH = 2;
int BlockW = 1;
int BlockC = 1;
DEFINE_ARGS; DEFINE_ARGS;


const int oh0 = oh + 0;
const int oh1 = oh + 1;
const int n_oh0 = n * OH + oh0;
const int n_oh1 = n * OH + oh1;
const int ow0 = ow + 0;
const int co_slice0 = co_slice + 0;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;


FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
@@ -105,15 +120,15 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;


for (int kh = 0; kh < KH; ++kh) { for (int kh = 0; kh < KH; ++kh) {
const int ih0 = kh * dilationH + oh0 * strideH - padTop;
int ih0 = kh * dilationH + oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH) // no need to check oh1, finally write out will check (oh1 < OH)
const int ih1 = kh * dilationH + oh1 * strideH - padTop;
int ih1 = kh * dilationH + oh1 * strideH - padTop;
// check ih0 and ih1 // check ih0 and ih1
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;


for (int kw = 0; kw < KW; ++kw) { for (int kw = 0; kw < KW; ++kw) {
const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES; int x_idx0 = iw0 * CI_SLICES;


for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
@@ -140,12 +155,22 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
out_h1_w0_c0 += bias[co_slice0]; out_h1_w0_c0 += bias[co_slice0];
} }


if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h1_w0_c0);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0);
DO_LEAKY_RELU(out_h1_w0_c0);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
} }


if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
@@ -162,20 +187,20 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
} }


__kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 1;
const int BlockC = 2;
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
int BlockH = 2;
int BlockW = 1;
int BlockC = 2;
DEFINE_ARGS; DEFINE_ARGS;


const int oh0 = oh + 0;
const int oh1 = oh + 1;
const int n_oh0 = n * OH + oh0;
const int n_oh1 = n * OH + oh1;
const int ow0 = ow + 0;
const int co_slice0 = co_slice + 0;
const int co_slice1 = co_slice + 1;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int co_slice0 = co_slice + 0;
int co_slice1 = co_slice + 1;


FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
@@ -185,15 +210,15 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;


for (int kh = 0; kh < KH; ++kh) { for (int kh = 0; kh < KH; ++kh) {
const int ih0 = kh * dilationH + oh0 * strideH - padTop;
int ih0 = kh * dilationH + oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH) // no need to check oh1, finally write out will check (oh1 < OH)
const int ih1 = kh * dilationH + oh1 * strideH - padTop;
int ih1 = kh * dilationH + oh1 * strideH - padTop;
// check ih0 and ih1 // check ih0 and ih1
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;


for (int kw = 0; kw < KW; ++kw) { for (int kw = 0; kw < KW; ++kw) {
const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int x_idx0 = iw0 * CI_SLICES; int x_idx0 = iw0 * CI_SLICES;


for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) {
@@ -231,16 +256,32 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
out_h1_w0_c1 += bias[co_slice1]; out_h1_w0_c1 += bias[co_slice1];
} }


if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f));
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h0_w0_c1);
DO_TANH(out_h1_w0_c1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0);
DO_LEAKY_RELU(out_h1_w0_c0);
DO_LEAKY_RELU(out_h0_w0_c1);
DO_LEAKY_RELU(out_h1_w0_c1);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
} }


if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {
@@ -269,21 +310,21 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
} }


__kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight,
__global FLT4 *bias, const int4 input_shape, const int4 output_shape,
const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) {
const int BlockH = 2;
const int BlockW = 2;
const int BlockC = 2;
__global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad,
int2 dilation, int act_type, float alpha) {
int BlockH = 2;
int BlockW = 2;
int BlockC = 2;
DEFINE_ARGS; DEFINE_ARGS;


const int oh0 = oh + 0;
const int oh1 = oh + 1;
const int n_oh0 = n * OH + oh0;
const int n_oh1 = n * OH + oh1;
const int ow0 = ow + 0;
const int ow1 = ow + 1;
const int co_slice0 = co_slice + 0;
const int co_slice1 = co_slice + 1;
int oh0 = oh + 0;
int oh1 = oh + 1;
int n_oh0 = n * OH + oh0;
int n_oh1 = n * OH + oh1;
int ow0 = ow + 0;
int ow1 = ow + 1;
int co_slice0 = co_slice + 0;
int co_slice1 = co_slice + 1;


FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);
@@ -297,15 +338,15 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
__global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE;


for (int kh = 0; kh < KH; ++kh) { for (int kh = 0; kh < KH; ++kh) {
const int ih0 = kh * dilationH + oh0 * strideH - padTop;
int ih0 = kh * dilationH + oh0 * strideH - padTop;
// no need to check oh1, finally write out will check (oh1 < OH) // no need to check oh1, finally write out will check (oh1 < OH)
const int ih1 = kh * dilationH + oh1 * strideH - padTop;
int ih1 = kh * dilationH + oh1 * strideH - padTop;
// check ih0 and ih1 // check ih0 and ih1
const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;
int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1;
int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1;


for (int kw = 0; kw < KW; ++kw) { for (int kw = 0; kw < KW; ++kw) {
const int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw0 = kw * dilationW + ow0 * strideW - padLeft;
int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2;
int x_idx0 = iw0 * CI_SLICES; int x_idx0 = iw0 * CI_SLICES;
int x_idx1 = iw1 * CI_SLICES; int x_idx1 = iw1 * CI_SLICES;
@@ -368,7 +409,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
out_h1_w1_c1 += bias[co_slice1]; out_h1_w1_c1 += bias[co_slice1];
} }


if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f));
out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f));
out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f));
@@ -377,7 +418,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f));
out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f));
out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f));
@@ -386,6 +427,34 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f));
out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0, exp1;
DO_TANH(out_h0_w0_c0);
DO_TANH(out_h0_w1_c0);
DO_TANH(out_h1_w0_c0);
DO_TANH(out_h1_w1_c0);
DO_TANH(out_h0_w0_c1);
DO_TANH(out_h0_w1_c1);
DO_TANH(out_h1_w0_c1);
DO_TANH(out_h1_w1_c1);
} else if (act_type == ActivationType_LEAKY_RELU) {
DO_LEAKY_RELU(out_h0_w0_c0);
DO_LEAKY_RELU(out_h0_w1_c0);
DO_LEAKY_RELU(out_h1_w0_c0);
DO_LEAKY_RELU(out_h1_w1_c0);
DO_LEAKY_RELU(out_h0_w0_c1);
DO_LEAKY_RELU(out_h0_w1_c1);
DO_LEAKY_RELU(out_h1_w0_c1);
DO_LEAKY_RELU(out_h1_w1_c1);
} else if (act_type == ActivationType_SIGMOID) {
out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0));
out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0));
out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0));
out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0));
out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1));
out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1));
out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1));
out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1));
} }


if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) {


+ 14
- 1
mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl View File

@@ -2,7 +2,7 @@
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data,
__global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size,
int2 stride, int2 padding, int4 src_size, int4 dst_size) {
int2 stride, int2 padding, int4 src_size, int4 dst_size, int act_type) {
int dst_h = get_global_id(0); int dst_h = get_global_id(0);
int rem_h = dst_h % stride.x; int rem_h = dst_h % stride.x;
int ceil_h = dst_h / stride.x; int ceil_h = dst_h / stride.x;
@@ -70,6 +70,19 @@ __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_onl
r1 += bias_val; r1 += bias_val;
r2 += bias_val; r2 += bias_val;
r3 += bias_val; r3 += bias_val;

if (act_type == ActivationType_RELU) {
r0 = max(r0, (FLT4)(0.0f));
r1 = max(r1, (FLT4)(0.0f));
r2 = max(r2, (FLT4)(0.0f));
r3 = max(r3, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
r0 = clamp(r0, (FLT4)(0.0f), (FLT4)(6.0f));
r1 = clamp(r1, (FLT4)(0.0f), (FLT4)(6.0f));
r2 = clamp(r2, (FLT4)(0.0f), (FLT4)(6.0f));
r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f));
}

WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0);
if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) {
WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1);


+ 12
- 4
mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl View File

@@ -2,9 +2,9 @@
#define C4NUM 4 #define C4NUM 4
#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
__read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min,
float act_max) {
__kernel void FullConnection(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
__read_only image2d_t bias, int4 in_shape, int2 out_shape, int act_type) {
int gidx = get_global_id(0); // CO4 int gidx = get_global_id(0); // CO4
int gidz = get_global_id(2); // N int gidz = get_global_id(2); // N
int lidx = get_local_id(0); int lidx = get_local_id(0);
@@ -34,7 +34,15 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only ima
result += temp[lidx][2]; result += temp[lidx][2];
result += temp[lidx][3]; result += temp[lidx][3];
result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0));
result = clamp(result, (FLT)(act_min), (FLT)(act_max));
if (act_type == ActivationType_RELU) {
result = max(result, (FLT4)(0.0f));
} else if (act_type == ActivationType_RELU6) {
result = clamp(result, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0 = exp(result);
FLT4 exp1 = exp(-result);
result = (exp0 - exp1) / (exp0 + exp1);
}
WRITE_IMAGE(output, (int2)(gidx, gidz), result); WRITE_IMAGE(output, (int2)(gidx, gidz), result);
} }
} }

+ 8
- 12
mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl View File

@@ -1,10 +1,6 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable #pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;


#define ActType_No 0
#define ActType_Relu 1
#define ActType_Sigmod 2
#define ActType_Relu6 3
#define C4NUM 4 #define C4NUM 4


__kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset,
@@ -19,9 +15,9 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale
FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y));
FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y));
FLT4 out = in * s + o; FLT4 out = in * s + o;
if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out = max(out, (FLT4)(0.0f)); out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
} }
WRITE_IMAGE(output, (int2)(X, Y), out); WRITE_IMAGE(output, (int2)(X, Y), out);
@@ -37,9 +33,9 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float


FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y));
FLT4 out = in * (FLT)scale + (FLT)offset; FLT4 out = in * (FLT)scale + (FLT)offset;
if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out = max(out, (FLT4)(0.0f)); out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
} }
WRITE_IMAGE(output, (int2)(X, Y), out); WRITE_IMAGE(output, (int2)(X, Y), out);
@@ -57,9 +53,9 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca
FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0));
FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0));
FLT4 out = in * s + o; FLT4 out = in * s + o;
if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out = max(out, (FLT4)(0.0f)); out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
} }
WRITE_IMAGE(output, (int2)(X, Y), out); WRITE_IMAGE(output, (int2)(X, Y), out);
@@ -94,9 +90,9 @@ __kernel void Scale_H_IMG(__read_only image2d_t input, __read_only image2d_t sca
o_real = o.w; o_real = o.w;
} }
FLT4 out = in * s_real + o_real; FLT4 out = in * s_real + o_real;
if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
out = max(out, (FLT4)(0.0f)); out = max(out, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f));
} }
WRITE_IMAGE(output, (int2)(X, Y), out); WRITE_IMAGE(output, (int2)(X, Y), out);


+ 20
- 12
mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl View File

@@ -4,9 +4,6 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP


#define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) #define UP_DIV(x, y) (((x) + (y) - (1)) / (y))


#define ActType_Relu 1
#define ActType_Relu6 3

constant FLT Bt[36] = { constant FLT Bt[36] = {
1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,
0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,
@@ -17,8 +14,8 @@ constant FLT Bt[36] = {
}; };


__kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output,
const int4 input_shape, // N H W CI_SLICES
const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES
int4 input_shape, // N H W CI_SLICES
int4 output_shape) { // N 36 H/4*W/4 CI_SLICES
#define PAD 1 #define PAD 1
int tile_xy = get_global_id(0); int tile_xy = get_global_id(0);
int row = get_global_id(1); int row = get_global_id(1);
@@ -63,8 +60,8 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_
} }


__kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, __kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight,
const int4 input_shape, // N 36 H/4*W/4 CI_SLICES
const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES
int4 input_shape, // N 36 H/4*W/4 CI_SLICES
int4 output_shape) { // N 36 H/4*W/4 CO_SLICES
#define H 36 #define H 36
int w = get_global_id(0) * 2; int w = get_global_id(0) * 2;
int h = get_global_id(1); int h = get_global_id(1);
@@ -134,9 +131,9 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000
0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f};


__kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias,
const int4 input_shape, // N 36 H/4*W/4 CO_SLICES
const int4 output_shape, // N H W CO_SLICES
const int act_type) {
int4 input_shape, // N 36 H/4*W/4 CO_SLICES
int4 output_shape, // N H W CO_SLICES
int act_type, float alpha) {
int tile_xy = get_global_id(0); int tile_xy = get_global_id(0);
int row = get_global_id(1); int row = get_global_id(1);
int slice = get_global_id(2); int slice = get_global_id(2);
@@ -175,10 +172,21 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
acc += bias[slice]; acc += bias[slice];
} }


if (act_type == ActType_Relu) {
if (act_type == ActivationType_RELU) {
acc = max(acc, (FLT4)(0.0f)); acc = max(acc, (FLT4)(0.0f));
} else if (act_type == ActType_Relu6) {
} else if (act_type == ActivationType_RELU6) {
acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f)); acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
} else if (act_type == ActivationType_TANH) {
FLT4 exp0 = exp(acc);
FLT4 exp1 = exp(-acc);
acc = (exp0 - exp1) / (exp0 + exp1);
} else if (act_type == ActivationType_LEAKY_RELU) {
if (acc.x < 0) acc.x *= alpha;
if (acc.y < 0) acc.y *= alpha;
if (acc.z < 0) acc.z *= alpha;
if (acc.w < 0) acc.w *= alpha;
} else if (act_type == ActivationType_SIGMOID) {
acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
} }


WRITE_IMAGE(output, (int2)(x_idx, oh), acc); WRITE_IMAGE(output, (int2)(x_idx, oh), acc);


+ 44
- 84
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc View File

@@ -31,93 +31,46 @@ using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR; using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK; using mindspore::lite::RET_OK;
using mindspore::lite::opencl::MemType; using mindspore::lite::opencl::MemType;
using mindspore::schema::ActivationType_NO_ACTIVATION;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::PrimitiveType_Eltwise; using mindspore::schema::PrimitiveType_Eltwise;


namespace mindspore::kernel { namespace mindspore::kernel {


std::set<schema::PrimitiveType> SupportedOpenCLArithmetics = {PrimitiveType_Mul,
PrimitiveType_Add,
PrimitiveType_Sub,
PrimitiveType_Div,
PrimitiveType_LogicalAnd,
PrimitiveType_LogicalOr,
PrimitiveType_Maximum,
PrimitiveType_Minimum,
PrimitiveType_FloorDiv,
PrimitiveType_FloorMod,
PrimitiveType_SquaredDifference,
PrimitiveType_Equal,
PrimitiveType_NotEqual,
PrimitiveType_Less,
PrimitiveType_LessEqual,
PrimitiveType_Greater,
PrimitiveType_GreaterEqual,
PrimitiveType_Eltwise};

int ArithmeticOpenCLKernel::CheckSpecs() { int ArithmeticOpenCLKernel::CheckSpecs() {
auto *arithmetic_parameter = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
if (arithmetic_parameter->broadcasting_) {
element_flag_ = false;
kernel_name_ = "BroadcastNHWC4";
if (out_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Broadcasting don't support N > 1";
return RET_ERROR;
}
} else {
kernel_name_ = "Element";
auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
if (param->broadcasting_ && out_tensors_[0]->shape()[0] > 1) {
MS_LOG(ERROR) << "Broadcasting don't support N > 1";
return RET_ERROR;
} }

switch (op_parameter_->type_) {
case PrimitiveType_Mul:
kernel_name_ += "Mul";
break;
case PrimitiveType_Add:
kernel_name_ += "Add";
break;
case PrimitiveType_Sub:
kernel_name_ += "Sub";
break;
case PrimitiveType_Div:
kernel_name_ += "Div";
break;
case PrimitiveType_LogicalAnd:
kernel_name_ += "And";
break;
case PrimitiveType_LogicalOr:
kernel_name_ += "Or";
break;
case PrimitiveType_Maximum:
kernel_name_ += "Max";
break;
case PrimitiveType_Minimum:
kernel_name_ += "Min";
break;
case PrimitiveType_FloorDiv:
kernel_name_ += "FloorDiv";
break;
case PrimitiveType_FloorMod:
kernel_name_ += "FloorMod";
break;
case PrimitiveType_SquaredDifference:
kernel_name_ += "SquaredDifference";
break;
case PrimitiveType_Equal:
kernel_name_ += "Equal";
break;
case PrimitiveType_NotEqual:
kernel_name_ += "NotEqual";
break;
case PrimitiveType_Less:
kernel_name_ += "Less";
break;
case PrimitiveType_LessEqual:
kernel_name_ += "LessEqual";
break;
case PrimitiveType_Greater:
kernel_name_ += "Greater";
break;
case PrimitiveType_GreaterEqual:
kernel_name_ += "GreaterEqual";
break;
default:
MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
return RET_ERROR;
if (SupportedOpenCLArithmetics.count(static_cast<schema::PrimitiveType>(op_parameter_->type_)) == 0) {
MS_LOG(ERROR) << "UnSupported Operator: " << schema::EnumNamesPrimitiveType()[op_parameter_->type_];
return RET_ERROR;
} }

switch (arithmetic_parameter->activation_type_) {
case schema::ActivationType_NO_ACTIVATION:
break;
case schema::ActivationType_RELU:
activation_min_ = 0.f;
break;
case schema::ActivationType_RELU6:
activation_min_ = 0.f;
activation_max_ = 6.f;
break;
default:
MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
return RET_ERROR;
if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU ||
param->activation_type_ == ActivationType_RELU6)) {
MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_;
return RET_ERROR;
} }
return RET_OK; return RET_OK;
} }
@@ -240,11 +193,18 @@ int ArithmeticOpenCLKernel::Prepare() {
#ifdef PROGRAM_WITH_IL #ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_); kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_);
#else #else
if (out_mem_type_ == MemType::IMG) {
kernel_name_ += "_IMG";
} else {
kernel_name_ += "_BUF";

auto *param = reinterpret_cast<const ArithmeticParameter *>(op_parameter_);
element_flag_ = !param->broadcasting_;
kernel_name_ = param->broadcasting_ ? "BroadcastNHWC4" : "Element";
kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_];
if (param->activation_type_ == ActivationType_RELU) {
activation_min_ = 0.f;
} else if (param->activation_type_ == ActivationType_RELU6) {
activation_min_ = 0.f;
activation_max_ = 6.f;
} }

std::string program_name = "Arithmetic"; std::string program_name = "Arithmetic";
std::string source = arithmetic_source; std::string source = arithmetic_source;
ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->LoadSource(program_name, source);


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h View File

@@ -18,12 +18,15 @@
#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_


#include <vector> #include <vector>
#include <set>
#include <string> #include <string>
#include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h"
#include "src/runtime/kernel/opencl/opencl_kernel.h" #include "src/runtime/kernel/opencl/opencl_kernel.h"


namespace mindspore::kernel { namespace mindspore::kernel {


extern std::set<schema::PrimitiveType> SupportedOpenCLArithmetics;

class ArithmeticOpenCLKernel : public OpenCLKernel { class ArithmeticOpenCLKernel : public OpenCLKernel {
public: public:
ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs,


+ 29
- 17
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc View File

@@ -23,6 +23,7 @@
#include "src/runtime/kernel/opencl/utils.h" #include "src/runtime/kernel/opencl/utils.h"
#include "src/kernel_registry.h" #include "src/kernel_registry.h"
#include "include/errorcode.h" #include "include/errorcode.h"
#include "schema/ops_generated.h"
#include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc"
#include "src/runtime/kernel/opencl/cl/winograd.cl.inc" #include "src/runtime/kernel/opencl/cl/winograd.cl.inc"


@@ -30,6 +31,11 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR; using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK; using mindspore::lite::RET_OK;
using mindspore::schema::ActivationType_LEAKY_RELU;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::ActivationType_SIGMOID;
using mindspore::schema::ActivationType_TANH;
using mindspore::schema::PrimitiveType_Conv2D; using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::PrimitiveType_FullConnection; using mindspore::schema::PrimitiveType_FullConnection;


@@ -59,6 +65,10 @@ int Conv2DOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D.";
return RET_ERROR; return RET_ERROR;
} }
if (param_->act_type_ != ActType_No && param_->act_type_ != ActType_Relu && param_->act_type_ != ActType_Relu6) {
MS_LOG(ERROR) << "Unsupported activation type " << param_->act_type_;
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }


@@ -72,9 +82,16 @@ int Conv2DOpenCLKernel::Prepare() {
IH_ = input_shape[1]; IH_ = input_shape[1];
IW_ = input_shape[2]; IW_ = input_shape[2];
CI_ = input_shape[3]; CI_ = input_shape[3];
OH_ = output_shape[1];
OW_ = output_shape[2];
CO_ = output_shape[3];
// for fusion Conv2D and Reshape(N11C->NC)
if (output_shape.size() == 2) {
OH_ = 1;
OW_ = 1;
CO_ = output_shape[1];
} else { // output_shape.size()==4
OH_ = output_shape[1];
OW_ = output_shape[2];
CO_ = output_shape[3];
}
CI_SLICES_ = UP_DIV(CI_, C4NUM); CI_SLICES_ = UP_DIV(CI_, C4NUM);
CO_SLICES_ = UP_DIV(CO_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM);
KH_ = param_->kernel_h_; KH_ = param_->kernel_h_;
@@ -91,7 +108,7 @@ int Conv2DOpenCLKernel::Prepare() {
if (use_winograd_) { if (use_winograd_) {
MS_LOG(DEBUG) << "use winograd"; MS_LOG(DEBUG) << "use winograd";
std::string program_name = "winograd"; std::string program_name = "winograd";
ocl_runtime_->LoadSource(program_name, winograd_source);
ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source);
ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36");
ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution");
ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4");
@@ -100,7 +117,7 @@ int Conv2DOpenCLKernel::Prepare() {
std::string program_name = "conv2d"; std::string program_name = "conv2d";
std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" +
std::to_string(block_size_.C); std::to_string(block_size_.C);
ocl_runtime_->LoadSource(program_name, conv2d_source);
ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
} }


@@ -347,13 +364,6 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
} }


void Conv2DOpenCLKernel::SetConstArgs() { void Conv2DOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
cl_int act_type = 0;
if (param->act_type_ == ActType_Relu) {
act_type = 1;
} else if (param->act_type_ == ActType_Relu6) {
act_type = 3;
}
cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};


@@ -380,12 +390,13 @@ void Conv2DOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, act_type);
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast<cl_int>(param_->act_type_));
ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast<cl_float>(alpha_));
} else { } else {
arg_cn = 2; arg_cn = 2;
cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_};
cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_};
cl_int2 dilation = {param->dilation_h_, param->dilation_w_};
cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_};
cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_};
cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_};
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape);
@@ -393,7 +404,8 @@ void Conv2DOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation);
ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast<cl_int>(param_->act_type_));
ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(alpha_));
} }
} }




+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h View File

@@ -44,6 +44,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
int Run() override; int Run() override;
int Tune() override; int Tune() override;


// for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU
float alpha_{0.0f};

private: private:
void SetBlockSize(); void SetBlockSize();
int InitFilter(); int InitFilter();


+ 9
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc View File

@@ -22,11 +22,14 @@
#ifndef PROGRAM_WITH_IL #ifndef PROGRAM_WITH_IL
#include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc" #include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc"
#endif #endif
#include "src/runtime/kernel/opencl/utils.h"


using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR; using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK; using mindspore::lite::RET_OK;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::PrimitiveType_DeConv2D; using mindspore::schema::PrimitiveType_DeConv2D;


namespace mindspore::kernel { namespace mindspore::kernel {
@@ -38,6 +41,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
MS_LOG(ERROR) << "only support kernel - stride == 2 * pad"; MS_LOG(ERROR) << "only support kernel - stride == 2 * pad";
return RET_ERROR; return RET_ERROR;
} }
if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR;
}
return RET_OK; return RET_OK;
} }


@@ -47,7 +54,7 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
#ifdef PROGRAM_WITH_IL #ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else #else
std::string source = conv2d_transpose_source;
std::string source = GetActDefines() + conv2d_transpose_source;
std::string program_name = "conv2d_transpose"; std::string program_name = "conv2d_transpose";
ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
@@ -102,6 +109,7 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, static_cast<cl_int>(param->act_type_));
} }


int Conv2dTransposeOpenCLKernel::InitWeights() { int Conv2dTransposeOpenCLKernel::InitWeights() {


+ 16
- 26
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc View File

@@ -29,48 +29,38 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_ERROR; using mindspore::lite::RET_ERROR;
using mindspore::lite::RET_OK; using mindspore::lite::RET_OK;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::ActivationType_TANH;
using mindspore::schema::PrimitiveType_FullConnection; using mindspore::schema::PrimitiveType_FullConnection;


namespace mindspore::kernel { namespace mindspore::kernel {


int FullConnectionOpenCLKernel::Init() {
// deleted soon
return CheckSpecs();
}

int FullConnectionOpenCLKernel::CheckSpecs() { int FullConnectionOpenCLKernel::CheckSpecs() {
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_); auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
transposeA = param->a_transpose_;
if (transposeA) {
if (param->a_transpose_) {
MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet."; MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet.";
return RET_ERROR; return RET_ERROR;
} }
transposeB = param->b_transpose_;
enable_fp16_ = ocl_runtime_->GetFp16Enable();
if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) || if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) ||
(out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) { (out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) {
MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4"; MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4";
return RET_ERROR; return RET_ERROR;
} }
switch (param->act_type_) {
case ActType_No:
break;
case ActType_Relu:
activation_min_ = 0.f;
break;
case ActType_Relu6:
activation_min_ = 0.f;
activation_max_ = 6.f;
break;
default:
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR;
if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
return RET_ERROR;
} }
return RET_OK; return RET_OK;
} }


int FullConnectionOpenCLKernel::Prepare() { int FullConnectionOpenCLKernel::Prepare() {
std::string kernel_name = "FullConnection_NHWC4";
auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
transposeA = param->a_transpose_;
transposeB = param->b_transpose_;
enable_fp16_ = ocl_runtime_->GetFp16Enable();

std::string kernel_name = "FullConnection";
inShape = GpuTensorInfo(in_tensors_[0]); inShape = GpuTensorInfo(in_tensors_[0]);
outShape = GpuTensorInfo(out_tensors_[0]); outShape = GpuTensorInfo(out_tensors_[0]);
#ifdef PROGRAM_WITH_IL #ifdef PROGRAM_WITH_IL
@@ -78,7 +68,7 @@ int FullConnectionOpenCLKernel::Prepare() {
#else #else
std::string source = fullconnection_source; std::string source = fullconnection_source;
std::string program_name = "FullConnection"; std::string program_name = "FullConnection";
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->LoadSource(program_name, GetActDefines() + source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif #endif
auto ret = InitWeights(); auto ret = InitWeights();
@@ -200,11 +190,11 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
static_cast<int>(inShape.C)}; static_cast<int>(inShape.C)};
cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)}; cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)};
ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_); ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape); ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_);
ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_);
ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
} }


int FullConnectionOpenCLKernel::Run() { int FullConnectionOpenCLKernel::Run() {


+ 0
- 3
mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h View File

@@ -37,7 +37,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
int InitWeights() override; int InitWeights() override;
void SetConstArgs() override; void SetConstArgs() override;
void SetGlobalLocal() override; void SetGlobalLocal() override;
int Init() override;
int Tune() override { return lite::RET_OK; } int Tune() override { return lite::RET_OK; }


private: private:
@@ -46,8 +45,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
bool enable_fp16_{false}; bool enable_fp16_{false};
bool transposeA{false}; bool transposeA{false};
bool transposeB{true}; bool transposeB{true};
float activation_min_{-FLT_MAX};
float activation_max_{FLT_MAX};
GpuTensorInfo inShape = GpuTensorInfo(nullptr); GpuTensorInfo inShape = GpuTensorInfo(nullptr);
GpuTensorInfo outShape = GpuTensorInfo(nullptr); GpuTensorInfo outShape = GpuTensorInfo(nullptr);
}; };


+ 11
- 9
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc View File

@@ -35,6 +35,15 @@ using mindspore::schema::PrimitiveType_Scale;


namespace mindspore::kernel { namespace mindspore::kernel {


int ScaleOpenCLKernel::CheckSpecs() {
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
if (param->activation_type_ != ActType_No && param->activation_type_ != ActType_Relu &&
param->activation_type_ != ActType_Relu6) {
return RET_ERROR;
}
return RET_OK;
}

ScaleOpenCLKernel::~ScaleOpenCLKernel() { ScaleOpenCLKernel::~ScaleOpenCLKernel() {
auto allocator = ocl_runtime_->GetAllocator(); auto allocator = ocl_runtime_->GetAllocator();
if (scale_ptr_ != nullptr) { if (scale_ptr_ != nullptr) {
@@ -185,7 +194,7 @@ int ScaleOpenCLKernel::Init() {
kernel_name += "_BUF"; kernel_name += "_BUF";
} }
std::string program_name = "Scale"; std::string program_name = "Scale";
std::string source = scale_source;
std::string source = GetActDefines() + scale_source;
ocl_runtime_->LoadSource(program_name, source); ocl_runtime_->LoadSource(program_name, source);
error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
#endif #endif
@@ -202,13 +211,6 @@ int ScaleOpenCLKernel::Init() {
int ScaleOpenCLKernel::Run() { int ScaleOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_); auto *param = reinterpret_cast<const ScaleParameter *>(op_parameter_);
cl_int act_type = 0;
if (param->activation_type_ == ActType_Relu) {
act_type = 1;
} else if (param->activation_type_ == ActType_Relu6) {
act_type = 3;
}

int arg_idx = 0; int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
if (weight_vector_flag_) { if (weight_vector_flag_) {
@@ -242,7 +244,7 @@ int ScaleOpenCLKernel::Run() {
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM)); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
} }
} }
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
ocl_runtime_->RunKernel(kernel_, global_size_, local_size_); ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
return RET_OK; return RET_OK;
} }


+ 1
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h View File

@@ -30,6 +30,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {} : OpenCLKernel(parameter, inputs, outputs) {}
~ScaleOpenCLKernel() override; ~ScaleOpenCLKernel() override;


int CheckSpecs() override;
int Init() override; int Init() override;
int Run() override; int Run() override;
int InitWeights() override; int InitWeights() override;


+ 23
- 0
mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc View File

@@ -0,0 +1,23 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "src/runtime/kernel/opencl/opencl_subgraph.h"

namespace mindspore::kernel {

void OpenCLSubGraph::Fusion() {}

} // namespace mindspore::kernel

+ 1
- 0
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -222,6 +222,7 @@ class OpenCLKernel : public LiteKernel {


lite::opencl::MemType GetMemType() { return out_mem_type_; } lite::opencl::MemType GetMemType() { return out_mem_type_; }
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
OpParameter *GetParameter() { return op_parameter_; }


virtual std::vector<BaseTuningParameter> GenerateTuningParam() { virtual std::vector<BaseTuningParameter> GenerateTuningParam() {
size_t ndim = global_size_.size(); size_t ndim = global_size_.size();


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc View File

@@ -352,7 +352,7 @@ int OpenCLSubGraph::Prepare() {
MS_LOG(ERROR) << "Create OpenCLExecutor fail"; MS_LOG(ERROR) << "Create OpenCLExecutor fail";
return RET_ERROR; return RET_ERROR;
} }
Fusion();
auto ret = Init(); auto ret = Init();
if (ret != RET_OK) { if (ret != RET_OK) {
MS_LOG(ERROR) << "OpenCL subgraph init fail"; MS_LOG(ERROR) << "OpenCL subgraph init fail";


+ 3
- 0
mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h View File

@@ -78,6 +78,9 @@ class OpenCLSubGraph : public SubGraphKernel {
std::set<LiteKernel *> nodes_set_; std::set<LiteKernel *> nodes_set_;
lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr}; lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};

private:
void Fusion();
}; };
} // namespace mindspore::kernel } // namespace mindspore::kernel




+ 14
- 0
mindspore/lite/src/runtime/kernel/opencl/utils.cc View File

@@ -25,6 +25,11 @@


using mindspore::lite::KernelRegistrar; using mindspore::lite::KernelRegistrar;
using mindspore::lite::opencl::MemType; using mindspore::lite::opencl::MemType;
using mindspore::schema::ActivationType_LEAKY_RELU;
using mindspore::schema::ActivationType_RELU;
using mindspore::schema::ActivationType_RELU6;
using mindspore::schema::ActivationType_SIGMOID;
using mindspore::schema::ActivationType_TANH;


namespace mindspore::lite { namespace mindspore::lite {
kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -40,6 +45,15 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con


namespace mindspore::kernel { namespace mindspore::kernel {


std::string GetActDefines() {
static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) +
"\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) +
"\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) +
"\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) +
"\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n";
return act_defines;
}

int GetUpPow2(int n) { int GetUpPow2(int n) {
int i = 0; int i = 0;
int j = 0; int j = 0;


+ 2
- 0
mindspore/lite/src/runtime/kernel/opencl/utils.h View File

@@ -34,6 +34,8 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con


namespace mindspore::kernel { namespace mindspore::kernel {


std::string GetActDefines();

int GetUpPow2(int n); int GetUpPow2(int n);


int GetMaxDivisor(int x, int divisor); int GetMaxDivisor(int x, int divisor);


+ 1
- 0
mindspore/lite/test/CMakeLists.txt View File

@@ -84,6 +84,7 @@ if (SUPPORT_GPU)
${KERNEL_OP_SRC} ${KERNEL_OP_SRC}
${GPU_KERNEL_OP_SRC} ${GPU_KERNEL_OP_SRC}
${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc
${LITE_DIR}/src/runtime/kernel/opencl/opencl_fusion.cc
${LITE_DIR}/src/runtime/kernel/opencl/utils.cc ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc
) )
endif() endif()


Loading…
Cancel
Save