From 5be7fe3cae16b4ea755230c67924278f8a76be97 Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Sat, 28 Nov 2020 19:41:05 +0800 Subject: [PATCH] add opencl_fusion --- mindspore/lite/src/CMakeLists.txt | 1 + .../runtime/kernel/opencl/cl/arithmetic.cl | 196 +++++--------- .../src/runtime/kernel/opencl/cl/conv2d.cl | 251 +++++++++++------- .../kernel/opencl/cl/conv2d_transpose.cl | 15 +- .../kernel/opencl/cl/fullconnection.cl | 16 +- .../src/runtime/kernel/opencl/cl/scale.cl | 20 +- .../src/runtime/kernel/opencl/cl/winograd.cl | 32 ++- .../kernel/opencl/kernel/arithmetic.cc | 128 +++------ .../runtime/kernel/opencl/kernel/arithmetic.h | 3 + .../runtime/kernel/opencl/kernel/conv2d.cc | 46 ++-- .../src/runtime/kernel/opencl/kernel/conv2d.h | 3 + .../kernel/opencl/kernel/conv2d_transpose.cc | 10 +- .../kernel/opencl/kernel/fullconnection.cc | 42 ++- .../kernel/opencl/kernel/fullconnection.h | 3 - .../src/runtime/kernel/opencl/kernel/scale.cc | 20 +- .../src/runtime/kernel/opencl/kernel/scale.h | 1 + .../runtime/kernel/opencl/opencl_fusion.cc | 23 ++ .../src/runtime/kernel/opencl/opencl_kernel.h | 1 + .../runtime/kernel/opencl/opencl_subgraph.cc | 2 +- .../runtime/kernel/opencl/opencl_subgraph.h | 3 + .../lite/src/runtime/kernel/opencl/utils.cc | 14 + .../lite/src/runtime/kernel/opencl/utils.h | 2 + mindspore/lite/test/CMakeLists.txt | 1 + 23 files changed, 446 insertions(+), 387 deletions(-) create mode 100644 mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index b0bccd135f..b4cbd79109 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -42,6 +42,7 @@ if (SUPPORT_GPU) set(LITE_SRC ${LITE_SRC} ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_subgraph.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/opencl_fusion.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/kernel/opencl/utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_executor.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/opencl/opencl_allocator.cc diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl index ae3cb51dc1..c8f50ffe01 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/arithmetic.cl @@ -2,8 +2,8 @@ #define divide_no_check(a, b) (a / b) __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -__kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementAdd(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -17,8 +17,8 @@ __kernel void ElementAdd_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementSub(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= 
output_shape.x || Y >= output_shape.y) { @@ -32,8 +32,8 @@ __kernel void ElementSub_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementMul(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -47,8 +47,8 @@ __kernel void ElementMul_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -62,8 +62,8 @@ __kernel void ElementDiv_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLogicalAnd(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -77,8 +77,8 @@ __kernel void ElementAnd_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLogicalOr(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -92,7 +92,7 @@ __kernel void ElementOr_IMG(__read_only image2d_t input_a, __read_only image2d_t WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, +__kernel void ElementMaximum(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -107,7 +107,7 @@ __kernel void ElementMax_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, +__kernel void ElementMinimum(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -122,9 +122,8 @@ __kernel void ElementMin_IMG(__read_only image2d_t input_a, __read_only image2d_ WRITE_IMAGE(output, (int2)(X, Y), 
result); } -__kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementFloorDiv(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -138,9 +137,8 @@ __kernel void ElementFloorDiv_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementFloorMod(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -154,9 +152,9 @@ __kernel void ElementFloorMod_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementSquaredDifference(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -170,8 +168,8 @@ __kernel void ElementSquaredDifference_IMG(__read_only image2d_t input_a, __read WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -186,9 +184,8 @@ __kernel void ElementEqual_IMG(__read_only image2d_t input_a, __read_only image2 WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementNotEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -202,8 +199,8 @@ __kernel void ElementNotEqual_IMG(__read_only image2d_t input_a, __read_only ima WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementLess(__read_only image2d_t input_a, __read_only image2d_t input_b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= 
output_shape.x || Y >= output_shape.y) { @@ -217,9 +214,8 @@ __kernel void ElementLess_IMG(__read_only image2d_t input_a, __read_only image2d WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementLessEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -233,8 +229,8 @@ __kernel void ElementLessEqual_IMG(__read_only image2d_t input_a, __read_only im WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { +__kernel void ElementGreater(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -248,9 +244,9 @@ __kernel void ElementGreater_IMG(__read_only image2d_t input_a, __read_only imag WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int2 output_shape, float act_min, - float act_max) { +__kernel void ElementGreaterEqual(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int2 output_shape, float act_min, + float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -264,9 +260,9 @@ __kernel void ElementGreaterEqual_IMG(__read_only image2d_t input_a, __read_only WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Add(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -293,9 +289,9 @@ __kernel void BroadcastNHWC4Add_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Sub(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -322,9 +318,9 @@ __kernel void BroadcastNHWC4Sub_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, 
(int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Mul(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -351,9 +347,9 @@ __kernel void BroadcastNHWC4Mul_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only image2d_t input_b, - __write_only image2d_t output, const int4 a_shape, const int4 b_shape, - const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { +__kernel void BroadcastNHWC4Div(__read_only image2d_t input_a, __read_only image2d_t input_b, + __write_only image2d_t output, const int4 a_shape, const int4 b_shape, + const int4 output_shape, const int broadcastC_flag, float act_min, float act_max) { int X = get_global_id(0); // C4 int Y = get_global_id(1); // W int Z = get_global_id(2); // H @@ -380,8 +376,8 @@ __kernel void BroadcastNHWC4Div_IMG(__read_only image2d_t input_a, __read_only i WRITE_IMAGE(output, (int2)(Y * output_shape.w + X, Z), result); } -__kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLogicalAnd(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -394,8 +390,8 @@ __kernel void BroadcastAnd_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLogicalOr(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -408,7 +404,7 @@ __kernel void BroadcastOr_IMG(__read_only image2d_t input_a, float b, __write_on WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, +__kernel void BroadcastMaximum(__read_only image2d_t input_a, float b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -422,7 +418,7 @@ __kernel void BroadcastMax_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, +__kernel void BroadcastMinimum(__read_only image2d_t input_a, float b, __write_only image2d_t output, const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); @@ -436,8 +432,8 @@ __kernel void 
BroadcastMin_IMG(__read_only image2d_t input_a, float b, __write_o WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastFloorDiv(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -450,8 +446,8 @@ __kernel void BroadcastFloorDiv_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastFloorMod(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -464,8 +460,8 @@ __kernel void BroadcastFloorMod_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastSquaredDifference(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -478,8 +474,8 @@ __kernel void BroadcastSquaredDifference_IMG(__read_only image2d_t input_a, floa WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -492,8 +488,8 @@ __kernel void BroadcastEqual_IMG(__read_only image2d_t input_a, float b, __write WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastNotEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -506,8 +502,8 @@ __kernel void BroadcastNotEqual_IMG(__read_only image2d_t input_a, float b, __wr WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLess(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -520,8 +516,8 @@ __kernel void BroadcastLess_IMG(__read_only image2d_t input_a, float b, __write_ 
WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastLessEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -534,8 +530,8 @@ __kernel void BroadcastLessEqual_IMG(__read_only image2d_t input_a, float b, __w WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastGreater(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -548,8 +544,8 @@ __kernel void BroadcastGreater_IMG(__read_only image2d_t input_a, float b, __wri WRITE_IMAGE(output, (int2)(X, Y), result); } -__kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, __write_only image2d_t output, - const int2 output_shape, float act_min, float act_max) { +__kernel void BroadcastGreaterEqual(__read_only image2d_t input_a, float b, __write_only image2d_t output, + const int2 output_shape, float act_min, float act_max) { int X = get_global_id(0); int Y = get_global_id(1); if (X >= output_shape.x || Y >= output_shape.y) { @@ -561,55 +557,3 @@ __kernel void BroadcastGreaterEqual_IMG(__read_only image2d_t input_a, float b, result = clamp(result, (FLT)(act_min), (FLT)(act_max)); WRITE_IMAGE(output, (int2)(X, Y), result); } - -__kernel void ElementAdd_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] + input_b[idx]; -} - -__kernel void ElementSub_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - input_b[idx]; -} - -__kernel void ElementMul_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b[idx]; -} - -__kernel void ElementDiv_BUF(__global float *input_a, __global float *input_b, __global float *output, - const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * input_b[idx]; -} - -__kernel void BroadcastAdd_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] + (FLT)b; -} - -__kernel void BroadcastSub_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] - (FLT)b; -} - -__kernel void BroadcastMul_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = get_global_id(0); - if (idx >= n) return; - output[idx] = input_a[idx] * (FLT)b; -} - -__kernel void BroadcastDiv_BUF(__global float *input_a, float b, __global float *output, const unsigned int n) { - int idx = 
get_global_id(0); - if (idx >= n) return; - output[idx] = divide_no_check(input_a[idx], (FLT)b); -} diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl index a89146d904..f603ef5cf1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl @@ -6,51 +6,59 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP #define MAX_IMAGE2D_SIZE 65535 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_Relu 1 -#define ActType_Relu6 3 - -#define DEFINE_ARGS \ - const int N = input_shape.x; \ - const int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ - const int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ - const int KH = kernel_stride.x, KW = kernel_stride.y; \ - const int strideH = kernel_stride.z, strideW = kernel_stride.w; \ - const int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ - const int dilationH = dilation.x, dilationW = dilation.y; \ - \ - const int n_oh = get_global_id(0); \ - const int ow = get_global_id(1) * BlockW; \ - const int co_slice = get_global_id(2) * BlockC; \ - const int OH_SLICES = UP_DIV(OH, BlockH); \ - const int n = n_oh / OH_SLICES; \ - const int oh = (n_oh % OH_SLICES) * BlockH; \ - if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ - return; \ +#define DEFINE_ARGS \ + int N = input_shape.x; \ + int IH = input_shape.y, IW = input_shape.z, CI_SLICES = input_shape.w; \ + int OH = output_shape.y, OW = output_shape.z, CO_SLICES = output_shape.w; \ + int KH = kernel_stride.x, KW = kernel_stride.y; \ + int strideH = kernel_stride.z, strideW = kernel_stride.w; \ + int padTop = pad.x, padBottom = pad.y, padLeft = pad.z, padRight = pad.w; \ + int dilationH = dilation.x, dilationW = dilation.y; \ + \ + int n_oh = get_global_id(0); \ + int ow = get_global_id(1) * BlockW; \ + int co_slice = get_global_id(2) * BlockC; \ + int OH_SLICES = UP_DIV(OH, BlockH); \ + int n = n_oh / OH_SLICES; \ + int oh = (n_oh % OH_SLICES) * BlockH; \ + if (n >= N || oh >= OH || ow >= OW || co_slice >= CO_SLICES) { \ + return; \ } +#define DO_TANH(data) \ + exp0 = exp(data); \ + exp1 = exp(-data); \ + data = (exp0 - exp1) / (exp0 + exp1); + +#define DO_LEAKY_RELU(data) \ + if (data.x < 0) data.x *= alpha; \ + if (data.y < 0) data.y *= alpha; \ + if (data.z < 0) data.z *= alpha; \ + if (data.w < 0) data.w *= alpha; + __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 1; - const int BlockW = 1; - const int BlockC = 1; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 1; + int BlockW = 1; + int BlockC = 1; DEFINE_ARGS; - const int oh0 = oh + 0; - const int n_oh0 = n * OH + oh0; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; + int oh0 = oh + 0; + int n_oh0 = n * OH + oh0; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; - const int y_idx0 = (ih0 >= 0 && 
ih0 < IH) ? n * IH + ih0 : -1; + int ih0 = kh * dilationH + oh0 * strideH - padTop; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -71,10 +79,17 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t out_h0_w0_c0 += bias[co_slice0]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -85,19 +100,19 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 1; - const int BlockC = 1; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 1; + int BlockC = 1; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -105,15 +120,15 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? 
n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -140,12 +155,22 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t out_h1_w0_c0 += bias[co_slice0]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h1_w0_c0); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -162,20 +187,20 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 1; - const int BlockC = 2; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 1; + int BlockC = 2; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int co_slice0 = co_slice + 0; - const int co_slice1 = co_slice + 1; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int co_slice0 = co_slice + 0; + int co_slice1 = co_slice + 1; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h1_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -185,15 +210,15 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? 
n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int x_idx0 = iw0 * CI_SLICES; for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++) { @@ -231,16 +256,32 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t out_h1_w0_c1 += bias[co_slice1]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); out_h0_w0_c1 = max(out_h0_w0_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w0_c1 = clamp(out_h0_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h1_w0_c0); + DO_TANH(out_h0_w0_c1); + DO_TANH(out_h1_w0_c1); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + DO_LEAKY_RELU(out_h0_w0_c1); + DO_LEAKY_RELU(out_h1_w0_c1); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); + out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); + out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { @@ -269,21 +310,21 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t } __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *weight, - __global FLT4 *bias, const int4 input_shape, const int4 output_shape, - const int4 kernel_stride, const int4 pad, const int2 dilation, const int act_type) { - const int BlockH = 2; - const int BlockW = 2; - const int BlockC = 2; + __global FLT4 *bias, int4 input_shape, int4 output_shape, int4 kernel_stride, int4 pad, + int2 dilation, int act_type, float alpha) { + int BlockH = 2; + int BlockW = 2; + int BlockC = 2; DEFINE_ARGS; - const int oh0 = oh + 0; - const int oh1 = oh + 1; - const int n_oh0 = n * OH + oh0; - const int n_oh1 = n * OH + oh1; - const int ow0 = ow + 0; - const int ow1 = ow + 1; - const int co_slice0 = co_slice + 0; - const int co_slice1 = co_slice + 1; + int oh0 = oh + 0; + int oh1 = oh + 1; + int n_oh0 = n * OH + oh0; + int n_oh1 = n * OH + oh1; + int ow0 = ow + 0; + int ow1 = ow + 1; + int co_slice0 = co_slice + 0; + int co_slice1 = co_slice + 1; FLT4 out_h0_w0_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); FLT4 out_h0_w1_c0 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f); @@ -297,15 +338,15 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t __global FLT4 *weight_ptr = weight + co_slice / BlockC * KH * KW * CI_SLICES * BlockC * CI_TILE; for (int kh = 0; kh < KH; ++kh) { - const int ih0 = kh * dilationH + oh0 * strideH - padTop; + int ih0 = kh * dilationH + oh0 * strideH - padTop; // no need to check oh1, finally write out will check (oh1 < OH) - const int ih1 = kh * dilationH + oh1 * strideH - padTop; + int ih1 = kh * dilationH + oh1 * strideH - padTop; // check ih0 and ih1 - const int y_idx0 = (ih0 >= 0 && ih0 < IH) ? 
n * IH + ih0 : -1; - const int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; + int y_idx0 = (ih0 >= 0 && ih0 < IH) ? n * IH + ih0 : -1; + int y_idx1 = (ih1 >= 0 && ih1 < IH) ? n * IH + ih1 : -1; for (int kw = 0; kw < KW; ++kw) { - const int iw0 = kw * dilationW + ow0 * strideW - padLeft; + int iw0 = kw * dilationW + ow0 * strideW - padLeft; int iw1 = (ow1 < OW) ? kw * dilationW + ow1 * strideW - padLeft : -2; int x_idx0 = iw0 * CI_SLICES; int x_idx1 = iw1 * CI_SLICES; @@ -368,7 +409,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h1_w1_c1 += bias[co_slice1]; } - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out_h0_w0_c0 = max(out_h0_w0_c0, (FLT4)(0.0f)); out_h0_w1_c0 = max(out_h0_w1_c0, (FLT4)(0.0f)); out_h1_w0_c0 = max(out_h1_w0_c0, (FLT4)(0.0f)); @@ -377,7 +418,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h0_w1_c1 = max(out_h0_w1_c1, (FLT4)(0.0f)); out_h1_w0_c1 = max(out_h1_w0_c1, (FLT4)(0.0f)); out_h1_w1_c1 = max(out_h1_w1_c1, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out_h0_w0_c0 = clamp(out_h0_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h0_w1_c0 = clamp(out_h0_w1_c0, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c0 = clamp(out_h1_w0_c0, (FLT4)(0.0f), (FLT4)(6.0f)); @@ -386,6 +427,34 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t out_h0_w1_c1 = clamp(out_h0_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w0_c1 = clamp(out_h1_w0_c1, (FLT4)(0.0f), (FLT4)(6.0f)); out_h1_w1_c1 = clamp(out_h1_w1_c1, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0, exp1; + DO_TANH(out_h0_w0_c0); + DO_TANH(out_h0_w1_c0); + DO_TANH(out_h1_w0_c0); + DO_TANH(out_h1_w1_c0); + DO_TANH(out_h0_w0_c1); + DO_TANH(out_h0_w1_c1); + DO_TANH(out_h1_w0_c1); + DO_TANH(out_h1_w1_c1); + } else if (act_type == ActivationType_LEAKY_RELU) { + DO_LEAKY_RELU(out_h0_w0_c0); + DO_LEAKY_RELU(out_h0_w1_c0); + DO_LEAKY_RELU(out_h1_w0_c0); + DO_LEAKY_RELU(out_h1_w1_c0); + DO_LEAKY_RELU(out_h0_w0_c1); + DO_LEAKY_RELU(out_h0_w1_c1); + DO_LEAKY_RELU(out_h1_w0_c1); + DO_LEAKY_RELU(out_h1_w1_c1); + } else if (act_type == ActivationType_SIGMOID) { + out_h0_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c0)); + out_h0_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c0)); + out_h1_w0_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c0)); + out_h1_w1_c0 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c0)); + out_h0_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w0_c1)); + out_h0_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h0_w1_c1)); + out_h1_w0_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w0_c1)); + out_h1_w1_c1 = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-out_h1_w1_c1)); } if (OW * CO_SLICES <= MAX_IMAGE2D_SIZE) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl index 591243c719..2c2afd7fc7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/conv2d_transpose.cl @@ -2,7 +2,7 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_only image2d_t dst_data, __global FLT16 *weight, __read_only image2d_t biases, int2 kernel_size, - int2 stride, int2 padding, int4 src_size, int4 dst_size) { + int2 stride, int2 padding, 
int4 src_size, int4 dst_size, int act_type) { int dst_h = get_global_id(0); int rem_h = dst_h % stride.x; int ceil_h = dst_h / stride.x; @@ -70,6 +70,19 @@ __kernel void conv2d_transpose_NHWC4(__read_only image2d_t src_data, __write_onl r1 += bias_val; r2 += bias_val; r3 += bias_val; + + if (act_type == ActivationType_RELU) { + r0 = max(r0, (FLT4)(0.0f)); + r1 = max(r1, (FLT4)(0.0f)); + r2 = max(r2, (FLT4)(0.0f)); + r3 = max(r3, (FLT4)(0.0f)); + } else if (act_type == ActivationType_RELU6) { + r0 = clamp(r0, (FLT4)(0.0f), (FLT4)(6.0f)); + r1 = clamp(r1, (FLT4)(0.0f), (FLT4)(6.0f)); + r2 = clamp(r2, (FLT4)(0.0f), (FLT4)(6.0f)); + r3 = clamp(r3, (FLT4)(0.0f), (FLT4)(6.0f)); + } + WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h), r0); if (dst_h + stride.x < dst_size.x && dst_w < dst_size.y) { WRITE_IMAGE(dst_data, (int2)(dst_w * dst_size.z + dst_c, dst_h + stride.x), r1); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl index 8b0cb16095..d09f93cfde 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/fullconnection.cl @@ -2,9 +2,9 @@ #define C4NUM 4 #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; -__kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - __read_only image2d_t bias, int4 in_shape, int2 out_shape, float act_min, - float act_max) { + +__kernel void FullConnection(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, + __read_only image2d_t bias, int4 in_shape, int2 out_shape, int act_type) { int gidx = get_global_id(0); // CO4 int gidz = get_global_id(2); // N int lidx = get_local_id(0); @@ -34,7 +34,15 @@ __kernel void FullConnection_NHWC4(__read_only image2d_t input, __write_only ima result += temp[lidx][2]; result += temp[lidx][3]; result += READ_IMAGE(bias, smp_zero, (int2)(gidx, 0)); - result = clamp(result, (FLT)(act_min), (FLT)(act_max)); + if (act_type == ActivationType_RELU) { + result = max(result, (FLT4)(0.0f)); + } else if (act_type == ActivationType_RELU6) { + result = clamp(result, (FLT4)(0.0f), (FLT4)(6.0f)); + } else if (act_type == ActivationType_TANH) { + FLT4 exp0 = exp(result); + FLT4 exp1 = exp(-result); + result = (exp0 - exp1) / (exp0 + exp1); + } WRITE_IMAGE(output, (int2)(gidx, gidz), result); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl index d47ddcdf0c..6fbdc4ff38 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/scale.cl @@ -1,10 +1,6 @@ #pragma OPENCL EXTENSION cl_khr_fp16 : enable __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; -#define ActType_No 0 -#define ActType_Relu 1 -#define ActType_Sigmod 2 -#define ActType_Relu6 3 #define C4NUM 4 __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale, __read_only image2d_t offset, @@ -19,9 +15,9 @@ __kernel void Scale_IMG(__read_only image2d_t input, __read_only image2d_t scale FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X, Y)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X, Y)); FLT4 out = in * s + o; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if 
(act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -37,9 +33,9 @@ __kernel void BoardcastScale_IMG(__read_only image2d_t input, float scale, float FLT4 in = READ_IMAGE(input, smp_none, (int2)(X, Y)); FLT4 out = in * (FLT)scale + (FLT)offset; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -57,9 +53,9 @@ __kernel void Scale_C_IMG(__read_only image2d_t input, __read_only image2d_t sca FLT4 s = READ_IMAGE(scale, smp_none, (int2)(X % C, 0)); FLT4 o = READ_IMAGE(offset, smp_none, (int2)(X % C, 0)); FLT4 out = in * s + o; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); @@ -94,9 +90,9 @@ __kernel void Scale_H_IMG(__read_only image2d_t input, __read_only image2d_t sca o_real = o.w; } FLT4 out = in * s_real + o_real; - if (act_type == ActType_Relu) { + if (act_type == ActivationType_RELU) { out = max(out, (FLT4)(0.0f)); - } else if (act_type == ActType_Relu6) { + } else if (act_type == ActivationType_RELU6) { out = clamp(out, (FLT4)(0.0f), (FLT4)(6.0f)); } WRITE_IMAGE(output, (int2)(X, Y), out); diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl index fad2a98e96..295265a6ad 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl +++ b/mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl @@ -4,9 +4,6 @@ __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP #define UP_DIV(x, y) (((x) + (y) - (1)) / (y)) -#define ActType_Relu 1 -#define ActType_Relu6 3 - constant FLT Bt[36] = { 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f, 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f, @@ -17,8 +14,8 @@ constant FLT Bt[36] = { }; __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_t output, - const int4 input_shape, // N H W CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CI_SLICES + int4 input_shape, // N H W CI_SLICES + int4 output_shape) { // N 36 H/4*W/4 CI_SLICES #define PAD 1 int tile_xy = get_global_id(0); int row = get_global_id(1); @@ -63,8 +60,8 @@ __kernel void Winograd4x4To36(__read_only image2d_t input, __write_only image2d_ } __kernel void WinogradConvolution(__read_only image2d_t input, __write_only image2d_t output, __global FLT16 *weight, - const int4 input_shape, // N 36 H/4*W/4 CI_SLICES - const int4 output_shape) { // N 36 H/4*W/4 CO_SLICES + int4 input_shape, // N 36 H/4*W/4 CI_SLICES + int4 output_shape) { // N 36 H/4*W/4 CO_SLICES #define H 36 int w = get_global_id(0) * 2; int h = get_global_id(1); @@ -134,9 +131,9 @@ constant FLT At[24] = {1.0000000000f, 1.0000000000f, 1.0000000000f, 1.000000000 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f}; __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_t output, __global FLT4 *bias, - const int4 input_shape, // N 36 H/4*W/4 CO_SLICES - const int4 output_shape, 
// N H W CO_SLICES
-                              const int act_type) {
+                              int4 input_shape,   // N 36 H/4*W/4 CO_SLICES
+                              int4 output_shape,  // N H W CO_SLICES
+                              int act_type, float alpha) {
   int tile_xy = get_global_id(0);
   int row = get_global_id(1);
   int slice = get_global_id(2);
@@ -175,10 +172,21 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
     acc += bias[slice];
   }
 
-  if (act_type == ActType_Relu) {
+  if (act_type == ActivationType_RELU) {
     acc = max(acc, (FLT4)(0.0f));
-  } else if (act_type == ActType_Relu6) {
+  } else if (act_type == ActivationType_RELU6) {
     acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));
+  } else if (act_type == ActivationType_TANH) {
+    FLT4 exp0 = exp(acc);
+    FLT4 exp1 = exp(-acc);
+    acc = (exp0 - exp1) / (exp0 + exp1);
+  } else if (act_type == ActivationType_LEAKY_RELU) {
+    if (acc.x < 0) acc.x *= alpha;
+    if (acc.y < 0) acc.y *= alpha;
+    if (acc.z < 0) acc.z *= alpha;
+    if (acc.w < 0) acc.w *= alpha;
+  } else if (act_type == ActivationType_SIGMOID) {
+    acc = (FLT4)(1.f) / ((FLT4)(1.f) + exp(-acc));
   }
 
   WRITE_IMAGE(output, (int2)(x_idx, oh), acc);
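[Editor's note] For reference, the activation branches added to conv2d.cl and winograd.cl above all implement the same elementwise functions. A minimal scalar C++ sketch follows (illustration only, not part of the patch; ActType here is a stand-in for the schema-generated ActivationType enum, and the kernels apply the same math to each FLT4 component):

  #include <algorithm>
  #include <cmath>

  enum class ActType { None, Relu, Relu6, Tanh, LeakyRelu, Sigmoid };

  float ApplyAct(float x, ActType t, float alpha) {
    switch (t) {
      case ActType::Relu:      return std::max(x, 0.0f);
      case ActType::Relu6:     return std::min(std::max(x, 0.0f), 6.0f);
      case ActType::Tanh:      return (std::exp(x) - std::exp(-x)) / (std::exp(x) + std::exp(-x));
      case ActType::LeakyRelu: return x < 0.0f ? x * alpha : x;  // alpha comes from the fused PReLU weight
      case ActType::Sigmoid:   return 1.0f / (1.0f + std::exp(-x));
      default:                 return x;  // no activation
    }
  }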
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
index c8aa34908e..1dccb1acf6 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc
@@ -31,93 +31,46 @@ using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
 using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_NO_ACTIVATION;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
 using mindspore::schema::PrimitiveType_Eltwise;
 
 namespace mindspore::kernel {
 
+std::set<schema::PrimitiveType> SupportedOpenCLArithmetics = {PrimitiveType_Mul,
+                                                              PrimitiveType_Add,
+                                                              PrimitiveType_Sub,
+                                                              PrimitiveType_Div,
+                                                              PrimitiveType_LogicalAnd,
+                                                              PrimitiveType_LogicalOr,
+                                                              PrimitiveType_Maximum,
+                                                              PrimitiveType_Minimum,
+                                                              PrimitiveType_FloorDiv,
+                                                              PrimitiveType_FloorMod,
+                                                              PrimitiveType_SquaredDifference,
+                                                              PrimitiveType_Equal,
+                                                              PrimitiveType_NotEqual,
+                                                              PrimitiveType_Less,
+                                                              PrimitiveType_LessEqual,
+                                                              PrimitiveType_Greater,
+                                                              PrimitiveType_GreaterEqual,
+                                                              PrimitiveType_Eltwise};
+
 int ArithmeticOpenCLKernel::CheckSpecs() {
-  auto *arithmetic_parameter = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
-  if (arithmetic_parameter->broadcasting_) {
-    element_flag_ = false;
-    kernel_name_ = "BroadcastNHWC4";
-    if (out_tensors_[0]->shape()[0] > 1) {
-      MS_LOG(ERROR) << "Broadcasting don't support N > 1";
-      return RET_ERROR;
-    }
-  } else {
-    kernel_name_ = "Element";
+  auto *param = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
+  if (param->broadcasting_ && out_tensors_[0]->shape()[0] > 1) {
+    MS_LOG(ERROR) << "Broadcasting doesn't support N > 1";
+    return RET_ERROR;
   }
-
-  switch (op_parameter_->type_) {
-    case PrimitiveType_Mul:
-      kernel_name_ += "Mul";
-      break;
-    case PrimitiveType_Add:
-      kernel_name_ += "Add";
-      break;
-    case PrimitiveType_Sub:
-      kernel_name_ += "Sub";
-      break;
-    case PrimitiveType_Div:
-      kernel_name_ += "Div";
-      break;
-    case PrimitiveType_LogicalAnd:
-      kernel_name_ += "And";
-      break;
-    case PrimitiveType_LogicalOr:
-      kernel_name_ += "Or";
-      break;
-    case PrimitiveType_Maximum:
-      kernel_name_ += "Max";
-      break;
-    case PrimitiveType_Minimum:
-      kernel_name_ += "Min";
-      break;
-    case PrimitiveType_FloorDiv:
-      kernel_name_ += "FloorDiv";
-      break;
-    case PrimitiveType_FloorMod:
-      kernel_name_ += "FloorMod";
-      break;
-    case PrimitiveType_SquaredDifference:
-      kernel_name_ += "SquaredDifference";
-      break;
-    case PrimitiveType_Equal:
-      kernel_name_ += "Equal";
-      break;
-    case PrimitiveType_NotEqual:
-      kernel_name_ += "NotEqual";
-      break;
-    case PrimitiveType_Less:
-      kernel_name_ += "Less";
-      break;
-    case PrimitiveType_LessEqual:
-      kernel_name_ += "LessEqual";
-      break;
-    case PrimitiveType_Greater:
-      kernel_name_ += "Greater";
-      break;
-    case PrimitiveType_GreaterEqual:
-      kernel_name_ += "GreaterEqual";
-      break;
-    default:
-      MS_LOG(ERROR) << "Error Operator type " << op_parameter_->type_;
-      return RET_ERROR;
+  if (SupportedOpenCLArithmetics.count(static_cast<schema::PrimitiveType>(op_parameter_->type_)) == 0) {
+    MS_LOG(ERROR) << "Unsupported operator: " << schema::EnumNamesPrimitiveType()[op_parameter_->type_];
+    return RET_ERROR;
   }
-
-  switch (arithmetic_parameter->activation_type_) {
-    case schema::ActivationType_NO_ACTIVATION:
-      break;
-    case schema::ActivationType_RELU:
-      activation_min_ = 0.f;
-      break;
-    case schema::ActivationType_RELU6:
-      activation_min_ = 0.f;
-      activation_max_ = 6.f;
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupported activation type " << arithmetic_parameter->activation_type_;
-      return RET_ERROR;
+  if (!(param->activation_type_ == ActivationType_NO_ACTIVATION || param->activation_type_ == ActivationType_RELU ||
+        param->activation_type_ == ActivationType_RELU6)) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->activation_type_;
+    return RET_ERROR;
   }
   return RET_OK;
 }
@@ -240,11 +193,18 @@ int ArithmeticOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name_);
 #else
-  if (out_mem_type_ == MemType::IMG) {
-    kernel_name_ += "_IMG";
-  } else {
-    kernel_name_ += "_BUF";
+
+  auto *param = reinterpret_cast<ArithmeticParameter *>(op_parameter_);
+  element_flag_ = !param->broadcasting_;
+  kernel_name_ = param->broadcasting_ ? "BroadcastNHWC4" : "Element";
+  kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_];
+  if (param->activation_type_ == ActivationType_RELU) {
+    activation_min_ = 0.f;
+  } else if (param->activation_type_ == ActivationType_RELU6) {
+    activation_min_ = 0.f;
+    activation_max_ = 6.f;
   }
+
   std::string program_name = "Arithmetic";
   std::string source = arithmetic_source;
   ocl_runtime_->LoadSource(program_name, source);
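[Editor's note] With the _IMG/_BUF suffixes and the per-operator switch gone, the kernel name is now composed directly from the schema's operator-name table. A minimal sketch of the scheme (illustration only; the table lookup via EnumNamesPrimitiveType() is the flatbuffers-generated one used in the hunk above):

  #include <string>

  // Name = ("Element" | "BroadcastNHWC4") + schema operator name, which must
  // match the renamed kernels in arithmetic.cl, e.g. "ElementAdd" or
  // "BroadcastNHWC4Mul".
  std::string ArithmeticKernelName(bool broadcasting, const char *schema_op_name) {
    std::string name = broadcasting ? "BroadcastNHWC4" : "Element";
    return name + schema_op_name;  // schema_op_name from EnumNamesPrimitiveType()[type]
  }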
"BroadcastNHWC4" : "Element"; + kernel_name_ += schema::EnumNamesPrimitiveType()[op_parameter_->type_]; + if (param->activation_type_ == ActivationType_RELU) { + activation_min_ = 0.f; + } else if (param->activation_type_ == ActivationType_RELU6) { + activation_min_ = 0.f; + activation_max_ = 6.f; } + std::string program_name = "Arithmetic"; std::string source = arithmetic_source; ocl_runtime_->LoadSource(program_name, source); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 7e49f8562f..c4d9f504da 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -18,12 +18,15 @@ #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_ARITHMETIC_H_ #include +#include #include #include "src/runtime/kernel/arm/fp32/arithmetic_fp32.h" #include "src/runtime/kernel/opencl/opencl_kernel.h" namespace mindspore::kernel { +extern std::set SupportedOpenCLArithmetics; + class ArithmeticOpenCLKernel : public OpenCLKernel { public: ArithmeticOpenCLKernel(OpParameter *parameter, const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index 549b420081..6d81fc9d5b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -23,6 +23,7 @@ #include "src/runtime/kernel/opencl/utils.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "schema/ops_generated.h" #include "src/runtime/kernel/opencl/cl/conv2d.cl.inc" #include "src/runtime/kernel/opencl/cl/winograd.cl.inc" @@ -30,6 +31,11 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; using mindspore::lite::RET_ERROR; using mindspore::lite::RET_OK; +using mindspore::schema::ActivationType_LEAKY_RELU; +using mindspore::schema::ActivationType_RELU; +using mindspore::schema::ActivationType_RELU6; +using mindspore::schema::ActivationType_SIGMOID; +using mindspore::schema::ActivationType_TANH; using mindspore::schema::PrimitiveType_Conv2D; using mindspore::schema::PrimitiveType_FullConnection; @@ -59,6 +65,10 @@ int Conv2DOpenCLKernel::CheckSpecs() { MS_LOG(ERROR) << "Conv2D only supports 4D output Tensor but get " << out_tensors_.front()->shape().size() << "D."; return RET_ERROR; } + if (param_->act_type_ != ActType_No && param_->act_type_ != ActType_Relu && param_->act_type_ != ActType_Relu6) { + MS_LOG(ERROR) << "Unsupported activation type " << param_->act_type_; + return RET_ERROR; + } return RET_OK; } @@ -72,9 +82,16 @@ int Conv2DOpenCLKernel::Prepare() { IH_ = input_shape[1]; IW_ = input_shape[2]; CI_ = input_shape[3]; - OH_ = output_shape[1]; - OW_ = output_shape[2]; - CO_ = output_shape[3]; + // for fusion Conv2D and Reshape(N11C->NC) + if (output_shape.size() == 2) { + OH_ = 1; + OW_ = 1; + CO_ = output_shape[1]; + } else { // output_shape.size()==4 + OH_ = output_shape[1]; + OW_ = output_shape[2]; + CO_ = output_shape[3]; + } CI_SLICES_ = UP_DIV(CI_, C4NUM); CO_SLICES_ = UP_DIV(CO_, C4NUM); KH_ = param_->kernel_h_; @@ -91,7 +108,7 @@ int Conv2DOpenCLKernel::Prepare() { if (use_winograd_) { MS_LOG(DEBUG) << "use winograd"; std::string program_name = "winograd"; - ocl_runtime_->LoadSource(program_name, winograd_source); + ocl_runtime_->LoadSource(program_name, GetActDefines() + winograd_source); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, 
"Winograd4x4To36"); ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); @@ -100,7 +117,7 @@ int Conv2DOpenCLKernel::Prepare() { std::string program_name = "conv2d"; std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + std::to_string(block_size_.C); - ocl_runtime_->LoadSource(program_name, conv2d_source); + ocl_runtime_->LoadSource(program_name, GetActDefines() + conv2d_source); ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); } @@ -347,13 +364,6 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { } void Conv2DOpenCLKernel::SetConstArgs() { - auto param = reinterpret_cast(op_parameter_); - cl_int act_type = 0; - if (param->act_type_ == ActType_Relu) { - act_type = 1; - } else if (param->act_type_ == ActType_Relu6) { - act_type = 3; - } cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; @@ -380,12 +390,13 @@ void Conv2DOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape); ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, output_shape); - ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, act_type); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn++, static_cast(param_->act_type_)); + ocl_runtime_->SetKernelArg(kernel_36to4x4_, arg_cn, static_cast(alpha_)); } else { arg_cn = 2; - cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; - cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; - cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; + cl_int4 kernel_stride = {KH_, KW_, param_->stride_h_, param_->stride_w_}; + cl_int4 pad = {param_->pad_u_, param_->pad_d_, param_->pad_l_, param_->pad_r_}; + cl_int2 dilation = {param_->dilation_h_, param_->dilation_w_}; ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); @@ -393,7 +404,8 @@ void Conv2DOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); - ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, static_cast(param_->act_type_)); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast(alpha_)); } } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h index 95bef1d513..bdc2fcdddf 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h @@ -44,6 +44,9 @@ class Conv2DOpenCLKernel : public OpenCLKernel { int Run() override; int Tune() override; + // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU + float alpha_{0.0f}; + private: void SetBlockSize(); int InitFilter(); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 60bd36d7ea..953d3e4c08 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ 
@@ -22,11 +22,14 @@
 #ifndef PROGRAM_WITH_IL
 #include "src/runtime/kernel/opencl/cl/conv2d_transpose.cl.inc"
 #endif
+#include "src/runtime/kernel/opencl/utils.h"
 
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
 using mindspore::schema::PrimitiveType_DeConv2D;
 
 namespace mindspore::kernel {
@@ -38,6 +41,10 @@ int Conv2dTransposeOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "only support kernel - stride == 2 * pad";
     return RET_ERROR;
   }
+  if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 
@@ -47,7 +54,7 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
-  std::string source = conv2d_transpose_source;
+  std::string source = GetActDefines() + conv2d_transpose_source;
   std::string program_name = "conv2d_transpose";
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
@@ -102,6 +109,7 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
   ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, static_cast<cl_int>(param->act_type_));
 }
 
 int Conv2dTransposeOpenCLKernel::InitWeights() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
index 7367ce7e4e..a288e660fb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc
@@ -29,48 +29,38 @@ using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_ERROR;
 using mindspore::lite::RET_OK;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::ActivationType_TANH;
 using mindspore::schema::PrimitiveType_FullConnection;
 
 namespace mindspore::kernel {
 
-int FullConnectionOpenCLKernel::Init() {
-  // deleted soon
-  return CheckSpecs();
-}
-
 int FullConnectionOpenCLKernel::CheckSpecs() {
   auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
-  transposeA = param->a_transpose_;
-  if (transposeA) {
+  if (param->a_transpose_) {
     MS_LOG(ERROR) << "fullconnection only support a_transpose_=false yet.";
     return RET_ERROR;
   }
-  transposeB = param->b_transpose_;
-  enable_fp16_ = ocl_runtime_->GetFp16Enable();
   if ((in_tensors_[0]->shape().size() != 4 && in_tensors_[0]->shape().size() != 2) ||
       (out_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 2)) {
     MS_LOG(ERROR) << "fullconnection only support input output shape size = 2 or 4";
     return RET_ERROR;
   }
-  switch (param->act_type_) {
-    case ActType_No:
-      break;
-    case ActType_Relu:
-      activation_min_ = 0.f;
-      break;
-    case ActType_Relu6:
-      activation_min_ = 0.f;
-      activation_max_ = 6.f;
-      break;
-    default:
-      MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
-      return RET_ERROR;
+  if (param->act_type_ != ActType_No && param->act_type_ != ActType_Relu && param->act_type_ != ActType_Relu6) {
+    MS_LOG(ERROR) << "Unsupported activation type " << param->act_type_;
+    return RET_ERROR;
   }
   return RET_OK;
 }
 
 int FullConnectionOpenCLKernel::Prepare() {
-  std::string kernel_name = "FullConnection_NHWC4";
+  auto param = reinterpret_cast<MatMulParameter *>(op_parameter_);
+  transposeA = param->a_transpose_;
+  transposeB = param->b_transpose_;
+  enable_fp16_ = ocl_runtime_->GetFp16Enable();
+
+  std::string kernel_name = "FullConnection";
   inShape = GpuTensorInfo(in_tensors_[0]);
   outShape = GpuTensorInfo(out_tensors_[0]);
 #ifdef PROGRAM_WITH_IL
@@ -78,7 +68,7 @@ int FullConnectionOpenCLKernel::Prepare() {
 #else
   std::string source = fullconnection_source;
   std::string program_name = "FullConnection";
-  ocl_runtime_->LoadSource(program_name, source);
+  ocl_runtime_->LoadSource(program_name, GetActDefines() + source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
   auto ret = InitWeights();
@@ -200,11 +190,11 @@ void FullConnectionOpenCLKernel::SetConstArgs() {
                        static_cast<int>(inShape.C)};
   cl_int2 out_shape = {static_cast<int>(outShape.N), static_cast<int>(outShape.C)};
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, padWeight_, lite::opencl::MemType::BUF);
+  auto *param = reinterpret_cast<MatMulParameter *>(op_parameter_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, bias_);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_shape);
   ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_shape);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_min_);
-  ocl_runtime_->SetKernelArg(kernel_, arg_count++, activation_max_);
+  ocl_runtime_->SetKernelArg(kernel_, arg_count, static_cast<cl_int>(param->act_type_));
 }
 
 int FullConnectionOpenCLKernel::Run() {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
index d507eecf35..275ce3f12a 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h
@@ -37,7 +37,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   int InitWeights() override;
   void SetConstArgs() override;
   void SetGlobalLocal() override;
-  int Init() override;
   int Tune() override { return lite::RET_OK; }
 
  private:
@@ -46,8 +45,6 @@ class FullConnectionOpenCLKernel : public OpenCLKernel {
   bool enable_fp16_{false};
   bool transposeA{false};
   bool transposeB{true};
-  float activation_min_{-FLT_MAX};
-  float activation_max_{FLT_MAX};
   GpuTensorInfo inShape = GpuTensorInfo(nullptr);
   GpuTensorInfo outShape = GpuTensorInfo(nullptr);
 };
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
index e79c367b85..89d892a04d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.cc
@@ -35,6 +35,15 @@ using mindspore::schema::PrimitiveType_Scale;
 
 namespace mindspore::kernel {
 
+int ScaleOpenCLKernel::CheckSpecs() {
+  auto *param = reinterpret_cast<ScaleParameter *>(op_parameter_);
+  if (param->activation_type_ != ActType_No && param->activation_type_ != ActType_Relu &&
+      param->activation_type_ != ActType_Relu6) {
+    return RET_ERROR;
+  }
+  return RET_OK;
+}
+
 ScaleOpenCLKernel::~ScaleOpenCLKernel() {
   auto allocator = ocl_runtime_->GetAllocator();
   if (scale_ptr_ != nullptr) {
@@ -185,7 +194,7 @@ int ScaleOpenCLKernel::Init() {
     kernel_name += "_BUF";
   }
   std::string program_name = "Scale";
-  std::string source = scale_source;
+  std::string source = GetActDefines() + scale_source;
   ocl_runtime_->LoadSource(program_name, source);
   error_code = ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
@@ -202,13 +211,6 @@ int ScaleOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
   auto *param = reinterpret_cast<ScaleParameter *>(op_parameter_);
-  cl_int act_type = 0;
-  if (param->activation_type_ == ActType_Relu) {
-    act_type = 1;
-  } else if (param->activation_type_ == ActType_Relu6) {
-    act_type = 3;
-  }
-
   int arg_idx = 0;
   ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
   if (weight_vector_flag_) {
@@ -242,7 +244,7 @@ int ScaleOpenCLKernel::Run() {
       ocl_runtime_->SetKernelArg(kernel_, arg_idx++, UP_DIV(in_tensors_[1]->shape()[0], C4NUM));
     }
   }
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, act_type);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, param->activation_type_);
   ocl_runtime_->RunKernel(kernel_, global_size_, local_size_);
   return RET_OK;
 }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
index e214fabb37..958d2489dc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h
@@ -30,6 +30,7 @@ class ScaleOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~ScaleOpenCLKernel() override;
 
+  int CheckSpecs() override;
   int Init() override;
   int Run() override;
   int InitWeights() override;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
new file mode 100644
index 0000000000..0488a92f59
--- /dev/null
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_fusion.cc
@@ -0,0 +1,23 @@
+/**
+ * Copyright 2020 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "src/runtime/kernel/opencl/opencl_subgraph.h"
+
+namespace mindspore::kernel {
+
+void OpenCLSubGraph::Fusion() {}
+
+}  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index e79f689d27..481968f1e3 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -222,6 +222,7 @@ class OpenCLKernel : public LiteKernel {
 
   lite::opencl::MemType GetMemType() { return out_mem_type_; }
   void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
+  OpParameter *GetParameter() { return op_parameter_; }
 
   virtual std::vector<BaseTuningParameter> GenerateTuningParam() {
     size_t ndim = global_size_.size();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
index 8668e14790..095fc45b47 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc
@@ -352,7 +352,7 @@ int OpenCLSubGraph::Prepare() {
     MS_LOG(ERROR) << "Create OpenCLExecutor fail";
     return RET_ERROR;
   }
-
+  Fusion();
   auto ret = Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "OpenCL subgraph init fail";
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
index d14e5dbd12..88a7f3781b 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.h
@@ -78,6 +78,9 @@ class OpenCLSubGraph : public SubGraphKernel {
   std::set<LiteKernel *> nodes_set_;
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
   lite::opencl::OpenCLRuntime *ocl_runtime_{nullptr};
+
+ private:
+  void Fusion();
 };
 
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.cc b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
index 0d8f313b35..185461c6db 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.cc
@@ -25,6 +25,11 @@
 
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::opencl::MemType;
+using mindspore::schema::ActivationType_LEAKY_RELU;
+using mindspore::schema::ActivationType_RELU;
+using mindspore::schema::ActivationType_RELU6;
+using mindspore::schema::ActivationType_SIGMOID;
+using mindspore::schema::ActivationType_TANH;
 
 namespace mindspore::lite {
 kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
@@ -40,6 +45,15 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
 
 namespace mindspore::kernel {
 
+std::string GetActDefines() {
+  static std::string act_defines = "#define ActivationType_RELU " + std::to_string(ActivationType_RELU) +
+                                   "\n#define ActivationType_RELU6 " + std::to_string(ActivationType_RELU6) +
+                                   "\n#define ActivationType_LEAKY_RELU " + std::to_string(ActivationType_LEAKY_RELU) +
+                                   "\n#define ActivationType_TANH " + std::to_string(ActivationType_TANH) +
+                                   "\n#define ActivationType_SIGMOID " + std::to_string(ActivationType_SIGMOID) + "\n";
+  return act_defines;
+}
+
 int GetUpPow2(int n) {
   int i = 0;
   int j = 0;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/utils.h b/mindspore/lite/src/runtime/kernel/opencl/utils.h
index 5e71f8bb1d..0971b88913 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/utils.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/utils.h
@@ -34,6 +34,8 @@ kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, con
 
 namespace mindspore::kernel {
 
+std::string GetActDefines();
+
 int GetUpPow2(int n);
 
 int GetMaxDivisor(int x, int divisor);
diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt
index 571c0cd046..82b989a1f8 100644
--- a/mindspore/lite/test/CMakeLists.txt
+++ b/mindspore/lite/test/CMakeLists.txt
@@ -84,6 +84,7 @@ if (SUPPORT_GPU)
         ${KERNEL_OP_SRC}
         ${GPU_KERNEL_OP_SRC}
         ${LITE_DIR}/src/runtime/kernel/opencl/opencl_subgraph.cc
+        ${LITE_DIR}/src/runtime/kernel/opencl/opencl_fusion.cc
         ${LITE_DIR}/src/runtime/kernel/opencl/utils.cc
     )
 endif()
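
[Editor's note -- illustration, not part of the original patch]
The recurring "GetActDefines() + <kernel>_source" pattern above works because the
schema::ActivationType values are prepended to every OpenCL program as #define lines,
so a .cl kernel can compare the act_type argument set in SetConstArgs() against the
same numeric values the C++ side uses (RELU is 1 and RELU6 is 3, which is why the
hand-written act_type remapping could be deleted from conv2d.cc and scale.cc). Below
is a minimal sketch of how a kernel built this way might branch on act_type; the
helper name DoActivation and the FLT4 fallback are assumptions made for this example,
not code from the patch.

// Illustrative sketch only -- NOT from the patch.
#ifndef FLT4
#define FLT4 float4  // the runtime normally defines FLT4 (half4 when fp16 is enabled)
#endif

// act_type arrives via SetKernelArg(..., static_cast<cl_int>(param_->act_type_));
// alpha would be the scalar PReLU weight that a Conv2D+PReLU fusion stores in alpha_.
FLT4 DoActivation(FLT4 v, int act_type, float alpha) {
  if (act_type == ActivationType_RELU) {
    return max(v, (FLT4)(0.0f));                            // relu(x) = max(x, 0)
  } else if (act_type == ActivationType_RELU6) {
    return clamp(v, (FLT4)(0.0f), (FLT4)(6.0f));            // relu6(x) = min(max(x, 0), 6)
  } else if (act_type == ActivationType_LEAKY_RELU) {
    return select(v * (FLT4)(alpha), v, v > (FLT4)(0.0f));  // keep x where x > 0, else alpha * x
  }
  return v;  // ActivationType_NO_ACTIVATION
}

OpenCLSubGraph::Fusion() itself is still an empty stub in this patch; per the comment
added in conv2d.h, a follow-up is expected to fold a scalar-weight PReLU into Conv2D by
setting act_type_ to ActivationType_LEAKY_RELU and filling in alpha_.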