From d0ff232aa02184acad8d7077d4682def63f5acc3 Mon Sep 17 00:00:00 2001
From: wangdongxu <wangdongxu6@huawei.com>
Date: Tue, 18 Aug 2020 18:17:07 +0800
Subject: [PATCH] optimize opencl winograd kernel performance

---
 .../kernel/opencl/kernel/convolution.cc       | 82 +++++++++++--------
 .../kernel/opencl/kernel/convolution.h        |  4 +-
 .../kernel/opencl/convolution_tests.cc        | 82 +++++++++++++------
 3 files changed, 103 insertions(+), 65 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index e6d4c9b0a8..788f6fa3fc 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -476,12 +476,10 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
 
 std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
   return "#define CI_TILE 4\n"
+         "#define H 36\n"
+         "//#define W 256\n"
          "//#define CI 96\n"
-         "#define IH 36\n"
-         "//#define IW 256\n"
-         "//#define CO 80\n"
-         "#define OH 36\n"
-         "//#define OW 256\n"
+         "//#define CO 80s\n"
          "//#define CI_SLICES 24\n"
          "//#define CO_SLICES 20\n"
          "\n"
@@ -500,59 +498,71 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
          "                                  int4 input_shape,         // N 36 H/4*W/4 CI_SLICES\n"
          "                                  int4 output_shape)        // N 36 H/4*W/4 CO_SLICES\n"
          "{\n"
-         "    int ow = get_global_id(0) * 2;\n"
-         "    int oh = get_global_id(1);\n"
+         "    int w = get_global_id(0) * 2;\n"
+         "    int h = get_global_id(1);\n"
          "    int co_slice = get_global_id(2) * 2;\n"
          "\n"
          "    int CI_SLICES = input_shape.w;\n"
-         "    int IW = input_shape.z;\n"
+         "    int W = input_shape.z;\n"
          "    int CO_SLICES = output_shape.w;\n"
-         "    int OW = IW;\n"
          "\n"
-         "    if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
+         "    if (h >= H || w >= W || co_slice >= CO_SLICES)\n"
          "    {\n"
          "        return;\n"
          "    }\n"
          "\n"
-         "    __global float16 *w_ptr = weight + (co_slice / 2 * 36 + oh) * CI_SLICES * 2;\n"
-         "    int y_idx = oh;\n"
          "    FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
          "    FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
          "    FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
          "    FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
+         "\n"
+         "    int y_idx = h;\n"
+         "    __global float16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n"
          "    for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
          "    {\n"
-         "        FLT4 in0 = READ_FLT4(input, smp_none, (int2)(ow + 0, y_idx));\n"
-         "        FLT4 in1 = READ_FLT4(input, smp_none, (int2)(ow + 1, y_idx));\n"
+         "        FLT4 in0 = READ_FLT4(input, smp_none, (int2)(w + 0, y_idx));\n"
+         "        FLT4 in1 = READ_FLT4(input, smp_none, (int2)(w + 1, y_idx));\n"
          "        y_idx += 36;\n"
          "\n"
-         "        float16 w0 = w_ptr[0], w1 = w_ptr[1];\n"
-         "        w_ptr += 2;\n"
+         "        float16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n"
+         "        weight_ptr += 2;\n"
+         "\n"
          "\n"
-         "        out00 += in0.x * w0.s0123;\n"
-         "        out00 += in0.y * w0.s4567;\n"
-         "        out00 += in0.z * w0.s89ab;\n"
-         "        out00 += in0.w * w0.scdef;\n"
+         "        out00 += in0.x * weight0.s0123;\n"
+         "        out00 += in0.y * weight0.s4567;\n"
+         "        out00 += in0.z * weight0.s89ab;\n"
+         "        out00 += in0.w * weight0.scdef;\n"
          "\n"
-         "        out01 += in1.x * w0.s0123;\n"
-         "        out01 += in1.y * w0.s4567;\n"
-         "        out01 += in1.z * w0.s89ab;\n"
-         "        out01 += in1.w * w0.scdef;\n"
+         "        out01 += in1.x * weight0.s0123;\n"
+         "        out01 += in1.y * weight0.s4567;\n"
+         "        out01 += in1.z * weight0.s89ab;\n"
+         "        out01 += in1.w * weight0.scdef;\n"
          "\n"
-         "        out10 += in0.x * w1.s0123;\n"
-         "        out10 += in0.y * w1.s4567;\n"
-         "        out10 += in0.z * w1.s89ab;\n"
-         "        out10 += in0.w * w1.scdef;\n"
+         "        out10 += in0.x * weight1.s0123;\n"
+         "        out10 += in0.y * weight1.s4567;\n"
+         "        out10 += in0.z * weight1.s89ab;\n"
+         "        out10 += in0.w * weight1.scdef;\n"
          "\n"
-         "        out11 += in1.x * w1.s0123;\n"
-         "        out11 += in1.y * w1.s4567;\n"
-         "        out11 += in1.z * w1.s89ab;\n"
-         "        out11 += in1.w * w1.scdef;\n"
+         "        out11 += in1.x * weight1.s0123;\n"
+         "        out11 += in1.y * weight1.s4567;\n"
+         "        out11 += in1.z * weight1.s89ab;\n"
+         "        out11 += in1.w * weight1.scdef;\n"
+         "    }\n"
+         "\n"
+         "    WRITE_FLT4(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n"
+         "    if (w + 1 < W)\n"
+         "    {\n"
+         "        WRITE_FLT4(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n"
+         "    }\n"
+         "\n"
+         "    if (co_slice + 1 < CO_SLICES)\n"
+         "    {\n"
+         "        WRITE_FLT4(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n"
+         "        if (w + 1 < W)\n"
+         "        {\n"
+         "            WRITE_FLT4(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n"
+         "        }\n"
          "    }\n"
-         "    WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 0) * 36 + oh), out00);\n"
-         "    WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 0) * 36 + oh), out01);\n"
-         "    WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 1) * 36 + oh), out10);\n"
-         "    WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 1) * 36 + oh), out11);\n"
          "}";
 }
 
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index cfd426b944..8c651740ef 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -66,8 +66,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
     auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
     const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
                             param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
-    const bool channel_good = CO_SLICES % 4 == 0 && CI_SLICES >= 16 && CO_SLICES >= 16;
-    const bool hw_good = TILES_X * TILES_Y >= 32;
+    const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
+    const bool hw_good = TILES_X * TILES_Y >= 16;
     return attr_valid && channel_good && hw_good;
   }
 
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index 966bd32af2..f269d59b54 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -77,33 +77,33 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
   auto testcase_path = data_path + "/" + attr_str + "/";
   auto input_file = testcase_path + (input_format == schema::Format_NHWC4 ? "input_NHWC4.bin" : "input_NHWC.bin");
   auto weight_file = testcase_path + "weight_OHWI.bin";
-  auto bias_file = testcase_path + "bias_C4.bin";
+  auto bias_file = testcase_path + "bias_C.bin";
   auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin");
-  std::cout << input_file << std::endl;
-  std::cout << weight_file << std::endl;
-  std::cout << bias_file << std::endl;
-  std::cout << expect_file << std::endl;
+  std::cout << "input_file:" << input_file << std::endl;
+  std::cout << "weight_file:" << weight_file << std::endl;
+  std::cout << "bias_file:" << bias_file << std::endl;
+  std::cout << "expect_file:" << expect_file << std::endl;
 
   std::cout << "initialize OpenCLRuntime";
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   ocl_runtime->Init();
   auto allocator = ocl_runtime->GetAllocator();
 
-  std::cout << "create inputs/weights/outputs Tensors(framework do)";
+  std::cout << "create Tensors(framework will do!!!)";
   std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
   std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
   std::vector<int> bias_shape = {param->output_channel_};
   std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
   auto data_type = kNumberTypeFloat32;
-  auto tensorType = schema::NodeType_ValueNode;
-  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensorType);
-  auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType);
-  auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType);
-  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensorType);
+  auto tensor_type = schema::NodeType_ValueNode;
+  auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type);
+  auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type);
+  auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type);
+  auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type);
   std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
   std::vector<lite::tensor::Tensor *> outputs{output_tensor};
 
-  std::cout << "initialize weight Tensors data(framework do)";
+  std::cout << "allocate and initialize weight/bias memory by hand here(framework will do!!!)";
   std::vector<float> weight_vec(weight_tensor->ElementsNum());
   std::vector<float> bias_vec(weight_tensor->ElementsNum());
   weight_tensor->SetData(weight_vec.data());
@@ -111,25 +111,18 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
   LoadData(weight_tensor->Data(), weight_tensor->Size(), weight_file);
   LoadData(bias_tensor->Data(), bias_tensor->Size(), bias_file);
 
-  std::cout << "create OpenCL Kernel";  // weight has been allcated by framework
+  std::cout << "create OpenCL Kernel";  // weight/bias has been allcated by framework
   auto *conv_kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
   conv_kernel->Init();
-  std::vector<LiteKernel *> kernels{conv_kernel};
-
-  // freamework to do!!! allocate memory by hand
-  inputs[0]->MallocData(allocator);
 
   std::cout << "create SubGraphOpenCLKernel";
-  auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, kernels, kernels, kernels);
+  inputs[0]->MallocData(allocator);  // allocate input memory by hand here, framework will do!!!
+  auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, {conv_kernel}, {conv_kernel}, {conv_kernel});
   sub_graph->Init();
-
-  std::cout << "initialize input Tensors data";  // inputs has been allcated by sub_graph->Init()
-  LoadData(input_tensor->Data(), input_tensor->Size(), input_file);
+  LoadData(input_tensor->Data(), input_tensor->Size(), input_file);  // initialize input Tensors data
   printf("input[0] =%.3f\n", reinterpret_cast<float *>(input_tensor->Data())[0]);
   printf("weight[0]=%.3f\n", reinterpret_cast<float *>(weight_tensor->Data())[0]);
   printf("bias[0]  =%.3f\n", reinterpret_cast<float *>(bias_tensor->Data())[0]);
-
-  std::cout << "sub_graph->Run()";
   sub_graph->Run();
 
   std::cout << "compare result";
@@ -144,7 +137,7 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
   }
   delete conv_kernel;
   delete sub_graph;
-  mindspore::lite::opencl::OpenCLRuntime::DeleteInstance();
+  lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 
 TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
@@ -161,18 +154,53 @@ TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
 //    "1x1");
 //}
 
-TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
+TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
   TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
             "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
             "dilationHW_1x1");
 }
-TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) {
+TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) {
   TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
             "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
             "dilationHW_1x1");
 }
 
-TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) {
+// TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) {
+//  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
+//            "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+//            "dilationHW_1x1");
+//}
+
+TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+            "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+
+TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x250) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+            "inputNHWC_1x8x128x100_outputNHWC_1x8x128x250_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+
+TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x300) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+            "inputNHWC_1x8x128x100_outputNHWC_1x8x128x300_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+
+TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x350) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+            "inputNHWC_1x4x64x150_outputNHWC_1x4x64x350_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x400) {
+  TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
+            "inputNHWC_1x4x64x150_outputNHWC_1x4x64x400_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
+            "dilationHW_1x1");
+}
+
+TEST_F(TestConvolutionOpenCL, winograd_08_origin_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) {
   TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
             "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_"
             "1x1x1x1_dilationHW_1x1");