From 1f89ea759bc8d13229a9498c1a65be1b5f85ab0a Mon Sep 17 00:00:00 2001
From: chenzupeng
Date: Thu, 17 Sep 2020 10:39:13 +0800
Subject: [PATCH] transpose support NCHW2NHWC, fix GPU init failed, disable
 winograd

---
 mindspore/lite/src/lite_session.cc                  |  8 ++-
 .../kernel/opencl/kernel/arithmetic_self.cc         |  1 +
 .../kernel/opencl/kernel/convolution.cc             | 45 +++++++------
 .../kernel/opencl/kernel/convolution.h              |  6 +-
 mindspore/lite/src/runtime/opencl/opencl_runtime.h  |  4 +-
 mindspore/lite/test/models_tflite_gpu.cfg           |  1 +
 .../runtime/kernel/opencl/transpose_tests.cc        | 59 +++++++++++++----
 7 files changed, 88 insertions(+), 36 deletions(-)

diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc
index 0af5b7d992..ddce537920 100644
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -346,8 +346,12 @@ int LiteSession::Init(Context *context) {
   if (context_->device_type_ == DT_GPU) {
     auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
     opencl_runtime->SetFp16Enable(context_->float16_priority);
-    opencl_runtime->Init();
-    MS_LOG(INFO) << "Init OpenCL runtime.";
+    if (opencl_runtime->Init() != RET_OK) {
+      context_->device_type_ = DT_CPU;
+      MS_LOG(WARNING) << "Init OpenCL runtime failed, change to CPU mode.";
+    } else {
+      MS_LOG(INFO) << "Init OpenCL runtime success.";
+    }
   }
 #endif
   executor = new Executor();
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
index 4473fc5040..71294b6edf 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@@ -97,6 +97,7 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
       break;
     case PrimitiveType_Round:
       kernel_name[0] += "_ElementRound";
+      break;
    case PrimitiveType_Neg:
      kernel_name[0] += "_ElementNeg";
      break;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index 5196799b8c..35d8abb470 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -68,7 +68,7 @@ int ConvolutionOpenCLKernel::Init() {
   TILES_X_ = UP_DIV(OW_, 4);
   TILES_Y_ = UP_DIV(OH_, 4);
   TILES_XY_ = TILES_X_ * TILES_Y_;
-  use_winograd_ = UseWinograd4x4To6x6() && use_fp16_;
+  use_winograd_ = UseWinograd4x4To6x6();
 
   // build kernel
   if (use_winograd_) {
@@ -247,7 +247,7 @@ int ConvolutionOpenCLKernel::InitBuffer() {
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
   if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
+    if (out_tensors_[0]->Width() * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
       {
         im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
         im_dst_y = out_tensors_[0]->Height();
@@ -314,7 +314,8 @@ int ConvolutionOpenCLKernel::Run() {
 
   if (use_winograd_) {
     ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
-    ocl_runtime_->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2},
+                            nullptr);
     ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
     std::vector<size_t> global, local;
@@ -414,7 +415,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
     code += "    out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
   }
 
-  if (OW_ * CO_SLICES_ < 65536) {
+  if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
     code += "  WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
   } else {
     code += "  WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
@@ -616,23 +617,27 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
     "        FLT4 BtD_row[6] = {0};\n"
     "        for (int y = 0; y < 6; y++)\n"
     "        {\n"
-    "            int y_idx = tile_y * 4 - PAD + y;\n";
+    "            int ih = tile_y * 4 - PAD + y;\n";
 
   if (op_format_ == Format_NHWC4) {
-    code +=
-      "            for (int x = 0; x < 6; x++)\n"
-      "            {\n"
-      "                int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n";
+    code += "            int y_idx = ih;\n";
   } else if (op_format_ == Format_NC4HW4) {
     code +=
-      "            if(y_idx < 0 || y_idx >= IH)\n"
-      "            {\n"
-      "                continue;\n"
-      "            }\n"
-      "            y_idx += slice * IH;\n"
-      "            for (int x = 0; x < 6; x++)\n"
-      "            {\n"
-      "                int x_idx = tile_x * 4 - PAD + x;\n";
+      "            if(ih < 0 || ih >= IH) {continue;}\n"
+      "            int y_idx = slice * IH + ih;\n";
+  }
+
+  code +=
+    "            for (int x = 0; x < 6; x++)\n"
+    "            {\n"
+    "                int iw = tile_x * 4 - PAD + x;\n";
+
+  if (op_format_ == Format_NHWC4) {
+    code +=
+      "                if(iw < 0 || iw >= IW) {continue;}\n"
+      "                int x_idx = iw * SLICES + slice;\n";
+  } else if (op_format_ == Format_NC4HW4) {
+    code += "                int x_idx = iw;\n";
   }
 
   code +=
@@ -792,9 +797,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
 
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   if (param->act_type_ == ActType_Relu) {
-    code += "        acc = max(acc, (FLT4)(0.0f));\n";
+    code += "        acc = max(acc, (FLT4)(0.0f));\n\n";
   } else if (param->act_type_ == ActType_Relu6) {
-    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n";
+    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n\n";
   }
 
   code +=
@@ -838,7 +843,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   }
 
   if (op_format_ == Format_NHWC4) {
-    if (OW_ * CO_SLICES_ > 65536) {
+    if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
       local_w = 4;
     }
   }
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 99a7bc522f..41937c3b85 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -81,8 +81,10 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
 
   bool UseWinograd4x4To6x6() {
     auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
-                            param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
+    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
+                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
+                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
+                            IW_ == OW_;
     const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
     const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
index 157400c308..08fff021c4 100644
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -16,7 +16,8 @@ * you may not use this file except in compliance with the License.
 
 #ifndef MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
 #define MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
-
+// Get from Device?
+#define MAX_IMAGE2D_SIZE 65535
 #include
 #include
 #include
@@ -127,7 +128,6 @@ class OpenCLRuntime {
   int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
   int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
   bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);
-  bool IsInitOK() {return init_done_;}
 
   /**
    * Get kernel max worker group size.
diff --git a/mindspore/lite/test/models_tflite_gpu.cfg b/mindspore/lite/test/models_tflite_gpu.cfg
index 7d6d2f31f3..dd3839345f 100644
--- a/mindspore/lite/test/models_tflite_gpu.cfg
+++ b/mindspore/lite/test/models_tflite_gpu.cfg
@@ -6,3 +6,4 @@ mtk_AADB_HADB_MBV2_model_fp32.tflite
 hiai_cn_recognize_modify_padv2.tflite
 hiai_cv_focusShootOCRModel_08.tflite
 hiai_model_normalize_object_scene_ps_20200519.tflite
+inception_v3.tflite
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
index 10c430b470..f81dbc530b 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@@ -44,10 +44,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
     return;
   }
   param->num_axes_ = 4;
-  param->perm_[0] = 0;
-  param->perm_[1] = 3;
-  param->perm_[2] = 1;
-  param->perm_[3] = 2;
+  param->perm_[0] = shape[3];
+  param->perm_[1] = shape[4];
+  param->perm_[2] = shape[5];
+  param->perm_[3] = shape[6];
   auto allocator = ocl_runtime->GetAllocator();
   int h = shape[0];
   int w = shape[1];
@@ -60,9 +60,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
     MS_LOG(ERROR) << "tensor_x create error.";
     return;
   }
-  std::vector<int> out_shape = {1, c, h, w};
+  std::vector<int> out_shape = {input_shape[param->perm_[0]], input_shape[param->perm_[1]],
+                                input_shape[param->perm_[2]], input_shape[param->perm_[3]]};
   auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
-                                                       out_shape, schema::Format_NCHW);
+                                                       out_shape, schema::Format_NHWC);
   auto tensor_out = tensor_out_ptr.get();
   if (tensor_out == nullptr) {
     MS_LOG(ERROR) << "tensor_out create error.";
@@ -105,25 +106,63 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
   lite::opencl::OpenCLRuntime::DeleteInstance();
 }
 
-TEST_F(TestTransposeOpenCL, TransposeFp32) {
+TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp32) {
   int h = 2;
   int w = 2;
   int c = 3;
-  std::vector<int> shape = {h, w, c};
+  int perm0 = 0;
+  int perm1 = 3;
+  int perm2 = 1;
+  int perm3 = 2;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
   std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
   std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
 
   RunTestTranspose(shape, input_data.data(), output_data.data(), false);
 }
 
-TEST_F(TestTransposeOpenCL, TransposeFp16) {
+TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp16) {
   int h = 2;
   int w = 2;
   int c = 3;
-  std::vector<int> shape = {h, w, c};
+  int perm0 = 0;
+  int perm1 = 3;
+  int perm2 = 1;
+  int perm3 = 2;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
   std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
   std::vector<float16_t> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};
 
   RunTestTranspose(shape, input_data.data(), output_data.data(), true);
 }
+
+TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp32) {
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int perm0 = 0;
+  int perm1 = 2;
+  int perm2 = 3;
+  int perm3 = 1;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
+  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
+
+  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
+}
+
+TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp16) {
+  int h = 2;
+  int w = 2;
+  int c = 3;
+  int perm0 = 0;
+  int perm1 = 2;
+  int perm2 = 3;
+  int perm3 = 1;
+  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
+  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
+  std::vector<float16_t> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};
+
+  RunTestTranspose(shape, input_data.data(), output_data.data(), true);
+}
 }  // namespace mindspore
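
Two notes on the patch, with a small standalone check.

First, in Run() the winograd dispatch replaces TILES_XY_ / 2 with UP_DIV(TILES_XY_, 2) (in MindSpore Lite, UP_DIV(x, y) is ceiling division, (x + y - 1) / y): each work-item along that axis handles two tiles, so for an odd count such as TILES_XY_ = 9, integer division 9 / 2 = 4 would leave the last tile uncomputed, while UP_DIV(9, 2) = 5 covers it; the same applies to CO_SLICES_.

Second, the expected vectors in the new transpose tests follow from the convention used by the updated helper, out_shape[k] = input_shape[perm[k]], i.e. output axis k reads input axis perm[k]. The sketch below re-derives both expected vectors from a {1, 2, 2, 3} input holding 0..11. It is an illustrative, self-contained C++ program, not code from the patch; Permute4D is a hypothetical name introduced only for this check.

// permute_check.cc -- verifies the output_data vectors in the tests above.
#include <cassert>
#include <vector>

// Gather-style 4-D permutation over row-major data: in_coord[perm[k]] = out_coord[k].
std::vector<float> Permute4D(const std::vector<float> &in, const std::vector<int> &in_shape,
                             const std::vector<int> &perm) {
  std::vector<int> out_shape(4);
  for (int k = 0; k < 4; ++k) out_shape[k] = in_shape[perm[k]];
  std::vector<int> stride(4, 1);  // row-major strides of the input
  for (int k = 2; k >= 0; --k) stride[k] = stride[k + 1] * in_shape[k + 1];
  std::vector<float> out;
  out.reserve(in.size());
  std::vector<int> o(4, 0);  // output coordinate
  for (o[0] = 0; o[0] < out_shape[0]; ++o[0])
    for (o[1] = 0; o[1] < out_shape[1]; ++o[1])
      for (o[2] = 0; o[2] < out_shape[2]; ++o[2])
        for (o[3] = 0; o[3] < out_shape[3]; ++o[3]) {
          int offset = 0;
          for (int k = 0; k < 4; ++k) offset += o[k] * stride[perm[k]];  // input axis perm[k] moves with o[k]
          out.push_back(in[offset]);
        }
  return out;
}

int main() {
  const std::vector<float> in = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};  // shape {1, 2, 2, 3}
  const std::vector<int> in_shape = {1, 2, 2, 3};
  // perm {0, 3, 1, 2}: reproduces output_data in the TransposeNHWC2NCHW* tests.
  assert((Permute4D(in, in_shape, {0, 3, 1, 2}) ==
          std::vector<float>{0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11}));
  // perm {0, 2, 3, 1}: reproduces output_data in the TransposeNCHW2NHWC* tests.
  assert((Permute4D(in, in_shape, {0, 2, 3, 1}) ==
          std::vector<float>{0, 6, 1, 7, 2, 8, 3, 9, 4, 10, 5, 11}));
  return 0;
}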