transpose support NCHW2NHWC

fix: GPU init failed, disable winograd
5 years ago · 1f89ea759b
--- a/mindspore/lite/src/lite_session.cc
+++ b/mindspore/lite/src/lite_session.cc
@@ -346,8 +346,12 @@ int LiteSession::Init(Context *context) {
  if (context_->device_type_ == DT_GPU) {
    auto opencl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
    opencl_runtime->SetFp16Enable(context_->float16_priority);
    opencl_runtime->Init();
    MS_LOG(INFO) << "Init OpenCL runtime.";
    if (opencl_runtime->Init() != RET_OK) {
      context_->device_type_ = DT_CPU;
      MS_LOG(WARNING) << "Init OpenCL runtime failed, change to CPU mode.";
    } else {
      MS_LOG(INFO) << "Init OpenCL runtime success.";
    }
  }
 #endif
  executor = new Executor();
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
@@ -97,6 +97,7 @@ void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, Arithme
      break;
    case PrimitiveType_Round:
      kernel_name[0] += "_ElementRound";
      break;
    case PrimitiveType_Neg:
      kernel_name[0] += "_ElementNeg";
      break;
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -68,7 +68,7 @@ int ConvolutionOpenCLKernel::Init() {
  TILES_X_ = UP_DIV(OW_, 4);
  TILES_Y_ = UP_DIV(OH_, 4);
  TILES_XY_ = TILES_X_ * TILES_Y_;
  use_winograd_ = UseWinograd4x4To6x6() && use_fp16_;
  use_winograd_ = UseWinograd4x4To6x6();

  // build kernel
  if (use_winograd_) {
@@ -247,7 +247,7 @@ int ConvolutionOpenCLKernel::InitBuffer() {
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
  size_t im_dst_x, im_dst_y;
  if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
    if (out_tensors_[0]->Width() * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
      {
        im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
        im_dst_y = out_tensors_[0]->Height();
@@ -314,7 +314,8 @@ int ConvolutionOpenCLKernel::Run() {

  if (use_winograd_) {
    ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
    ocl_runtime_->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
    ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2},
                            nullptr);
    ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
  } else {
    std::vector<size_t> global, local;
@@ -414,7 +415,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
    code += "    out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
  }

  if (OW_ * CO_SLICES_ < 65536) {
  if (OW_ * CO_SLICES_ <= MAX_IMAGE2D_SIZE) {
    code += "    WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
  } else {
    code += "    WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
@@ -616,23 +617,27 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
    "    FLT4 BtD_row[6] = {0};\n"
    "    for (int y = 0; y < 6; y++)\n"
    "    {\n"
    "        int y_idx = tile_y * 4 - PAD + y;\n";
    "        int ih = tile_y * 4 - PAD + y;\n";

  if (op_format_ == Format_NHWC4) {
    code +=
      "        for (int x = 0; x < 6; x++)\n"
      "        {\n"
      "             int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n";
    code += "        int y_idx = ih;\n";
  } else if (op_format_ == Format_NC4HW4) {
    code +=
      "        if(y_idx < 0 || y_idx >= IH)\n"
      "        {\n"
      "            continue;\n"
      "        }\n"
      "        y_idx += slice * IH;\n"
      "        for (int x = 0; x < 6; x++)\n"
      "        {\n"
      "            int x_idx = tile_x * 4 - PAD + x;\n";
      "        if(ih < 0 || ih >= IH) {continue;}\n"
      "        int y_idx = slice * IH + ih;\n";
  }

  code +=
    "        for (int x = 0; x < 6; x++)\n"
    "        {\n"
    "            int iw = tile_x * 4 - PAD + x;\n";

  if (op_format_ == Format_NHWC4) {
    code +=
      "            if(iw < 0 || iw >= IW) {continue;}\n"
      "            int x_idx = iw * SLICES + slice;\n";
  } else if (op_format_ == Format_NC4HW4) {
    code += "            int x_idx = iw;\n";
  }

  code +=
@@ -792,9 +797,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {

  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
  if (param->act_type_ == ActType_Relu) {
    code += "    acc = max(acc, (FLT4)(0.0f));\n";
    code += "        acc = max(acc, (FLT4)(0.0f));\n\n";
  } else if (param->act_type_ == ActType_Relu6) {
    code += "    acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n";
    code += "        acc = clamp(acc, (FLT4)(0.0f), (FLT4)(6.0f));\n\n";
  }

  code +=
@@ -838,7 +843,7 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
  }

  if (op_format_ == Format_NHWC4) {
    if (OW_ * CO_SLICES_ > 65536) {
    if (OW_ * CO_SLICES_ > MAX_IMAGE2D_SIZE) {
      local_w = 4;
    }
  }
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -81,8 +81,10 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {

  bool UseWinograd4x4To6x6() {
    auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
                            param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
    const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->stride_h_ == 1 &&
                            param->stride_w_ == 1 && param->pad_u_ == 1 && param->pad_d_ == 1 && param->pad_l_ == 1 &&
                            param->pad_r_ == 1 && param->dilation_h_ == 1 && param->dilation_w_ == 1 && IH_ == OH_ &&
                            IW_ == OW_;
    const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
    return attr_valid && channel_good && hw_good;
--- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h
+++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h
@@ -16,7 +16,8 @@ j* you may not use this file except in compliance with the License.

 #ifndef MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_
 #define MINDSPORE_LITE_SRC_OPENCL_RUNTIME_H_

 // Get from Device?
 #define MAX_IMAGE2D_SIZE 65535
 #include <vector>
 #include <map>
 #include <memory>
@@ -127,7 +128,6 @@ class OpenCLRuntime {
  int UnmapBuffer(const cl::Memory &buffer, void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
  int UnmapBuffer(void *host_ptr, cl::CommandQueue *command_queue = nullptr) const;
  bool SyncCommandQueue(cl::CommandQueue *command_queue = nullptr);
  bool IsInitOK() {return init_done_;}

  /**
   * Get kernel max worker group size.
--- a/mindspore/lite/test/models_tflite_gpu.cfg
+++ b/mindspore/lite/test/models_tflite_gpu.cfg
@@ -6,3 +6,4 @@ mtk_AADB_HADB_MBV2_model_fp32.tflite
 hiai_cn_recognize_modify_padv2.tflite
 hiai_cv_focusShootOCRModel_08.tflite
 hiai_model_normalize_object_scene_ps_20200519.tflite
 inception_v3.tflite
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/transpose_tests.cc
@@ -44,10 +44,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
    return;
  }
  param->num_axes_ = 4;
  param->perm_[0] = 0;
  param->perm_[1] = 3;
  param->perm_[2] = 1;
  param->perm_[3] = 2;
  param->perm_[0] = shape[3];
  param->perm_[1] = shape[4];
  param->perm_[2] = shape[5];
  param->perm_[3] = shape[6];
  auto allocator = ocl_runtime->GetAllocator();
  int h = shape[0];
  int w = shape[1];
@@ -60,9 +60,10 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
    MS_LOG(ERROR) << "tensor_x create error.";
    return;
  }
  std::vector<int> out_shape = {1, c, h, w};
  std::vector<int> out_shape = {input_shape[param->perm_[0]], input_shape[param->perm_[1]],
                                input_shape[param->perm_[2]], input_shape[param->perm_[3]]};
  auto tensor_out_ptr = std::make_unique<lite::Tensor>(TypeId(enable_fp16 ? kNumberTypeFloat16 : kNumberTypeFloat32),
                                                       out_shape, schema::Format_NCHW);
                                                       out_shape, schema::Format_NHWC);
  auto tensor_out = tensor_out_ptr.get();
  if (tensor_out == nullptr) {
    MS_LOG(ERROR) << "tensor_out create error.";
@@ -105,25 +106,63 @@ void RunTestTranspose(const std::vector<int> &shape, void *input_data, void *out
  lite::opencl::OpenCLRuntime::DeleteInstance();
 }

 // Calls RunTestTranspose on twelve fp32 values with the fp16 flag set to false.
 // NOTE(review): this span appears to interleave the pre-change and post-change lines of
 // the hunk -- the two TEST_F headers and the two `shape` declarations below cannot coexist
 // in one translation unit. Confirm which side is current before building from this listing.
 TEST_F(TestTransposeOpenCL, TransposeFp32) {
 TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp32) {
  int h = 2;
  int w = 2;
  int c = 3;
  // Three-element form of the packed argument: {h, w, c} only.
  std::vector<int> shape = {h, w, c};
  // Axis permutation entries appended to the packed argument below.
  int perm0 = 0;
  int perm1 = 3;
  int perm2 = 1;
  int perm3 = 2;
  // Seven-element form of the packed argument: {h, w, c} followed by the four perm entries.
  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};

  // Final argument is the fp16 switch; false selects the fp32 path.
  RunTestTranspose(shape, input_data.data(), output_data.data(), false);
 }

 // Calls RunTestTranspose on twelve float16_t values with the fp16 flag set to true.
 // NOTE(review): this span appears to interleave the pre-change and post-change lines of
 // the hunk -- the two TEST_F headers and the two `shape` declarations below cannot coexist
 // in one translation unit. Confirm which side is current before building from this listing.
 TEST_F(TestTransposeOpenCL, TransposeFp16) {
 TEST_F(TestTransposeOpenCL, TransposeNHWC2NCHWFp16) {
  int h = 2;
  int w = 2;
  int c = 3;
  // Three-element form of the packed argument: {h, w, c} only.
  std::vector<int> shape = {h, w, c};
  // Axis permutation entries appended to the packed argument below.
  int perm0 = 0;
  int perm1 = 3;
  int perm2 = 1;
  int perm3 = 2;
  // Seven-element form of the packed argument: {h, w, c} followed by the four perm entries.
  std::vector<int> shape = {h, w, c, perm0, perm1, perm2, perm3};
  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float16_t> output_data = {0.0f, 3.0f, 6.0f, 9.0f, 1.0f, 4.0f, 7.0f, 10.0f, 2.0f, 5.0f, 8.0f, 11.0f};

  // Final argument is the fp16 switch; true selects the fp16 path.
  RunTestTranspose(shape, input_data.data(), output_data.data(), true);
 }

 TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp32) {
  // Extents in the {h, w, c} order used by the helper's packed argument.
  const int height = 2;
  const int width = 2;
  const int channels = 3;
  // Axis permutation under test.
  const std::vector<int> perm = {0, 2, 3, 1};

  // Packed helper argument: {h, w, c, perm[0], perm[1], perm[2], perm[3]}.
  std::vector<int> shape = {height, width, channels};
  shape.insert(shape.end(), perm.begin(), perm.end());

  std::vector<float> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};

  // fp32 variant: the helper's fp16 switch stays off.
  const bool enable_fp16 = false;
  RunTestTranspose(shape, input_data.data(), output_data.data(), enable_fp16);
 }

 TEST_F(TestTransposeOpenCL, TransposeNCHW2NHWCFp16) {
  // Extents in the {h, w, c} order used by the helper's packed argument.
  const int height = 2;
  const int width = 2;
  const int channels = 3;
  // Axis permutation under test.
  const std::vector<int> perm = {0, 2, 3, 1};

  // Packed helper argument: {h, w, c, perm[0], perm[1], perm[2], perm[3]}.
  std::vector<int> shape = {height, width, channels};
  shape.insert(shape.end(), perm.begin(), perm.end());

  std::vector<float16_t> input_data = {0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f};
  std::vector<float16_t> output_data = {0.0f, 6.0f, 1.0f, 7.0f, 2.0f, 8.0f, 3.0f, 9.0f, 4.0f, 10.0f, 5.0f, 11.0f};

  // fp16 variant: the helper's fp16 switch is turned on.
  const bool enable_fp16 = true;
  RunTestTranspose(shape, input_data.data(), output_data.data(), enable_fp16);
 }
 }  // namespace mindspore