From ce4b3e743f57f1cc9e401f0a65470d7058ed1513 Mon Sep 17 00:00:00 2001
From: chenzupeng <chenzupeng@huawei.com>
Date: Thu, 3 Dec 2020 17:36:01 +0800
Subject: [PATCH] optimize conv2d tuning

---
 .../runtime/kernel/opencl/kernel/conv2d.cc    | 39 +++++++++++++++++++
 .../src/runtime/kernel/opencl/kernel/conv2d.h |  2 +
 .../runtime/kernel/opencl/kernel/reshape.cc   |  4 +-
 .../src/runtime/kernel/opencl/opencl_kernel.h | 20 ++++++++++
 4 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
index 6d81fc9d5b..453ce0035c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -363,6 +363,45 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
   }
 }
 
+std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
+  // don't need to tune local_c
+  std::vector<BaseTuningParameter> tuning_params = {};
+  if (use_winograd_) {
+    return tuning_params;
+  }
+  BaseTuningParameter default_tuning_param = BaseTuningParameter();
+  default_tuning_param.local_size = local_size_;
+  tuning_params.push_back(default_tuning_param);
+
+  std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
+  size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
+  std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
+  std::set<size_t> candidate_y = GenerateLocalByGlobal(global_size_[1]);
+  for (auto x : candidate_x) {
+    if (x <= max_work_items[0]) {
+      for (auto y : candidate_y) {
+        if (y <= max_work_items[1]) {
+          auto group_size = x * y * local_size_[2];
+          if (group_size <= max_workgroup_size) {
+            BaseTuningParameter tuning_param = BaseTuningParameter();
+            tuning_param.local_size = {x, y, local_size_[2]};
+            tuning_params.push_back(tuning_param);
+          }
+        }
+      }
+    }
+  }
+  return tuning_params;
+}
+
+std::string Conv2DOpenCLKernel::Key() {
+  auto key = OpenCLKernel::Key();
+  key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
+         std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" +
+         std::to_string(param_->dilation_w_);
+  return key;
+}
+
 void Conv2DOpenCLKernel::SetConstArgs() {
   cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
   cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
index bdc2fcdddf..60bbd0ccdb 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
@@ -43,6 +43,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {
 
   int Run() override;
   int Tune() override;
+  std::vector<BaseTuningParameter> GenerateTuningParam() override;
+  std::string Key() override;
 
   // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU
   float alpha_{0.0f};
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index cb23e6964b..88931491aa 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -31,8 +31,8 @@ using mindspore::schema::PrimitiveType_Squeeze;
 namespace mindspore::kernel {
 
 int ReshapeOpenCLKernel::CheckSpecs() {
-  if (in_tensors_.size() != 1 && out_tensors_.size() != 1) {
-    MS_LOG(ERROR) << "Reshape in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
+  if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) {
+    MS_LOG(ERROR) << "Reshape input output size unsupported.";
     return RET_ERROR;
   }
   if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 70bbd03b4c..93db7888c4 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -20,6 +20,8 @@
 
 #include <vector>
 #include <set>
+#include <map>
+#include <string>
 #include "src/lite_kernel.h"
 #include "include/errorcode.h"
 #include "src/runtime/opencl/opencl_runtime.h"
@@ -287,6 +289,13 @@ class OpenCLKernel : public LiteKernel {
     if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
       return RET_OK;
     }
+    auto key = Key();
+    auto finded = tuned_param_cache_.find(key);
+    if (finded != tuned_param_cache_.end()) {
+      auto cache_param = finded->second;
+      MS_LOG(INFO) << "Tuning " << name() << ", found cached param(" << cache_param << ")";
+      return RET_OK;
+    }
     auto tuning_params = GenerateTuningParam();
     if (tuning_params.empty()) {
       MS_LOG(WARNING) << "Tuning param size is 0.";
@@ -312,6 +321,7 @@ class OpenCLKernel : public LiteKernel {
       MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
                    << "ms";
       AssignTuningParam(tuning_params[index]);
+      tuned_param_cache_[key] = tuning_params[index];
     } else {
       MS_LOG(WARNING) << "Cannot find suitable param.";
     }
@@ -330,6 +340,15 @@ class OpenCLKernel : public LiteKernel {
     return static_cast<double>(time_ns) * 1e-6;
   }
 
+  virtual std::string Key() {
+    std::string key = type_str();
+    key += "_global";
+    for (auto i : global_size_) {
+      key += "_" + std::to_string(i);
+    }
+    return key;
+  }
+
  protected:
   lite::opencl::OpenCLRuntime *ocl_runtime_;
   lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
@@ -356,6 +375,7 @@ class OpenCLKernel : public LiteKernel {
 
  private:
   lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
+  static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_;
 };
 template <class T>
 kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,