optimize conv2d tuning

5 years ago · ce4b3e743f
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc
@@ -363,6 +363,45 @@ void Conv2DOpenCLKernel::SetGlobalLocal() {
  }
 }

 std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() {
  // don't need to tune local_c
  std::vector<BaseTuningParameter> tuning_params = {};
  if (use_winograd_) {
    return tuning_params;
  }
  BaseTuningParameter default_tuning_param = BaseTuningParameter();
  default_tuning_param.local_size = local_size_;
  tuning_params.push_back(default_tuning_param);

  std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize();
  size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_);
  std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]);
  std::set<size_t> candidate_y = GenerateLocalByGlobal(global_size_[1]);
  for (auto x : candidate_x) {
    if (x <= max_work_items[0]) {
      for (auto y : candidate_y) {
        if (y <= max_work_items[1]) {
          auto group_size = x * y * local_size_[2];
          if (group_size <= max_workgroup_size) {
            BaseTuningParameter tuning_param = BaseTuningParameter();
            tuning_param.local_size = {x, y, local_size_[2]};
            tuning_params.push_back(tuning_param);
          }
        }
      }
    }
  }
  return tuning_params;
 }

 std::string Conv2DOpenCLKernel::Key() {
  auto key = OpenCLKernel::Key();
  key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" +
         std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" +
         std::to_string(param_->dilation_w_);
  return key;
 }

 void Conv2DOpenCLKernel::SetConstArgs() {
  cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_};
  cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h
@@ -43,6 +43,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel {

  int Run() override;
  int Tune() override;
  std::vector<BaseTuningParameter> GenerateTuningParam() override;
  std::string Key() override;

  // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU
  float alpha_{0.0f};
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -31,8 +31,8 @@ using mindspore::schema::PrimitiveType_Squeeze;
 namespace mindspore::kernel {

 int ReshapeOpenCLKernel::CheckSpecs() {
  if (in_tensors_.size() != 1 && out_tensors_.size() != 1) {
    MS_LOG(ERROR) << "Reshape in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size();
  if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) {
    MS_LOG(ERROR) << "Reshape input output size unsupported.";
    return RET_ERROR;
  }
  if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -20,6 +20,8 @@

 #include <vector>
 #include <set>
 #include <map>
 #include <string>
 #include "src/lite_kernel.h"
 #include "include/errorcode.h"
 #include "src/runtime/opencl/opencl_runtime.h"
@@ -287,6 +289,13 @@ class OpenCLKernel : public LiteKernel {
    if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) {
      return RET_OK;
    }
    auto key = Key();
    auto finded = tuned_param_cache_.find(key);
    if (finded != tuned_param_cache_.end()) {
      auto cache_param = finded->second;
      MS_LOG(INFO) << "Tuning " << name() << ", found cached param(" << cache_param << ")";
      return RET_OK;
    }
    auto tuning_params = GenerateTuningParam();
    if (tuning_params.empty()) {
      MS_LOG(WARNING) << "Tuning param size is 0.";
@@ -312,6 +321,7 @@ class OpenCLKernel : public LiteKernel {
      MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time
                   << "ms";
      AssignTuningParam(tuning_params[index]);
      tuned_param_cache_[key] = tuning_params[index];
    } else {
      MS_LOG(WARNING) << "Cannot find suitable param.";
    }
@@ -330,6 +340,15 @@ class OpenCLKernel : public LiteKernel {
    return static_cast<double>(time_ns) * 1e-6;
  }

  virtual std::string Key() {
    std::string key = type_str();
    key += "_global";
    for (auto i : global_size_) {
      key += "_" + std::to_string(i);
    }
    return key;
  }

 protected:
  lite::opencl::OpenCLRuntime *ocl_runtime_;
  lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG};
@@ -356,6 +375,7 @@ class OpenCLKernel : public LiteKernel {

 private:
  lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_;
  static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_;
 };
 template <class T>
 kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs,