From ce4b3e743f57f1cc9e401f0a65470d7058ed1513 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Thu, 3 Dec 2020 17:36:01 +0800 Subject: [PATCH] optimize conv2d tuning --- .../runtime/kernel/opencl/kernel/conv2d.cc | 39 +++++++++++++++++++ .../src/runtime/kernel/opencl/kernel/conv2d.h | 2 + .../runtime/kernel/opencl/kernel/reshape.cc | 4 +- .../src/runtime/kernel/opencl/opencl_kernel.h | 20 ++++++++++ 4 files changed, 63 insertions(+), 2 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index 6d81fc9d5b..453ce0035c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -363,6 +363,45 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { } } +std::vector Conv2DOpenCLKernel::GenerateTuningParam() { + // don't need to tune local_c + std::vector tuning_params = {}; + if (use_winograd_) { + return tuning_params; + } + BaseTuningParameter default_tuning_param = BaseTuningParameter(); + default_tuning_param.local_size = local_size_; + tuning_params.push_back(default_tuning_param); + + std::vector max_work_items = ocl_runtime_->GetWorkItemSize(); + size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); + std::set candidate_x = GenerateLocalByGlobal(global_size_[0]); + std::set candidate_y = GenerateLocalByGlobal(global_size_[1]); + for (auto x : candidate_x) { + if (x <= max_work_items[0]) { + for (auto y : candidate_y) { + if (y <= max_work_items[1]) { + auto group_size = x * y * local_size_[2]; + if (group_size <= max_workgroup_size) { + BaseTuningParameter tuning_param = BaseTuningParameter(); + tuning_param.local_size = {x, y, local_size_[2]}; + tuning_params.push_back(tuning_param); + } + } + } + } + } + return tuning_params; +} + +std::string Conv2DOpenCLKernel::Key() { + auto key = OpenCLKernel::Key(); + key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" + + std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" + + std::to_string(param_->dilation_w_); + return key; +} + void Conv2DOpenCLKernel::SetConstArgs() { cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h index bdc2fcdddf..60bbd0ccdb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h @@ -43,6 +43,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel { int Run() override; int Tune() override; + std::vector GenerateTuningParam() override; + std::string Key() override; // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU float alpha_{0.0f}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index cb23e6964b..88931491aa 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -31,8 +31,8 @@ using mindspore::schema::PrimitiveType_Squeeze; namespace mindspore::kernel { int ReshapeOpenCLKernel::CheckSpecs() { - if (in_tensors_.size() != 1 && out_tensors_.size() != 1) { - MS_LOG(ERROR) << "Reshape in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); + if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) { + MS_LOG(ERROR) << "Reshape input output size unsupported."; return RET_ERROR; } if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index 70bbd03b4c..93db7888c4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -20,6 +20,8 @@ #include #include +#include +#include #include "src/lite_kernel.h" #include "include/errorcode.h" #include "src/runtime/opencl/opencl_runtime.h" @@ -287,6 +289,13 @@ class OpenCLKernel : public LiteKernel { if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) { return RET_OK; } + auto key = Key(); + auto finded = tuned_param_cache_.find(key); + if (finded != tuned_param_cache_.end()) { + auto cache_param = finded->second; + MS_LOG(INFO) << "Tuning " << name() << ", found cached param(" << cache_param << ")"; + return RET_OK; + } auto tuning_params = GenerateTuningParam(); if (tuning_params.empty()) { MS_LOG(WARNING) << "Tuning param size is 0."; @@ -312,6 +321,7 @@ class OpenCLKernel : public LiteKernel { MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time << "ms"; AssignTuningParam(tuning_params[index]); + tuned_param_cache_[key] = tuning_params[index]; } else { MS_LOG(WARNING) << "Cannot find suitable param."; } @@ -330,6 +340,15 @@ class OpenCLKernel : public LiteKernel { return static_cast(time_ns) * 1e-6; } + virtual std::string Key() { + std::string key = type_str(); + key += "_global"; + for (auto i : global_size_) { + key += "_" + std::to_string(i); + } + return key; + } + protected: lite::opencl::OpenCLRuntime *ocl_runtime_; lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; @@ -356,6 +375,7 @@ class OpenCLKernel : public LiteKernel { private: lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; + static inline std::map tuned_param_cache_; }; template kernel::LiteKernel *OpenCLKernelCreator(const std::vector &inputs,