| @@ -363,6 +363,45 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { | |||
| } | |||
| } | |||
| std::vector<BaseTuningParameter> Conv2DOpenCLKernel::GenerateTuningParam() { | |||
| // don't need to tune local_c | |||
| std::vector<BaseTuningParameter> tuning_params = {}; | |||
| if (use_winograd_) { | |||
| return tuning_params; | |||
| } | |||
| BaseTuningParameter default_tuning_param = BaseTuningParameter(); | |||
| default_tuning_param.local_size = local_size_; | |||
| tuning_params.push_back(default_tuning_param); | |||
| std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize(); | |||
| size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); | |||
| std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]); | |||
| std::set<size_t> candidate_y = GenerateLocalByGlobal(global_size_[1]); | |||
| for (auto x : candidate_x) { | |||
| if (x <= max_work_items[0]) { | |||
| for (auto y : candidate_y) { | |||
| if (y <= max_work_items[1]) { | |||
| auto group_size = x * y * local_size_[2]; | |||
| if (group_size <= max_workgroup_size) { | |||
| BaseTuningParameter tuning_param = BaseTuningParameter(); | |||
| tuning_param.local_size = {x, y, local_size_[2]}; | |||
| tuning_params.push_back(tuning_param); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return tuning_params; | |||
| } | |||
| std::string Conv2DOpenCLKernel::Key() { | |||
| auto key = OpenCLKernel::Key(); | |||
| key += "_" + std::to_string(KH_) + "_" + std::to_string(KW_) + "_" + std::to_string(param_->stride_h_) + "_" + | |||
| std::to_string(param_->stride_w_) + "_" + std::to_string(param_->dilation_h_) + "_" + | |||
| std::to_string(param_->dilation_w_); | |||
| return key; | |||
| } | |||
| void Conv2DOpenCLKernel::SetConstArgs() { | |||
| cl_int4 input_shape = {batch_size_, IH_, IW_, CI_SLICES_}; | |||
| cl_int4 output_shape = {batch_size_, OH_, OW_, CO_SLICES_}; | |||
| @@ -43,6 +43,8 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||
| int Run() override; | |||
| int Tune() override; | |||
| std::vector<BaseTuningParameter> GenerateTuningParam() override; | |||
| std::string Key() override; | |||
| // for opencl fusion: Conv2D + PReLU(weight is scalar) -> param_.act_type=ActivationType_LEAKY_RELU | |||
| float alpha_{0.0f}; | |||
| @@ -31,8 +31,8 @@ using mindspore::schema::PrimitiveType_Squeeze; | |||
| namespace mindspore::kernel { | |||
| int ReshapeOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_.size() != 1 && out_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << "Reshape in size: " << in_tensors_.size() << ", out size: " << out_tensors_.size(); | |||
| if ((in_tensors_.size() != 1 && in_tensors_.size() != 2) || out_tensors_.size() != 1) { | |||
| MS_LOG(ERROR) << "Reshape input output size unsupported."; | |||
| return RET_ERROR; | |||
| } | |||
| if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { | |||
| @@ -20,6 +20,8 @@ | |||
| #include <vector> | |||
| #include <set> | |||
| #include <map> | |||
| #include <string> | |||
| #include "src/lite_kernel.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| @@ -287,6 +289,13 @@ class OpenCLKernel : public LiteKernel { | |||
| if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) { | |||
| return RET_OK; | |||
| } | |||
| auto key = Key(); | |||
| auto finded = tuned_param_cache_.find(key); | |||
| if (finded != tuned_param_cache_.end()) { | |||
| auto cache_param = finded->second; | |||
| MS_LOG(INFO) << "Tuning " << name() << ", found cached param(" << cache_param << ")"; | |||
| return RET_OK; | |||
| } | |||
| auto tuning_params = GenerateTuningParam(); | |||
| if (tuning_params.empty()) { | |||
| MS_LOG(WARNING) << "Tuning param size is 0."; | |||
| @@ -312,6 +321,7 @@ class OpenCLKernel : public LiteKernel { | |||
| MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time | |||
| << "ms"; | |||
| AssignTuningParam(tuning_params[index]); | |||
| tuned_param_cache_[key] = tuning_params[index]; | |||
| } else { | |||
| MS_LOG(WARNING) << "Cannot find suitable param."; | |||
| } | |||
| @@ -330,6 +340,15 @@ class OpenCLKernel : public LiteKernel { | |||
| return static_cast<double>(time_ns) * 1e-6; | |||
| } | |||
| virtual std::string Key() { | |||
| std::string key = type_str(); | |||
| key += "_global"; | |||
| for (auto i : global_size_) { | |||
| key += "_" + std::to_string(i); | |||
| } | |||
| return key; | |||
| } | |||
| protected: | |||
| lite::opencl::OpenCLRuntime *ocl_runtime_; | |||
| lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; | |||
| @@ -356,6 +375,7 @@ class OpenCLKernel : public LiteKernel { | |||
| private: | |||
| lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | |||
| static inline std::map<std::string, BaseTuningParameter> tuned_param_cache_; | |||
| }; | |||
| template <class T> | |||
| kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||