From 1d0ee5ca046c2c67c22c1c4c90b039518edbe3f5 Mon Sep 17 00:00:00 2001 From: chenzupeng Date: Thu, 26 Nov 2020 10:51:03 +0800 Subject: [PATCH] add tuning --- .../kernel/opencl/kernel/activation.cc | 7 +- .../runtime/kernel/opencl/kernel/activation.h | 1 - .../runtime/kernel/opencl/kernel/argminmax.cc | 8 +- .../runtime/kernel/opencl/kernel/argminmax.h | 2 +- .../kernel/opencl/kernel/arithmetic.cc | 15 +- .../runtime/kernel/opencl/kernel/arithmetic.h | 1 - .../kernel/opencl/kernel/arithmetic_self.cc | 10 +- .../kernel/opencl/kernel/arithmetic_self.h | 1 - .../kernel/opencl/kernel/batch_to_space_nd.cc | 8 +- .../kernel/opencl/kernel/batch_to_space_nd.h | 2 +- .../runtime/kernel/opencl/kernel/batchnorm.cc | 10 +- .../runtime/kernel/opencl/kernel/biasadd.h | 1 - .../src/runtime/kernel/opencl/kernel/cast.cc | 10 +- .../src/runtime/kernel/opencl/kernel/cast.h | 2 - .../runtime/kernel/opencl/kernel/concat.cc | 14 +- .../src/runtime/kernel/opencl/kernel/concat.h | 1 - .../runtime/kernel/opencl/kernel/conv2d.cc | 49 +++--- .../src/runtime/kernel/opencl/kernel/conv2d.h | 2 +- .../kernel/opencl/kernel/conv2d_transpose.cc | 2 +- .../kernel/opencl/kernel/conv2d_transpose.h | 1 - .../kernel/opencl/kernel/depthwise_conv2d.cc | 17 +-- .../kernel/opencl/kernel/depthwise_conv2d.h | 1 - .../src/runtime/kernel/opencl/kernel/fill.h | 3 - .../kernel/opencl/kernel/fullconnection.cc | 8 +- .../kernel/opencl/kernel/fullconnection.h | 2 +- .../runtime/kernel/opencl/kernel/gather.cc | 8 +- .../src/runtime/kernel/opencl/kernel/gather.h | 2 +- .../src/runtime/kernel/opencl/kernel/hswish.h | 50 ------ .../runtime/kernel/opencl/kernel/matmul.cc | 12 +- .../src/runtime/kernel/opencl/kernel/matmul.h | 2 +- .../runtime/kernel/opencl/kernel/one_hot.cc | 6 +- .../runtime/kernel/opencl/kernel/one_hot.h | 1 - .../src/runtime/kernel/opencl/kernel/pad.cc | 7 +- .../src/runtime/kernel/opencl/kernel/pad.h | 1 - .../runtime/kernel/opencl/kernel/pooling2d.cc | 7 +- .../runtime/kernel/opencl/kernel/pooling2d.h | 1 - .../src/runtime/kernel/opencl/kernel/power.h | 1 - .../src/runtime/kernel/opencl/kernel/prelu.h | 1 - .../runtime/kernel/opencl/kernel/reduce.cc | 19 ++- .../src/runtime/kernel/opencl/kernel/reduce.h | 2 +- .../runtime/kernel/opencl/kernel/reshape.cc | 8 +- .../runtime/kernel/opencl/kernel/reshape.h | 1 - .../runtime/kernel/opencl/kernel/resize.cc | 7 +- .../src/runtime/kernel/opencl/kernel/resize.h | 1 - .../src/runtime/kernel/opencl/kernel/scale.h | 1 - .../runtime/kernel/opencl/kernel/softmax.cc | 10 +- .../runtime/kernel/opencl/kernel/softmax.h | 2 +- .../kernel/opencl/kernel/space_to_batch_nd.cc | 8 +- .../kernel/opencl/kernel/space_to_batch_nd.h | 1 - .../kernel/opencl/kernel/space_to_depth.cc | 6 +- .../kernel/opencl/kernel/space_to_depth.h | 1 - .../kernel/opencl/kernel/sparse_to_dense.cc | 8 +- .../kernel/opencl/kernel/sparse_to_dense.h | 1 - .../src/runtime/kernel/opencl/kernel/stack.h | 1 - .../kernel/opencl/kernel/strided_slice.cc | 14 +- .../kernel/opencl/kernel/strided_slice.h | 1 - .../runtime/kernel/opencl/kernel/to_format.cc | 10 +- .../runtime/kernel/opencl/kernel/to_format.h | 1 - .../runtime/kernel/opencl/kernel/transpose.cc | 8 +- .../runtime/kernel/opencl/kernel/transpose.h | 1 - .../src/runtime/kernel/opencl/opencl_kernel.h | 144 +++++++++++++++++- .../runtime/kernel/opencl/opencl_subgraph.cc | 13 +- .../src/runtime/opencl/opencl_executor.cc | 28 +++- .../lite/src/runtime/opencl/opencl_executor.h | 3 + .../lite/src/runtime/opencl/opencl_runtime.cc | 39 ++--- .../lite/src/runtime/opencl/opencl_runtime.h | 10 +- 66 files changed, 378 insertions(+), 248 deletions(-) delete mode 100644 mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc index 49d267ae81..7e76199901 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.cc @@ -92,8 +92,9 @@ void ActivationOpenCLKernel::SetConstArgs() { } void ActivationOpenCLKernel::SetGlobalLocal() { - local_range_ = cl::NullRange; - global_range_ = {outShape.width, outShape.height}; + local_size_ = {}; + global_size_ = {outShape.width, outShape.height}; + AlignGlobalLocal(global_size_, local_size_); } int ActivationOpenCLKernel::Run() { @@ -101,7 +102,7 @@ int ActivationOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); if (ret != RET_OK) { MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h index 5abac64eba..bc8232c4dc 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/activation.h @@ -42,7 +42,6 @@ class ActivationOpenCLKernel : public OpenCLKernel { private: static std::string GetActTypeString(int act_type); - cl::Kernel kernel_; int type_; float alpha_; GpuTensorInfo outShape = GpuTensorInfo(nullptr); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc index 615f693933..796dc180a8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.cc @@ -115,9 +115,9 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() { default: // 3 break; } - std::vector local = {1, 1, 1}; - std::vector global = {static_cast(strides_.s[0]), static_cast(src_size_.s[1]), 1}; - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; + global_size_ = {static_cast(strides_.s[0]), static_cast(src_size_.s[1]), 1}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int ArgMinMaxOpenCLKernel::InitWeights() { @@ -153,7 +153,7 @@ int ArgMinMaxOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running! "; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h index bf6aa1428e..6b7ce95095 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/argminmax.h @@ -38,9 +38,9 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel { void SetConstArgs() override; void SetGlobalLocal() override; int InitWeights() override; + int Tune() override { return lite::RET_OK; } private: - cl::Kernel kernel_; void *buff_{nullptr}; void *ids_{nullptr}; GpuTensorInfo im_in_{GpuTensorInfo(nullptr)}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc index fff321f789..c8aa34908e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.cc @@ -124,23 +124,24 @@ int ArithmeticOpenCLKernel::CheckSpecs() { void ArithmeticOpenCLKernel::SetGlobalLocal() { if (element_flag_) { - local_range_ = {}; + local_size_ = {}; auto out_shape = out_tensors_[0]->shape(); if (out_shape.size() == 2) { size_t H = out_shape[0]; size_t W = UP_DIV(out_shape[1], C4NUM); - global_range_ = {W, H}; + global_size_ = {W, H}; } else { size_t H = out_shape[0] * out_shape[1]; size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); - global_range_ = {W, H}; + global_size_ = {W, H}; } } else { - local_range_ = {}; + local_size_ = {}; auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); - global_range_ = {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), - static_cast(out_shape[1] * out_shape[0])}; + global_size_ = {static_cast(UP_DIV(out_shape[3], C4NUM)), static_cast(out_shape[2]), + static_cast(out_shape[1] * out_shape[0])}; } + AlignGlobalLocal(global_size_, local_size_); } int ArithmeticOpenCLKernel::InitWeights() { @@ -269,7 +270,7 @@ int ArithmeticOpenCLKernel::Run() { auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h index 9a5095f2c9..7e49f8562f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic.h @@ -39,7 +39,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; bool element_flag_{true}; float activation_min_{-FLT_MAX}; float activation_max_{FLT_MAX}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc index 3ebb7e90b2..e8b6c5b12c 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc @@ -131,10 +131,10 @@ void ArithmeticSelfOpenCLKernel::SetGlobalLocal() { OC = UP_DIV(output_shape[1], C4NUM); } const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); - std::vector local = {1, 1, 1}; // init local - std::vector global = {OH, OW, OC}; - ArithmeticSelfGetWorkGroup(global, &local, max_global[0]); - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; // init local + global_size_ = {OH, OW, OC}; + ArithmeticSelfGetWorkGroup(global_size_, &local_size_, max_global[0]); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int ArithmeticSelfOpenCLKernel::Prepare() { @@ -159,7 +159,7 @@ int ArithmeticSelfOpenCLKernel::Run() { int arg_cn = 0; ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h index 482341948e..93512e1fef 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.h @@ -43,7 +43,6 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel { private: void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); cl_int4 output_shape_ = {}; - cl::Kernel kernel_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc index f79c4b08c9..531de236d3 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc @@ -75,9 +75,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() { size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); std::vector out_shape = out_tensors_[0]->shape(); cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]}; - std::vector local = {1, 1, 1}; - std::vector global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; + global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int BatchToSpaceNDOpenCLKernel::Prepare() { @@ -103,7 +103,7 @@ int BatchToSpaceNDOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running! "; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h index 6c80c9db96..c6db117a17 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h @@ -37,9 +37,9 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel { int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; + int Tune() override { return lite::RET_OK; } private: - cl::Kernel kernel_; }; } // namespace mindspore::kernel #endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc index 7231431e54..ecf13fcf46 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batchnorm.cc @@ -62,10 +62,10 @@ void BatchNormOpenCLKernel::SetGlobalLocal() { uint32_t OC = UP_DIV(output_shape[3], C4NUM); const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); - std::vector local = {1, 1, 1}; // init local - std::vector global = {OH, OW, OC}; - BatchNormGetWorkGroup(global, &local, max_global[0]); - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; // init local + global_size_ = {OH, OW, OC}; + BatchNormGetWorkGroup(global_size_, &local_size_, max_global[0]); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int BatchNormOpenCLKernel::Prepare() { @@ -91,7 +91,7 @@ int BatchNormOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h index d6e03a12a0..82dd1d52e4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/biasadd.h @@ -40,7 +40,6 @@ class BiasAddOpenCLKernel : public OpenCLKernel { private: cl_int4 GetGlobalshape(); - cl::Kernel kernel_; void *BiasAdd_{nullptr}; int in_size_{}; int out_size_{}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc index 65f94248b9..7e571861f6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.cc @@ -73,10 +73,10 @@ void CastOpenCLKernel::SetGlobalLocal() { uint32_t OC = UP_DIV(input_shape[3], C4NUM); const std::vector &max_global = ocl_runtime_->GetWorkItemSize(); - std::vector local = {1, 1, 1}; // init local - std::vector global = {OH, OW, OC}; - CastGetWorkGroup(global, &local, max_global[0]); - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; // init local + global_size_ = {OH, OW, OC}; + CastGetWorkGroup(global_size_, &local_size_, max_global[0]); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int CastOpenCLKernel::Prepare() { @@ -100,7 +100,7 @@ int CastOpenCLKernel::Run() { int arg_cn = 0; ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h index 6c778ecfec..63af1e5c28 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/cast.h @@ -41,8 +41,6 @@ class CastOpenCLKernel : public OpenCLKernel { private: int GetKernelName(std::string *kernel_name, CastParameter *param); - - cl::Kernel kernel_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc index 14da40ec69..e1a74ab697 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.cc @@ -132,17 +132,17 @@ void ConcatOpenCLKernel::SetGlobalLocal() { if (axis_ == 3 && !Align_) { OH = out_shape_.s[0] * out_shape_.s[1]; OW = out_shape_.s[2]; - global = {OH, OW, 1}; - local = {1, 1, 1}; + global_size_ = {OH, OW, 1}; + local_size_ = {1, 1, 1}; } else { OH = out_shape_.s[0] * out_shape_.s[1]; OW = out_shape_.s[2]; OC = out_shape_.s[3]; - global = {OH, OW, OC}; - local = {1, 1, 1}; + global_size_ = {OH, OW, OC}; + local_size_ = {1, 1, 1}; } - ConcatGetWorkGroup(global, &local, max_global[0]); - OpenCLKernel::AlignGlobalLocal(global, local); + ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int ConcatOpenCLKernel::Prepare() { @@ -196,7 +196,7 @@ int ConcatOpenCLKernel::Run() { MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); return RET_ERROR; } - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h index f390704f8b..558e8eaf8d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/concat.h @@ -53,7 +53,6 @@ class ConcatOpenCLKernel : public OpenCLKernel { private: int RunAxis0(); - cl::Kernel kernel_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc index a4eca9d3cd..50604ee3ef 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.cc @@ -93,7 +93,7 @@ int Conv2DOpenCLKernel::Prepare() { std::string program_name = "winograd"; ocl_runtime_->LoadSource(program_name, winograd_source); ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); - ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution"); + ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); } else { SetBlockSize(); @@ -101,7 +101,7 @@ int Conv2DOpenCLKernel::Prepare() { std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + std::to_string(block_size_.C); ocl_runtime_->LoadSource(program_name, conv2d_source); - ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name); + ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); } // allocate winograd memory @@ -329,7 +329,9 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { local_h = std::min(global_h, local_hw); local_w = std::min(local_hw / local_h, global_w); } - AlignGlobalLocal({global_h, global_w, global_c}, {local_h, local_w, local_c}); + global_size_ = {global_h, global_w, global_c}; + local_size_ = {local_h, local_w, local_c}; + AlignGlobalLocal(global_size_, local_size_); } } @@ -355,11 +357,11 @@ void Conv2DOpenCLKernel::SetConstArgs() { arg_cn = 0; cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, conv_out_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape); arg_cn = 2; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; @@ -373,30 +375,37 @@ void Conv2DOpenCLKernel::SetConstArgs() { cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation); - ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, act_type); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); + ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); + ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); } } +int Conv2DOpenCLKernel::Tune() { + if (use_winograd_) { + return RET_OK; + } + return OpenCLKernel::Tune(); +} + int Conv2DOpenCLKernel::Run() { if (use_winograd_) { ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); - ocl_runtime_->RunKernel(kernel_conv_, global_conv_, local_conv_); + ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_); ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); } else { - ocl_runtime_->SetKernelArg(kernel_conv_, 0, in_tensors_.front()->data_c()); - ocl_runtime_->SetKernelArg(kernel_conv_, 1, out_tensors_.front()->data_c()); - ocl_runtime_->RunKernel(kernel_conv_, global_range_, local_range_); + ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); + ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h index 769bab3d92..95bef1d513 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d.h @@ -42,6 +42,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel { void SetConstArgs() override; int Run() override; + int Tune() override; private: void SetBlockSize(); @@ -60,7 +61,6 @@ class Conv2DOpenCLKernel : public OpenCLKernel { } cl::Kernel kernel_4x4to36_; - cl::Kernel kernel_conv_; cl::Kernel kernel_36to4x4_; cl::NDRange global_4x4to36_, local_4x4to36_; cl::NDRange global_conv_, local_conv_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc index 8879fdea41..82544a5750 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.cc @@ -193,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() { int arg_cnt = 0; ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h index 8cb78c21db..05009c11f4 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/conv2d_transpose.h @@ -40,7 +40,6 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; void *padWeight_{nullptr}; void *bias_{nullptr}; bool enable_fp16_{false}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc index fdc08ecb62..7a9d100f1d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.cc @@ -175,24 +175,23 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() { void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { // set global size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); - std::vector global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), - (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; + global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), + (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; // set local const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); - int z = global_size[0]; - int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); - int x = std::max(1, std::min(static_cast(global_size[1]), max_group_size / (y * z))); - std::vector local_size = - std::vector({static_cast(z), static_cast(x), static_cast(y)}); + int z = global_size_[0]; + int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8)); + int x = std::max(1, std::min(static_cast(global_size_[1]), max_group_size / (y * z))); + local_size_ = std::vector({static_cast(z), static_cast(x), static_cast(y)}); - OpenCLKernel::AlignGlobalLocal(global_size, local_size); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int DepthwiseConv2dOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h index 5d059b4ee0..17bafbd094 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/depthwise_conv2d.h @@ -42,7 +42,6 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { private: void *packed_weight_{nullptr}; void *bias_data_{nullptr}; - cl::Kernel kernel_; std::vector block_size_{2, 2, 1}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h index abfbbeca0a..7e71745789 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fill.h @@ -43,9 +43,6 @@ class FillOpenCLKernel : public OpenCLKernel { private: int RunFill(); int RunShape(); - cl::Kernel kernel_; - - private: float default_{0.0f}; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc index bb6d4432ec..e79a7156e1 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.cc @@ -179,9 +179,9 @@ int FullConnectionOpenCLKernel::InitWeights() { } void FullConnectionOpenCLKernel::SetGlobalLocal() { - std::vector local = {32, 4, 1}; - std::vector global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; - AlignGlobalLocal(global, local); + local_size_ = {32, 4, 1}; + global_size_ = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; + AlignGlobalLocal(global_size_, local_size_); } void FullConnectionOpenCLKernel::SetConstArgs() { @@ -202,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() { int arg_count = 0; ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h index 1b2b8556b8..d507eecf35 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/fullconnection.h @@ -38,9 +38,9 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { void SetConstArgs() override; void SetGlobalLocal() override; int Init() override; + int Tune() override { return lite::RET_OK; } private: - cl::Kernel kernel_; void *padWeight_{nullptr}; void *bias_{nullptr}; bool enable_fp16_{false}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc index 3145e93f0e..40ac54c80d 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc @@ -93,9 +93,9 @@ void GatherOpenCLKernel::SetConstArgs() { void GatherOpenCLKernel::SetGlobalLocal() { auto output = GpuTensorInfo(out_tensors_.front()); - std::vector local = {1, 1, 1}; - std::vector global = {output.W, output.N * output.H, output.Slice}; - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; + global_size_ = {output.W, output.N * output.H, output.Slice}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int GatherOpenCLKernel::Prepare() { @@ -155,7 +155,7 @@ int GatherOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h index 4fb5c0151d..6b1524cbdb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h @@ -38,12 +38,12 @@ class GatherOpenCLKernel : public OpenCLKernel { int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; + int Tune() override { return lite::RET_OK; } protected: int UpdateWeights(); private: - cl::Kernel kernel_; int32_t *indices_data_{nullptr}; int axis_ = {0}; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h deleted file mode 100644 index 33b2a27317..0000000000 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/hswish.h +++ /dev/null @@ -1,50 +0,0 @@ -/** - * Copyright 2019 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ - -#include -#include "mindspore/lite/nnacl/fp32/activation_fp32.h" -#include "src/runtime/kernel/opencl/opencl_kernel.h" - -namespace mindspore::kernel { - -class HswishOpenCLKernel : public OpenCLKernel { - public: - HswishOpenCLKernel(OpParameter *parameter, const std::vector &inputs, - const std::vector &outputs) - : OpenCLKernel(parameter, inputs, outputs) {} - - ~HswishOpenCLKernel() override = default; - - int Init() override; - - int Run() override; - - private: - int InferShapeTo4D(); - cl::Kernel kernel_; - - private: - size_t N_{1}; - size_t H_{1}; - size_t W_{1}; - size_t C_{1}; -}; - -} // namespace mindspore::kernel -#endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc index bdee117db1..b113244491 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.cc @@ -137,11 +137,11 @@ int MatMulOpenCLKernel::InitWeights() { void MatMulOpenCLKernel::SetGlobalLocal() { // local size should less than MAX_GROUP_SIZE - std::vector local = {32, 4, 1}; - std::vector global = {UP_DIV(static_cast(outShape[3]), C4NUM), - 4 * static_cast(outShape[0]) * static_cast(outShape[1]), - static_cast(outShape[2])}; - AlignGlobalLocal(global, local); + local_size_ = {32, 4, 1}; + global_size_ = {UP_DIV(static_cast(outShape[3]), C4NUM), + 4 * static_cast(outShape[0]) * static_cast(outShape[1]), + static_cast(outShape[2])}; + AlignGlobalLocal(global_size_, local_size_); } void MatMulOpenCLKernel::SetConstArgs() { @@ -158,7 +158,7 @@ int MatMulOpenCLKernel::Run() { int arg_count = 0; ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h index 66855a4b3f..ddc2caf98f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/matmul.h @@ -37,9 +37,9 @@ class MatMulOpenCLKernel : public OpenCLKernel { int InitWeights() override; void SetConstArgs() override; void SetGlobalLocal() override; + int Tune() override { return lite::RET_OK; } private: - cl::Kernel kernel_; void *padWeight_{nullptr}; bool enable_fp16_{false}; bool transposeA{false}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc index 7dea6d5203..78e203449e 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.cc @@ -85,14 +85,16 @@ void OneHotOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast(out_shape_.C)); } void OneHotOpenCLKernel::SetGlobalLocal() { - global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; + local_size_ = {}; + global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; + AlignGlobalLocal(global_size_, local_size_); } int OneHotOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h index 865a713a9d..23c6bf73ac 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/one_hot.h @@ -39,7 +39,6 @@ class OneHotOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; int depth_{0}; float on_value_{1.0f}; float off_value_{0.0f}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc index fb9eee7987..b8a0506866 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.cc @@ -97,14 +97,15 @@ void PadOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast(param_->constant_value_)); - - AlignGlobalLocal({output.N * output.H, output.W, output.Slice}, {8, 4, 1}); + local_size_ = {8, 4, 1}; + global_size_ = {output.N * output.H, output.W, output.Slice}; + AlignGlobalLocal(global_size_, local_size_); } int PadOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h index 604d08640f..4578e6ee04 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pad.h @@ -41,7 +41,6 @@ class PadOpenCLKernel : public OpenCLKernel { int Run() override; private: - cl::Kernel kernel_; PadParameter *param_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc index 52d30465c9..3839fba8fb 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.cc @@ -86,8 +86,9 @@ void PoolingOpenCLKernel::SetGlobalLocal() { const size_t global_x = out_tensors_[0]->shape()[1]; const size_t global_y = out_tensors_[0]->shape()[2]; const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); - global_range_ = {global_z, global_y, global_x}; - local_range_ = {}; + global_size_ = {global_z, global_y, global_x}; + local_size_ = {}; + AlignGlobalLocal(global_size_, local_size_); } void PoolingOpenCLKernel::SetConstArgs() { @@ -111,7 +112,7 @@ int PoolingOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h index aa4ae8e368..037ccc10df 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/pooling2d.h @@ -39,7 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel { private: PoolingParameter *parameter_; - cl::Kernel kernel_; std::vector local_size_; std::vector global_size_; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h index 7927b924c5..7c0dff5530 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/power.h @@ -37,7 +37,6 @@ class PowerOpenCLKernel : public OpenCLKernel { private: int InferShapeTo4D(); - cl::Kernel kernel_; private: size_t N_{1}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h index 78c3c6a369..7e1af28d64 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/prelu.h @@ -37,7 +37,6 @@ class PReluOpenCLKernel : public OpenCLKernel { int InitWeights() override; private: - cl::Kernel kernel_; bool enable_fp16_{false}; int batch_size_{}; int C_{}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc index a3861c77a2..0ecd1f3498 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.cc @@ -148,15 +148,22 @@ void ReduceOpenCLKernel::SetGlobalLocal() { int h = shapex[1]; int c = shapex[3]; int c4 = UP_DIV(c, C4NUM); - std::vector local = {}; + local_size_ = {}; if (use_local_) { - local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; + local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; } - std::vector global = {static_cast(c4), 1, 1}; + global_size_ = {static_cast(c4), 1, 1}; if (wc_reduce_) { - global = {static_cast(h), 1, 1}; + global_size_ = {static_cast(h), 1, 1}; } - AlignGlobalLocal(global, local); + AlignGlobalLocal(global_size_, local_size_); +} + +int ReduceOpenCLKernel::Tune() { + if (use_local_) { + return RET_OK; + } + return OpenCLKernel::Tune(); } int ReduceOpenCLKernel::Run() { @@ -164,7 +171,7 @@ int ReduceOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h index f316b93b65..85c81312f8 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reduce.h @@ -36,11 +36,11 @@ class ReduceOpenCLKernel : public OpenCLKernel { int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; + int Tune() override; private: cl_float4 GenC4Mask(); static std::string GetReduceTypeStr(int type); - cl::Kernel kernel_; GpuTensorInfo outShape = GpuTensorInfo(nullptr); bool use_local_{false}; bool wc_reduce_{false}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc index 8d436e9820..4b39d0931b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc @@ -55,9 +55,9 @@ void ReshapeOpenCLKernel::SetConstArgs() { void ReshapeOpenCLKernel::SetGlobalLocal() { auto out = GpuTensorInfo(out_tensors_.front()); - std::vector local = {}; - std::vector global{out.width, out.height}; - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {}; + global_size_ = {out.width, out.height}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int ReshapeOpenCLKernel::Prepare() { @@ -81,7 +81,7 @@ int ReshapeOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h index 44185184ea..98cf0978ee 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h @@ -37,7 +37,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc index 252897696f..a2e491f366 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.cc @@ -100,9 +100,10 @@ void ResizeOpenCLKernel::SetConstArgs() { } void ResizeOpenCLKernel::SetGlobalLocal() { - local_range_ = {}; + local_size_ = {}; auto out_shape = GpuTensorInfo(out_tensors_[0]); - global_range_ = {out_shape.Slice, out_shape.W, out_shape.H}; + global_size_ = {out_shape.Slice, out_shape.W, out_shape.H}; + AlignGlobalLocal(global_size_, local_size_); } int ResizeOpenCLKernel::Run() { @@ -110,7 +111,7 @@ int ResizeOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h index 935ff3abc3..3a42882f2a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/resize.h @@ -40,7 +40,6 @@ class ResizeOpenCLKernel : public OpenCLKernel { private: float getResizeScaleFactor(int input_size, int output_size); - cl::Kernel kernel_; bool alignCorner{false}; bool preserveAspectRatio{false}; }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h index 6273067a15..e214fabb37 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/scale.h @@ -37,7 +37,6 @@ class ScaleOpenCLKernel : public OpenCLKernel { private: void Image2dGetWorkGroupSize(); - cl::Kernel kernel_; bool weight_vector_flag_{true}; bool broadcast_flag_{false}; bool broadcast_H_flag_{false}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc index ae7929ca4a..7fca4c09ed 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.cc @@ -116,6 +116,13 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() { AlignGlobalLocal(global_size_, local_size_); } +int SoftmaxOpenCLKernel::Tune() { + if (onexone_flag_) { + return RET_OK; + } + return OpenCLKernel::Tune(); +} + void SoftmaxOpenCLKernel::SetConstArgs() { int arg_idx = 2; int channel = out_shape.C; @@ -133,8 +140,7 @@ int SoftmaxOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - // run opengl kernel - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h index 9ba280b6be..f89fd58776 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/softmax.h @@ -38,6 +38,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { int CheckSpecs() override; void SetConstArgs() override; void SetGlobalLocal() override; + int Tune() override; private: int InitGlobalSize(); @@ -45,7 +46,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { int SetWorkGroupSize(); std::vector GetMaskForLastChannel(int channels); - cl::Kernel kernel_; SoftmaxParameter *parameter_; bool onexone_flag_{false}; std::vector local_size_; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc index aa4388b0ea..5c2c4031f5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc @@ -80,9 +80,9 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() { void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; - std::vector local = {1, 1, 1}; - std::vector global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; - OpenCLKernel::AlignGlobalLocal(global, local); + local_size_ = {1, 1, 1}; + global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int SpaceToBatchNDOpenCLKernel::Prepare() { @@ -109,7 +109,7 @@ int SpaceToBatchNDOpenCLKernel::Run() { ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h index f1fead8674..2bd9b0c352 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h @@ -39,7 +39,6 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; }; } // namespace mindspore::kernel #endif diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc index 035755a66c..e24e58e5e7 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.cc @@ -69,7 +69,9 @@ void SpaceToDepthOpenCLKernel::SetConstArgs() { ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size); } void SpaceToDepthOpenCLKernel::SetGlobalLocal() { - global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; + local_size_ = {}; + global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; + AlignGlobalLocal(global_size_, local_size_); } int SpaceToDepthOpenCLKernel::Run() { @@ -77,7 +79,7 @@ int SpaceToDepthOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h index b7df317841..671b9dedb6 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_depth.h @@ -38,7 +38,6 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr); GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr); }; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc index a1d203250c..30ce43611a 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.cc @@ -136,11 +136,11 @@ void SparseToDenseOpenCLKernel::SetConstArgs() { } void SparseToDenseOpenCLKernel::SetGlobalLocal() { - std::vector local = {1, 1}; + local_size_ = {1, 1}; size_t OH = n_ * h_; size_t OW = w_ * UP_DIV(c_, C4NUM); - std::vector global = {OH, OW}; - OpenCLKernel::AlignGlobalLocal(global, local); + global_size_ = {OH, OW}; + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int SparseToDenseOpenCLKernel::Prepare() { @@ -209,7 +209,7 @@ int SparseToDenseOpenCLKernel::Run() { } else { ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); } - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h index c51c21782c..b69ff1807b 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/sparse_to_dense.h @@ -43,7 +43,6 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel { int InitOutputToDefault(); private: - cl::Kernel kernel_; // bool IndicesIsScalar{false}; bool enable_fp16_{false}; float default_{0.0f}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h index a2174f7368..9fc809b988 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/stack.h @@ -44,7 +44,6 @@ class StackOpenCLKernel : public OpenCLKernel { int InferOutTensorShapeTo4D(cl_int4 *output_shape); - cl::Kernel kernel_; int axis_{0}; size_t N_{1}; size_t H_{1}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc index 482d0ea351..a92f84b621 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc @@ -164,24 +164,24 @@ void StridedSliceOpenCLKernel::SetConstArgs() { void StridedSliceOpenCLKernel::SetGlobalLocal() { auto output_info = GpuTensorInfo(out_tensors_.front()); - std::vector global = {output_info.N * output_info.H, output_info.W, output_info.Slice}; + global_size_ = {output_info.N * output_info.H, output_info.W, output_info.Slice}; const int max_divider = 8; auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); - size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider); + size_t local_c = GetMaxDivisorStrategy0(global_size_[2], max_divider); local_c = std::max(local_c, 1); size_t local_hw = max_work_group_size / local_c; - size_t local_h = std::min(UP_DIV(global[0], 2), local_hw); - size_t local_w = std::min(local_hw / local_h, global[1]); - std::vector local = {local_h, local_w, local_c}; - AlignGlobalLocal(global, local); + size_t local_h = std::min(UP_DIV(global_size_[0], 2), local_hw); + size_t local_w = std::min(local_hw / local_h, global_size_[1]); + local_size_ = {local_h, local_w, local_c}; + AlignGlobalLocal(global_size_, local_size_); } int StridedSliceOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running! "; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h index 58fb7d3d8b..7016360998 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.h @@ -42,7 +42,6 @@ class StridedSliceOpenCLKernel : public OpenCLKernel { private: int InitConstArgs(); - cl::Kernel kernel_; cl_int4 input_shape_{}; cl_int4 output_shape_{}; cl_int2 io_slices_{}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc index 3e469cf496..17c468cb52 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.cc @@ -51,13 +51,13 @@ void ToFormatOpenCLKernel::SetConstArgs() { } void ToFormatOpenCLKernel::SetGlobalLocal() { - std::vector global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; - std::vector local = {8, 16, 3}; + global_size_ = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; + local_size_ = {8, 16, 3}; size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); if (max_work_group_size < 384) { - local[2] = 1; + local_size_[2] = 1; } - OpenCLKernel::AlignGlobalLocal(global, local); + OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); } int ToFormatOpenCLKernel::Prepare() { @@ -97,7 +97,7 @@ int ToFormatOpenCLKernel::Run() { auto dst_mem_type = out_mem_type_; ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type); ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h index ee4801ea58..41eef5a5ee 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/to_format.h @@ -38,7 +38,6 @@ class ToFormatOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; size_t N_{1}; size_t H_{1}; size_t W_{1}; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc index d6b53ffc95..6a3c90cded 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.cc @@ -96,11 +96,13 @@ void TransposeOpenCLKernel::SetGlobalLocal() { size_t w = shapex[2]; size_t c = shapex[3]; size_t c4 = UP_DIV(c, 4); + local_size_ = {}; if (type == TransposeType::AXIS0312) { // NHWC -> NCHW - global_range_ = {UP_DIV(h, C4NUM), w, c4}; + global_size_ = {UP_DIV(h, C4NUM), w, c4}; } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC - global_range_ = {h, UP_DIV(w, C4NUM), c4}; + global_size_ = {h, UP_DIV(w, C4NUM), c4}; } + AlignGlobalLocal(global_size_, local_size_); } int TransposeOpenCLKernel::Run() { @@ -108,7 +110,7 @@ int TransposeOpenCLKernel::Run() { int arg_idx = 0; ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); - ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); + ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); return mindspore::lite::RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h index 1d893f4733..8b6f3c80d9 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/transpose.h @@ -41,7 +41,6 @@ class TransposeOpenCLKernel : public OpenCLKernel { void SetGlobalLocal() override; private: - cl::Kernel kernel_; TransposeType type{TransposeType::AXIS0312}; }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h index c96935afba..fdb8e274a5 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h @@ -16,8 +16,10 @@ #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ +#define MAX_PROFILING_TIME_MILLI_SECOND 10 * 1000 // 10 seconds #include +#include #include "src/lite_kernel.h" #include "include/errorcode.h" #include "src/runtime/opencl/opencl_runtime.h" @@ -137,6 +139,16 @@ struct GpuTensorInfo { size_t NDim{}; }; +struct BaseTuningParameter { + std::vector local_size; + friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) { + ostrm << "LocalSize:"; + for (auto i : a.local_size) { + ostrm << i << ","; + } + return ostrm; + } +}; class OpenCLKernel : public LiteKernel { public: OpenCLKernel(OpParameter *parameter, const std::vector &inputs, @@ -158,7 +170,9 @@ class OpenCLKernel : public LiteKernel { for (size_t i = 0; i < local.size(); i++) { MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i); } - + if (local.empty()) { + local_range_ = cl::NullRange; + } if (global.size() == 1) { global_range_ = cl::NDRange(internal_global_ws.at(0)); if (!local.empty()) { @@ -209,13 +223,135 @@ class OpenCLKernel : public LiteKernel { lite::opencl::MemType GetMemType() { return out_mem_type_; } void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } + virtual std::vector GenerateTuningParam() { + size_t ndim = global_size_.size(); + std::vector tuning_params = {}; + if (ndim == 0) { + MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null."; + return tuning_params; + } + BaseTuningParameter default_tuning_param = BaseTuningParameter(); + tuning_params.push_back(default_tuning_param); + std::vector max_work_items = ocl_runtime_->GetWorkItemSize(); + size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); + size_t MIN_WORKGROUP_SIZE = 8; + std::set candidate_x = GenerateLocalByGlobal(global_size_[0]); + std::set candidate_y = {1}; + std::set candidate_z = {1}; + if (ndim > 1) { + candidate_y = GenerateLocalByGlobal(global_size_[1]); + } + if (ndim > 2) { + candidate_z = GenerateLocalByGlobal(global_size_[2]); + } + for (auto x : candidate_x) { + if (x < max_work_items[0]) { + for (auto y : candidate_y) { + if (y < max_work_items[1]) { + for (auto z : candidate_z) { + auto group_size = x * y * z; + if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) { + BaseTuningParameter tuning_param = BaseTuningParameter(); + tuning_param.local_size = {x, y, z}; + tuning_params.push_back(tuning_param); + } + } + } + } + } + } + return tuning_params; + } + + virtual int AssignTuningParam(const BaseTuningParameter param) { + std::vector local_size_tmp = param.local_size; + if (local_size_tmp.size() > global_size_.size()) { + local_size_tmp = std::vector(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); + } + AlignGlobalLocal(global_size_, local_size_tmp); + return RET_OK; + } + + virtual int Tune() { + if (!ocl_runtime_->isProfiling()) { + MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling."; + return RET_OK; + } + lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode(); + if (mode == lite::opencl::TuningMode::DEFAULT) { + return RET_OK; + } + static const std::set FAST_MODE_OPS = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DepthwiseConv2D, + schema::PrimitiveType_DeConv2D}; + if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) { + return RET_OK; + } + auto tuning_params = GenerateTuningParam(); + if (tuning_params.empty()) { + MS_LOG(WARNING) << "Tuning param size is 0."; + return RET_OK; + } + int index = -1; + double min_time = MAX_PROFILING_TIME_MILLI_SECOND; + for (int i = 0; i < tuning_params.size(); i++) { + AssignTuningParam(tuning_params[i]); + auto ret = Run(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i]; + return ret; + } + double current_time = GetProfilingTimeMs(); + MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms"; + if (current_time < min_time) { + min_time = current_time; + index = i; + } + } + if (index != -1) { + MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time + << "ms"; + AssignTuningParam(tuning_params[index]); + } else { + MS_LOG(WARNING) << "Cannot find suitable param."; + } + return RET_OK; + } + + double GetProfilingTimeMs() { + if (!ocl_runtime_->isProfiling()) { + return MAX_PROFILING_TIME_MILLI_SECOND; + } + cl_ulong time_start; + cl_ulong time_end; + event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); + event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); + cl_ulong time_ns = time_end - time_start; + return static_cast(time_ns) * 1e-6; + } + protected: lite::opencl::OpenCLRuntime *ocl_runtime_; lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; cl::NDRange global_range_{cl::NullRange}; cl::NDRange local_range_{cl::NullRange}; - std::vector global_size_; // !!!To be deleted - std::vector local_size_; // !!!To be deleted + std::vector global_size_; + std::vector local_size_; + cl::Kernel kernel_; + cl::Event event_; + static std::set GenerateLocalByGlobal(size_t global_i) { + std::set local_ = {}; + int index = 1; + while (index < global_i) { + local_.insert(index); + index *= 2; + } + for (size_t i = 1; i < 16; i++) { + if (global_i % i == 0) { + local_.insert(i); + } + } + return local_; + } private: lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; @@ -233,8 +369,8 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector &input } auto ret = kernel->CheckSpecs(); if (ret != mindspore::lite::RET_OK) { - delete kernel; MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; + delete kernel; return nullptr; } return kernel; diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc index 3ed8bd95e5..8668e14790 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_subgraph.cc @@ -228,8 +228,17 @@ int OpenCLSubGraph::Init() { MS_LOG(ERROR) << "OpenCL prepare fail"; return ret; } - - MallocTensorWithReuse(); + auto opencl_exec = reinterpret_cast(executor_); + ocl_runtime_->SetProfiling(true); + ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true); + if (ret != RET_OK) { + MS_LOG(ERROR) << "Run opencl executor failed: " << ret; + return ret; + } + ocl_runtime_->SetProfiling(false); +#ifdef Debug + ocl_runtime_->SetProfiling(true); +#endif return RET_OK; } diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.cc b/mindspore/lite/src/runtime/opencl/opencl_executor.cc index 67e5fbb979..ce7118a34e 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.cc @@ -24,6 +24,12 @@ namespace mindspore::lite::opencl { int OpenCLExecutor::Run(std::vector &inputs, std::vector &outputs, std::vector &kernels, Allocator *allocator, const KernelCallBack &before, const KernelCallBack &after) { + return RunOrTune(inputs, outputs, kernels, allocator, before, after, false); +} + +int OpenCLExecutor::RunOrTune(std::vector &inputs, std::vector &outputs, + std::vector &kernels, Allocator *allocator, + const KernelCallBack &before, const KernelCallBack &after, bool is_tune) { int ret; kernel::LiteKernelUtil::InitTensorRefCount(kernels); for (auto *kernel : kernels) { @@ -57,14 +63,26 @@ int OpenCLExecutor::Run(std::vector &inputs, std::vector &ou return ret; } } + output->set_allocator(allocator_); } + if (is_tune) { + ret = op_kernel->Tune(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "tuning kernel failed, name: " << kernel->name(); + return ret; + } + } else { + ret = kernel->Run(); + if (ret != RET_OK) { + MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); + return ret; + } +#ifdef Debug + MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str() + << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms"; - ret = kernel->Run(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); - return ret; +#endif } - if (after != nullptr) { if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) { MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name(); diff --git a/mindspore/lite/src/runtime/opencl/opencl_executor.h b/mindspore/lite/src/runtime/opencl/opencl_executor.h index fcf692f19b..d76fcef69d 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_executor.h +++ b/mindspore/lite/src/runtime/opencl/opencl_executor.h @@ -34,6 +34,9 @@ class OpenCLExecutor : public Executor { int Run(std::vector &inputs, std::vector &outputs, std::vector &kernels, Allocator *allocator = nullptr, const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr) override; + int RunOrTune(std::vector &inputs, std::vector &outputs, + std::vector &kernels, Allocator *allocator = nullptr, + const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr, bool is_tune = false); protected: InnerContext *context = nullptr; diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 409ec1968c..5043def50a 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -230,12 +230,7 @@ int OpenCLRuntime::Init() { MS_LOG(INFO) << "Compute Unit: " << compute_units_; MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; -#ifdef Debug - const cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; -#else const cl_command_queue_properties properties = 0; -#endif - default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); if (ret != CL_SUCCESS) { delete device_; @@ -244,6 +239,16 @@ int OpenCLRuntime::Init() { return RET_ERROR; } + const cl_command_queue_properties profiling_properties = CL_QUEUE_PROFILING_ENABLE; + profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, profiling_properties, &ret); + if (ret != CL_SUCCESS) { + delete device_; + delete context_; + delete default_command_queue_; + MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret); + return RET_ERROR; + } + allocator_ = new (std::nothrow) OpenCLAllocator(this); if (allocator_ == nullptr) { delete device_; @@ -473,15 +478,17 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector } // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, - cl::CommandQueue *command_queue) { + cl::CommandQueue *command_queue, cl::Event *event) { if (command_queue == nullptr) { - command_queue = default_command_queue_; + if (profiling_) { + command_queue = profiling_command_queue_; + } else { + command_queue = default_command_queue_; + } } MS_ASSERT(local.size() == 0 || local.size() == global.size()); - - cl::Event event; cl_int ret = CL_SUCCESS; - ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event); + ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event); if (ret != CL_SUCCESS) { MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); return RET_ERROR; @@ -496,15 +503,9 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global } cnt++; MS_LOG(DEBUG) << "RunKernel success!"; -#ifdef Debug - event.wait(); - cl_ulong time_start; - cl_ulong time_end; - event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); - event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); - double nanoSeconds = time_end - time_start; - MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms"; -#endif + if (profiling_) { + event->wait(); + } return RET_OK; } // get gpu divce type diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.h b/mindspore/lite/src/runtime/opencl/opencl_runtime.h index ffa70254ac..47ebe126f4 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.h +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.h @@ -32,6 +32,7 @@ j* you may not use this file except in compliance with the License. namespace mindspore::lite::opencl { enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; +enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 }; struct GpuInfo { GpuType type = OTHER; @@ -117,7 +118,7 @@ class OpenCLRuntime { int RunKernel(const cl::Kernel &kernel, const std::vector &global, const std::vector &local, cl::CommandQueue *command_queue = nullptr); // !!!To be deleted int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, - cl::CommandQueue *command_queue = nullptr); + cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, bool sync = false) const; bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, @@ -139,10 +140,14 @@ class OpenCLRuntime { * @return max_work_group_size */ int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); + void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; } + TuningMode GetTuningMode() const { return tuning_mode_; } void InitGpuCache(); int LoadCache(const void *buf); void StoreCache(); + bool isProfiling() const { return profiling_; } + void SetProfiling(bool profiling) { profiling_ = profiling; } private: static OpenCLRuntime *GetInstance(); @@ -158,6 +163,7 @@ class OpenCLRuntime { static size_t instance_count_; static OpenCLRuntime *ocl_runtime_instance_; cl::CommandQueue *default_command_queue_{nullptr}; + cl::CommandQueue *profiling_command_queue_{nullptr}; cl::Context *context_{nullptr}; cl::Device *device_{nullptr}; OpenCLAllocator *allocator_{nullptr}; @@ -181,6 +187,8 @@ class OpenCLRuntime { const std::string version_{"V0.1"}; bool need_write_{false}; bool enable_cache_{false}; + TuningMode tuning_mode_{TuningMode::DEFAULT}; + bool profiling_{false}; }; class OpenCLRuntimeWrapper {