| @@ -92,8 +92,9 @@ void ActivationOpenCLKernel::SetConstArgs() { | |||||
| } | } | ||||
| void ActivationOpenCLKernel::SetGlobalLocal() { | void ActivationOpenCLKernel::SetGlobalLocal() { | ||||
| local_range_ = cl::NullRange; | |||||
| global_range_ = {outShape.width, outShape.height}; | |||||
| local_size_ = {}; | |||||
| global_size_ = {outShape.width, outShape.height}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ActivationOpenCLKernel::Run() { | int ActivationOpenCLKernel::Run() { | ||||
| @@ -101,7 +102,7 @@ int ActivationOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| if (ret != RET_OK) { | if (ret != RET_OK) { | ||||
| MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; | MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -42,7 +42,6 @@ class ActivationOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| static std::string GetActTypeString(int act_type); | static std::string GetActTypeString(int act_type); | ||||
| cl::Kernel kernel_; | |||||
| int type_; | int type_; | ||||
| float alpha_; | float alpha_; | ||||
| GpuTensorInfo outShape = GpuTensorInfo(nullptr); | GpuTensorInfo outShape = GpuTensorInfo(nullptr); | ||||
| @@ -115,9 +115,9 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() { | |||||
| default: // 3 | default: // 3 | ||||
| break; | break; | ||||
| } | } | ||||
| std::vector<size_t> local = {1, 1, 1}; | |||||
| std::vector<size_t> global = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; | |||||
| global_size_ = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ArgMinMaxOpenCLKernel::InitWeights() { | int ArgMinMaxOpenCLKernel::InitWeights() { | ||||
| @@ -153,7 +153,7 @@ int ArgMinMaxOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -38,9 +38,9 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel { | |||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int InitWeights() override; | int InitWeights() override; | ||||
| int Tune() override { return lite::RET_OK; } | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| void *buff_{nullptr}; | void *buff_{nullptr}; | ||||
| void *ids_{nullptr}; | void *ids_{nullptr}; | ||||
| GpuTensorInfo im_in_{GpuTensorInfo(nullptr)}; | GpuTensorInfo im_in_{GpuTensorInfo(nullptr)}; | ||||
| @@ -124,23 +124,24 @@ int ArithmeticOpenCLKernel::CheckSpecs() { | |||||
| void ArithmeticOpenCLKernel::SetGlobalLocal() { | void ArithmeticOpenCLKernel::SetGlobalLocal() { | ||||
| if (element_flag_) { | if (element_flag_) { | ||||
| local_range_ = {}; | |||||
| local_size_ = {}; | |||||
| auto out_shape = out_tensors_[0]->shape(); | auto out_shape = out_tensors_[0]->shape(); | ||||
| if (out_shape.size() == 2) { | if (out_shape.size() == 2) { | ||||
| size_t H = out_shape[0]; | size_t H = out_shape[0]; | ||||
| size_t W = UP_DIV(out_shape[1], C4NUM); | size_t W = UP_DIV(out_shape[1], C4NUM); | ||||
| global_range_ = {W, H}; | |||||
| global_size_ = {W, H}; | |||||
| } else { | } else { | ||||
| size_t H = out_shape[0] * out_shape[1]; | size_t H = out_shape[0] * out_shape[1]; | ||||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | ||||
| global_range_ = {W, H}; | |||||
| global_size_ = {W, H}; | |||||
| } | } | ||||
| } else { | } else { | ||||
| local_range_ = {}; | |||||
| local_size_ = {}; | |||||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | ||||
| global_range_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||||
| global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||||
| } | } | ||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ArithmeticOpenCLKernel::InitWeights() { | int ArithmeticOpenCLKernel::InitWeights() { | ||||
| @@ -269,7 +270,7 @@ int ArithmeticOpenCLKernel::Run() { | |||||
| auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -39,7 +39,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| bool element_flag_{true}; | bool element_flag_{true}; | ||||
| float activation_min_{-FLT_MAX}; | float activation_min_{-FLT_MAX}; | ||||
| float activation_max_{FLT_MAX}; | float activation_max_{FLT_MAX}; | ||||
| @@ -131,10 +131,10 @@ void ArithmeticSelfOpenCLKernel::SetGlobalLocal() { | |||||
| OC = UP_DIV(output_shape[1], C4NUM); | OC = UP_DIV(output_shape[1], C4NUM); | ||||
| } | } | ||||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | ||||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||||
| std::vector<size_t> global = {OH, OW, OC}; | |||||
| ArithmeticSelfGetWorkGroup(global, &local, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; // init local | |||||
| global_size_ = {OH, OW, OC}; | |||||
| ArithmeticSelfGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ArithmeticSelfOpenCLKernel::Prepare() { | int ArithmeticSelfOpenCLKernel::Prepare() { | ||||
| @@ -159,7 +159,7 @@ int ArithmeticSelfOpenCLKernel::Run() { | |||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -43,7 +43,6 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); | void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); | ||||
| cl_int4 output_shape_ = {}; | cl_int4 output_shape_ = {}; | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -75,9 +75,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() { | |||||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | ||||
| std::vector<int> out_shape = out_tensors_[0]->shape(); | std::vector<int> out_shape = out_tensors_[0]->shape(); | ||||
| cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]}; | cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]}; | ||||
| std::vector<size_t> local = {1, 1, 1}; | |||||
| std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; | |||||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int BatchToSpaceNDOpenCLKernel::Prepare() { | int BatchToSpaceNDOpenCLKernel::Prepare() { | ||||
| @@ -103,7 +103,7 @@ int BatchToSpaceNDOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -37,9 +37,9 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel { | |||||
| int CheckSpecs() override; | int CheckSpecs() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Tune() override { return lite::RET_OK; } | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif | #endif | ||||
| @@ -62,10 +62,10 @@ void BatchNormOpenCLKernel::SetGlobalLocal() { | |||||
| uint32_t OC = UP_DIV(output_shape[3], C4NUM); | uint32_t OC = UP_DIV(output_shape[3], C4NUM); | ||||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | ||||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||||
| std::vector<size_t> global = {OH, OW, OC}; | |||||
| BatchNormGetWorkGroup(global, &local, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; // init local | |||||
| global_size_ = {OH, OW, OC}; | |||||
| BatchNormGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int BatchNormOpenCLKernel::Prepare() { | int BatchNormOpenCLKernel::Prepare() { | ||||
| @@ -91,7 +91,7 @@ int BatchNormOpenCLKernel::Run() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -40,7 +40,6 @@ class BiasAddOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| cl_int4 GetGlobalshape(); | cl_int4 GetGlobalshape(); | ||||
| cl::Kernel kernel_; | |||||
| void *BiasAdd_{nullptr}; | void *BiasAdd_{nullptr}; | ||||
| int in_size_{}; | int in_size_{}; | ||||
| int out_size_{}; | int out_size_{}; | ||||
| @@ -73,10 +73,10 @@ void CastOpenCLKernel::SetGlobalLocal() { | |||||
| uint32_t OC = UP_DIV(input_shape[3], C4NUM); | uint32_t OC = UP_DIV(input_shape[3], C4NUM); | ||||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | ||||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||||
| std::vector<size_t> global = {OH, OW, OC}; | |||||
| CastGetWorkGroup(global, &local, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; // init local | |||||
| global_size_ = {OH, OW, OC}; | |||||
| CastGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int CastOpenCLKernel::Prepare() { | int CastOpenCLKernel::Prepare() { | ||||
| @@ -100,7 +100,7 @@ int CastOpenCLKernel::Run() { | |||||
| int arg_cn = 0; | int arg_cn = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -41,8 +41,6 @@ class CastOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| int GetKernelName(std::string *kernel_name, CastParameter *param); | int GetKernelName(std::string *kernel_name, CastParameter *param); | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -132,17 +132,17 @@ void ConcatOpenCLKernel::SetGlobalLocal() { | |||||
| if (axis_ == 3 && !Align_) { | if (axis_ == 3 && !Align_) { | ||||
| OH = out_shape_.s[0] * out_shape_.s[1]; | OH = out_shape_.s[0] * out_shape_.s[1]; | ||||
| OW = out_shape_.s[2]; | OW = out_shape_.s[2]; | ||||
| global = {OH, OW, 1}; | |||||
| local = {1, 1, 1}; | |||||
| global_size_ = {OH, OW, 1}; | |||||
| local_size_ = {1, 1, 1}; | |||||
| } else { | } else { | ||||
| OH = out_shape_.s[0] * out_shape_.s[1]; | OH = out_shape_.s[0] * out_shape_.s[1]; | ||||
| OW = out_shape_.s[2]; | OW = out_shape_.s[2]; | ||||
| OC = out_shape_.s[3]; | OC = out_shape_.s[3]; | ||||
| global = {OH, OW, OC}; | |||||
| local = {1, 1, 1}; | |||||
| global_size_ = {OH, OW, OC}; | |||||
| local_size_ = {1, 1, 1}; | |||||
| } | } | ||||
| ConcatGetWorkGroup(global, &local, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ConcatOpenCLKernel::Prepare() { | int ConcatOpenCLKernel::Prepare() { | ||||
| @@ -196,7 +196,7 @@ int ConcatOpenCLKernel::Run() { | |||||
| MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); | MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -53,7 +53,6 @@ class ConcatOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| int RunAxis0(); | int RunAxis0(); | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -93,7 +93,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||||
| std::string program_name = "winograd"; | std::string program_name = "winograd"; | ||||
| ocl_runtime_->LoadSource(program_name, winograd_source); | ocl_runtime_->LoadSource(program_name, winograd_source); | ||||
| ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); | ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); | ||||
| ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution"); | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); | |||||
| ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); | ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); | ||||
| } else { | } else { | ||||
| SetBlockSize(); | SetBlockSize(); | ||||
| @@ -101,7 +101,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||||
| std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + | std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + | ||||
| std::to_string(block_size_.C); | std::to_string(block_size_.C); | ||||
| ocl_runtime_->LoadSource(program_name, conv2d_source); | ocl_runtime_->LoadSource(program_name, conv2d_source); | ||||
| ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name); | |||||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||||
| } | } | ||||
| // allocate winograd memory | // allocate winograd memory | ||||
| @@ -329,7 +329,9 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { | |||||
| local_h = std::min(global_h, local_hw); | local_h = std::min(global_h, local_hw); | ||||
| local_w = std::min(local_hw / local_h, global_w); | local_w = std::min(local_hw / local_h, global_w); | ||||
| } | } | ||||
| AlignGlobalLocal({global_h, global_w, global_c}, {local_h, local_w, local_c}); | |||||
| global_size_ = {global_h, global_w, global_c}; | |||||
| local_size_ = {local_h, local_w, local_c}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| } | } | ||||
| @@ -355,11 +357,11 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||||
| arg_cn = 0; | arg_cn = 0; | ||||
| cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; | cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; | ||||
| cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; | cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; | ||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, conv_out_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape); | |||||
| arg_cn = 2; | arg_cn = 2; | ||||
| cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; | cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; | ||||
| @@ -373,30 +375,37 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||||
| cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; | cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; | ||||
| cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; | cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; | ||||
| cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; | cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; | ||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, act_type); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); | |||||
| } | } | ||||
| } | } | ||||
| int Conv2DOpenCLKernel::Tune() { | |||||
| if (use_winograd_) { | |||||
| return RET_OK; | |||||
| } | |||||
| return OpenCLKernel::Tune(); | |||||
| } | |||||
| int Conv2DOpenCLKernel::Run() { | int Conv2DOpenCLKernel::Run() { | ||||
| if (use_winograd_) { | if (use_winograd_) { | ||||
| ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); | ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); | ||||
| ocl_runtime_->RunKernel(kernel_conv_, global_conv_, local_conv_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_); | |||||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); | ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); | ||||
| } else { | } else { | ||||
| ocl_runtime_->SetKernelArg(kernel_conv_, 0, in_tensors_.front()->data_c()); | |||||
| ocl_runtime_->SetKernelArg(kernel_conv_, 1, out_tensors_.front()->data_c()); | |||||
| ocl_runtime_->RunKernel(kernel_conv_, global_range_, local_range_); | |||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); | |||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| } | } | ||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -42,6 +42,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| int Run() override; | int Run() override; | ||||
| int Tune() override; | |||||
| private: | private: | ||||
| void SetBlockSize(); | void SetBlockSize(); | ||||
| @@ -60,7 +61,6 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||||
| } | } | ||||
| cl::Kernel kernel_4x4to36_; | cl::Kernel kernel_4x4to36_; | ||||
| cl::Kernel kernel_conv_; | |||||
| cl::Kernel kernel_36to4x4_; | cl::Kernel kernel_36to4x4_; | ||||
| cl::NDRange global_4x4to36_, local_4x4to36_; | cl::NDRange global_4x4to36_, local_4x4to36_; | ||||
| cl::NDRange global_conv_, local_conv_; | cl::NDRange global_conv_, local_conv_; | ||||
| @@ -193,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() { | |||||
| int arg_cnt = 0; | int arg_cnt = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -40,7 +40,6 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| void *padWeight_{nullptr}; | void *padWeight_{nullptr}; | ||||
| void *bias_{nullptr}; | void *bias_{nullptr}; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| @@ -175,24 +175,23 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() { | |||||
| void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { | void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { | ||||
| // set global | // set global | ||||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); | size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); | ||||
| std::vector<size_t> global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||||
| global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||||
| // set local | // set local | ||||
| const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | ||||
| int z = global_size[0]; | |||||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); | |||||
| int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z))); | |||||
| std::vector<size_t> local_size = | |||||
| std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||||
| int z = global_size_[0]; | |||||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8)); | |||||
| int x = std::max(1, std::min(static_cast<int>(global_size_[1]), max_group_size / (y * z))); | |||||
| local_size_ = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size, local_size); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int DepthwiseConv2dOpenCLKernel::Run() { | int DepthwiseConv2dOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | MS_LOG(DEBUG) << this->name() << " Running!"; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -42,7 +42,6 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| void *packed_weight_{nullptr}; | void *packed_weight_{nullptr}; | ||||
| void *bias_data_{nullptr}; | void *bias_data_{nullptr}; | ||||
| cl::Kernel kernel_; | |||||
| std::vector<int> block_size_{2, 2, 1}; | std::vector<int> block_size_{2, 2, 1}; | ||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -43,9 +43,6 @@ class FillOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| int RunFill(); | int RunFill(); | ||||
| int RunShape(); | int RunShape(); | ||||
| cl::Kernel kernel_; | |||||
| private: | |||||
| float default_{0.0f}; | float default_{0.0f}; | ||||
| }; | }; | ||||
| @@ -179,9 +179,9 @@ int FullConnectionOpenCLKernel::InitWeights() { | |||||
| } | } | ||||
| void FullConnectionOpenCLKernel::SetGlobalLocal() { | void FullConnectionOpenCLKernel::SetGlobalLocal() { | ||||
| std::vector<size_t> local = {32, 4, 1}; | |||||
| std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; | |||||
| AlignGlobalLocal(global, local); | |||||
| local_size_ = {32, 4, 1}; | |||||
| global_size_ = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| void FullConnectionOpenCLKernel::SetConstArgs() { | void FullConnectionOpenCLKernel::SetConstArgs() { | ||||
| @@ -202,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() { | |||||
| int arg_count = 0; | int arg_count = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -38,9 +38,9 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { | |||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Init() override; | int Init() override; | ||||
| int Tune() override { return lite::RET_OK; } | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| void *padWeight_{nullptr}; | void *padWeight_{nullptr}; | ||||
| void *bias_{nullptr}; | void *bias_{nullptr}; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| @@ -93,9 +93,9 @@ void GatherOpenCLKernel::SetConstArgs() { | |||||
| void GatherOpenCLKernel::SetGlobalLocal() { | void GatherOpenCLKernel::SetGlobalLocal() { | ||||
| auto output = GpuTensorInfo(out_tensors_.front()); | auto output = GpuTensorInfo(out_tensors_.front()); | ||||
| std::vector<size_t> local = {1, 1, 1}; | |||||
| std::vector<size_t> global = {output.W, output.N * output.H, output.Slice}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; | |||||
| global_size_ = {output.W, output.N * output.H, output.Slice}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int GatherOpenCLKernel::Prepare() { | int GatherOpenCLKernel::Prepare() { | ||||
| @@ -155,7 +155,7 @@ int GatherOpenCLKernel::Run() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF); | ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -38,12 +38,12 @@ class GatherOpenCLKernel : public OpenCLKernel { | |||||
| int CheckSpecs() override; | int CheckSpecs() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Tune() override { return lite::RET_OK; } | |||||
| protected: | protected: | ||||
| int UpdateWeights(); | int UpdateWeights(); | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| int32_t *indices_data_{nullptr}; | int32_t *indices_data_{nullptr}; | ||||
| int axis_ = {0}; | int axis_ = {0}; | ||||
| }; | }; | ||||
| @@ -1,50 +0,0 @@ | |||||
| /** | |||||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||||
| * | |||||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| * you may not use this file except in compliance with the License. | |||||
| * You may obtain a copy of the License at | |||||
| * | |||||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||||
| * | |||||
| * Unless required by applicable law or agreed to in writing, software | |||||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| * See the License for the specific language governing permissions and | |||||
| * limitations under the License. | |||||
| */ | |||||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ | |||||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ | |||||
| #include <vector> | |||||
| #include "mindspore/lite/nnacl/fp32/activation_fp32.h" | |||||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||||
| namespace mindspore::kernel { | |||||
| class HswishOpenCLKernel : public OpenCLKernel { | |||||
| public: | |||||
| HswishOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||||
| const std::vector<lite::Tensor *> &outputs) | |||||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||||
| ~HswishOpenCLKernel() override = default; | |||||
| int Init() override; | |||||
| int Run() override; | |||||
| private: | |||||
| int InferShapeTo4D(); | |||||
| cl::Kernel kernel_; | |||||
| private: | |||||
| size_t N_{1}; | |||||
| size_t H_{1}; | |||||
| size_t W_{1}; | |||||
| size_t C_{1}; | |||||
| }; | |||||
| } // namespace mindspore::kernel | |||||
| #endif | |||||
| @@ -137,11 +137,11 @@ int MatMulOpenCLKernel::InitWeights() { | |||||
| void MatMulOpenCLKernel::SetGlobalLocal() { | void MatMulOpenCLKernel::SetGlobalLocal() { | ||||
| // local size should less than MAX_GROUP_SIZE | // local size should less than MAX_GROUP_SIZE | ||||
| std::vector<size_t> local = {32, 4, 1}; | |||||
| std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||||
| static_cast<size_t>(outShape[2])}; | |||||
| AlignGlobalLocal(global, local); | |||||
| local_size_ = {32, 4, 1}; | |||||
| global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||||
| static_cast<size_t>(outShape[2])}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| void MatMulOpenCLKernel::SetConstArgs() { | void MatMulOpenCLKernel::SetConstArgs() { | ||||
| @@ -158,7 +158,7 @@ int MatMulOpenCLKernel::Run() { | |||||
| int arg_count = 0; | int arg_count = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -37,9 +37,9 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||||
| int InitWeights() override; | int InitWeights() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Tune() override { return lite::RET_OK; } | |||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| void *padWeight_{nullptr}; | void *padWeight_{nullptr}; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| bool transposeA{false}; | bool transposeA{false}; | ||||
| @@ -85,14 +85,16 @@ void OneHotOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C)); | ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C)); | ||||
| } | } | ||||
| void OneHotOpenCLKernel::SetGlobalLocal() { | void OneHotOpenCLKernel::SetGlobalLocal() { | ||||
| global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||||
| local_size_ = {}; | |||||
| global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int OneHotOpenCLKernel::Run() { | int OneHotOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | MS_LOG(DEBUG) << this->name() << " Running!"; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -39,7 +39,6 @@ class OneHotOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| int depth_{0}; | int depth_{0}; | ||||
| float on_value_{1.0f}; | float on_value_{1.0f}; | ||||
| float off_value_{0.0f}; | float off_value_{0.0f}; | ||||
| @@ -97,14 +97,15 @@ void PadOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_)); | ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_)); | ||||
| AlignGlobalLocal({output.N * output.H, output.W, output.Slice}, {8, 4, 1}); | |||||
| local_size_ = {8, 4, 1}; | |||||
| global_size_ = {output.N * output.H, output.W, output.Slice}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int PadOpenCLKernel::Run() { | int PadOpenCLKernel::Run() { | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -41,7 +41,6 @@ class PadOpenCLKernel : public OpenCLKernel { | |||||
| int Run() override; | int Run() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| PadParameter *param_; | PadParameter *param_; | ||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -86,8 +86,9 @@ void PoolingOpenCLKernel::SetGlobalLocal() { | |||||
| const size_t global_x = out_tensors_[0]->shape()[1]; | const size_t global_x = out_tensors_[0]->shape()[1]; | ||||
| const size_t global_y = out_tensors_[0]->shape()[2]; | const size_t global_y = out_tensors_[0]->shape()[2]; | ||||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | ||||
| global_range_ = {global_z, global_y, global_x}; | |||||
| local_range_ = {}; | |||||
| global_size_ = {global_z, global_y, global_x}; | |||||
| local_size_ = {}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| void PoolingOpenCLKernel::SetConstArgs() { | void PoolingOpenCLKernel::SetConstArgs() { | ||||
| @@ -111,7 +112,7 @@ int PoolingOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -39,7 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| PoolingParameter *parameter_; | PoolingParameter *parameter_; | ||||
| cl::Kernel kernel_; | |||||
| std::vector<size_t> local_size_; | std::vector<size_t> local_size_; | ||||
| std::vector<size_t> global_size_; | std::vector<size_t> global_size_; | ||||
| }; | }; | ||||
| @@ -37,7 +37,6 @@ class PowerOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| int InferShapeTo4D(); | int InferShapeTo4D(); | ||||
| cl::Kernel kernel_; | |||||
| private: | private: | ||||
| size_t N_{1}; | size_t N_{1}; | ||||
| @@ -37,7 +37,6 @@ class PReluOpenCLKernel : public OpenCLKernel { | |||||
| int InitWeights() override; | int InitWeights() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| int batch_size_{}; | int batch_size_{}; | ||||
| int C_{}; | int C_{}; | ||||
| @@ -148,15 +148,22 @@ void ReduceOpenCLKernel::SetGlobalLocal() { | |||||
| int h = shapex[1]; | int h = shapex[1]; | ||||
| int c = shapex[3]; | int c = shapex[3]; | ||||
| int c4 = UP_DIV(c, C4NUM); | int c4 = UP_DIV(c, C4NUM); | ||||
| std::vector<size_t> local = {}; | |||||
| local_size_ = {}; | |||||
| if (use_local_) { | if (use_local_) { | ||||
| local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; | |||||
| local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; | |||||
| } | } | ||||
| std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1}; | |||||
| global_size_ = {static_cast<size_t>(c4), 1, 1}; | |||||
| if (wc_reduce_) { | if (wc_reduce_) { | ||||
| global = {static_cast<size_t>(h), 1, 1}; | |||||
| global_size_ = {static_cast<size_t>(h), 1, 1}; | |||||
| } | } | ||||
| AlignGlobalLocal(global, local); | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | |||||
| int ReduceOpenCLKernel::Tune() { | |||||
| if (use_local_) { | |||||
| return RET_OK; | |||||
| } | |||||
| return OpenCLKernel::Tune(); | |||||
| } | } | ||||
| int ReduceOpenCLKernel::Run() { | int ReduceOpenCLKernel::Run() { | ||||
| @@ -164,7 +171,7 @@ int ReduceOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -36,11 +36,11 @@ class ReduceOpenCLKernel : public OpenCLKernel { | |||||
| int CheckSpecs() override; | int CheckSpecs() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Tune() override; | |||||
| private: | private: | ||||
| cl_float4 GenC4Mask(); | cl_float4 GenC4Mask(); | ||||
| static std::string GetReduceTypeStr(int type); | static std::string GetReduceTypeStr(int type); | ||||
| cl::Kernel kernel_; | |||||
| GpuTensorInfo outShape = GpuTensorInfo(nullptr); | GpuTensorInfo outShape = GpuTensorInfo(nullptr); | ||||
| bool use_local_{false}; | bool use_local_{false}; | ||||
| bool wc_reduce_{false}; | bool wc_reduce_{false}; | ||||
| @@ -55,9 +55,9 @@ void ReshapeOpenCLKernel::SetConstArgs() { | |||||
| void ReshapeOpenCLKernel::SetGlobalLocal() { | void ReshapeOpenCLKernel::SetGlobalLocal() { | ||||
| auto out = GpuTensorInfo(out_tensors_.front()); | auto out = GpuTensorInfo(out_tensors_.front()); | ||||
| std::vector<size_t> local = {}; | |||||
| std::vector<size_t> global{out.width, out.height}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {}; | |||||
| global_size_ = {out.width, out.height}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ReshapeOpenCLKernel::Prepare() { | int ReshapeOpenCLKernel::Prepare() { | ||||
| @@ -81,7 +81,7 @@ int ReshapeOpenCLKernel::Run() { | |||||
| MS_LOG(DEBUG) << this->name() << " Running!"; | MS_LOG(DEBUG) << this->name() << " Running!"; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -37,7 +37,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -100,9 +100,10 @@ void ResizeOpenCLKernel::SetConstArgs() { | |||||
| } | } | ||||
| void ResizeOpenCLKernel::SetGlobalLocal() { | void ResizeOpenCLKernel::SetGlobalLocal() { | ||||
| local_range_ = {}; | |||||
| local_size_ = {}; | |||||
| auto out_shape = GpuTensorInfo(out_tensors_[0]); | auto out_shape = GpuTensorInfo(out_tensors_[0]); | ||||
| global_range_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||||
| global_size_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ResizeOpenCLKernel::Run() { | int ResizeOpenCLKernel::Run() { | ||||
| @@ -110,7 +111,7 @@ int ResizeOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -40,7 +40,6 @@ class ResizeOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| float getResizeScaleFactor(int input_size, int output_size); | float getResizeScaleFactor(int input_size, int output_size); | ||||
| cl::Kernel kernel_; | |||||
| bool alignCorner{false}; | bool alignCorner{false}; | ||||
| bool preserveAspectRatio{false}; | bool preserveAspectRatio{false}; | ||||
| }; | }; | ||||
| @@ -37,7 +37,6 @@ class ScaleOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| void Image2dGetWorkGroupSize(); | void Image2dGetWorkGroupSize(); | ||||
| cl::Kernel kernel_; | |||||
| bool weight_vector_flag_{true}; | bool weight_vector_flag_{true}; | ||||
| bool broadcast_flag_{false}; | bool broadcast_flag_{false}; | ||||
| bool broadcast_H_flag_{false}; | bool broadcast_H_flag_{false}; | ||||
| @@ -116,6 +116,13 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() { | |||||
| AlignGlobalLocal(global_size_, local_size_); | AlignGlobalLocal(global_size_, local_size_); | ||||
| } | } | ||||
| int SoftmaxOpenCLKernel::Tune() { | |||||
| if (onexone_flag_) { | |||||
| return RET_OK; | |||||
| } | |||||
| return OpenCLKernel::Tune(); | |||||
| } | |||||
| void SoftmaxOpenCLKernel::SetConstArgs() { | void SoftmaxOpenCLKernel::SetConstArgs() { | ||||
| int arg_idx = 2; | int arg_idx = 2; | ||||
| int channel = out_shape.C; | int channel = out_shape.C; | ||||
| @@ -133,8 +140,7 @@ int SoftmaxOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| // run opengl kernel | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return lite::RET_OK; | return lite::RET_OK; | ||||
| } | } | ||||
| @@ -38,6 +38,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||||
| int CheckSpecs() override; | int CheckSpecs() override; | ||||
| void SetConstArgs() override; | void SetConstArgs() override; | ||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| int Tune() override; | |||||
| private: | private: | ||||
| int InitGlobalSize(); | int InitGlobalSize(); | ||||
| @@ -45,7 +46,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||||
| int SetWorkGroupSize(); | int SetWorkGroupSize(); | ||||
| std::vector<float> GetMaskForLastChannel(int channels); | std::vector<float> GetMaskForLastChannel(int channels); | ||||
| cl::Kernel kernel_; | |||||
| SoftmaxParameter *parameter_; | SoftmaxParameter *parameter_; | ||||
| bool onexone_flag_{false}; | bool onexone_flag_{false}; | ||||
| std::vector<size_t> local_size_; | std::vector<size_t> local_size_; | ||||
| @@ -80,9 +80,9 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() { | |||||
| void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { | void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { | ||||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | ||||
| cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; | cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; | ||||
| std::vector<size_t> local = {1, 1, 1}; | |||||
| std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| local_size_ = {1, 1, 1}; | |||||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int SpaceToBatchNDOpenCLKernel::Prepare() { | int SpaceToBatchNDOpenCLKernel::Prepare() { | ||||
| @@ -109,7 +109,7 @@ int SpaceToBatchNDOpenCLKernel::Run() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -39,7 +39,6 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| #endif | #endif | ||||
| @@ -69,7 +69,9 @@ void SpaceToDepthOpenCLKernel::SetConstArgs() { | |||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size); | ||||
| } | } | ||||
| void SpaceToDepthOpenCLKernel::SetGlobalLocal() { | void SpaceToDepthOpenCLKernel::SetGlobalLocal() { | ||||
| global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||||
| local_size_ = {}; | |||||
| global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int SpaceToDepthOpenCLKernel::Run() { | int SpaceToDepthOpenCLKernel::Run() { | ||||
| @@ -77,7 +79,7 @@ int SpaceToDepthOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -38,7 +38,6 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr); | GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr); | ||||
| GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr); | GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr); | ||||
| }; | }; | ||||
| @@ -136,11 +136,11 @@ void SparseToDenseOpenCLKernel::SetConstArgs() { | |||||
| } | } | ||||
| void SparseToDenseOpenCLKernel::SetGlobalLocal() { | void SparseToDenseOpenCLKernel::SetGlobalLocal() { | ||||
| std::vector<size_t> local = {1, 1}; | |||||
| local_size_ = {1, 1}; | |||||
| size_t OH = n_ * h_; | size_t OH = n_ * h_; | ||||
| size_t OW = w_ * UP_DIV(c_, C4NUM); | size_t OW = w_ * UP_DIV(c_, C4NUM); | ||||
| std::vector<size_t> global = {OH, OW}; | |||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| global_size_ = {OH, OW}; | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int SparseToDenseOpenCLKernel::Prepare() { | int SparseToDenseOpenCLKernel::Prepare() { | ||||
| @@ -209,7 +209,7 @@ int SparseToDenseOpenCLKernel::Run() { | |||||
| } else { | } else { | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); | ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); | ||||
| } | } | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -43,7 +43,6 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel { | |||||
| int InitOutputToDefault(); | int InitOutputToDefault(); | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| // bool IndicesIsScalar{false}; | // bool IndicesIsScalar{false}; | ||||
| bool enable_fp16_{false}; | bool enable_fp16_{false}; | ||||
| float default_{0.0f}; | float default_{0.0f}; | ||||
| @@ -44,7 +44,6 @@ class StackOpenCLKernel : public OpenCLKernel { | |||||
| int InferOutTensorShapeTo4D(cl_int4 *output_shape); | int InferOutTensorShapeTo4D(cl_int4 *output_shape); | ||||
| cl::Kernel kernel_; | |||||
| int axis_{0}; | int axis_{0}; | ||||
| size_t N_{1}; | size_t N_{1}; | ||||
| size_t H_{1}; | size_t H_{1}; | ||||
| @@ -164,24 +164,24 @@ void StridedSliceOpenCLKernel::SetConstArgs() { | |||||
| void StridedSliceOpenCLKernel::SetGlobalLocal() { | void StridedSliceOpenCLKernel::SetGlobalLocal() { | ||||
| auto output_info = GpuTensorInfo(out_tensors_.front()); | auto output_info = GpuTensorInfo(out_tensors_.front()); | ||||
| std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice}; | |||||
| global_size_ = {output_info.N * output_info.H, output_info.W, output_info.Slice}; | |||||
| const int max_divider = 8; | const int max_divider = 8; | ||||
| auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | ||||
| size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider); | |||||
| size_t local_c = GetMaxDivisorStrategy0(global_size_[2], max_divider); | |||||
| local_c = std::max<size_t>(local_c, 1); | local_c = std::max<size_t>(local_c, 1); | ||||
| size_t local_hw = max_work_group_size / local_c; | size_t local_hw = max_work_group_size / local_c; | ||||
| size_t local_h = std::min(UP_DIV(global[0], 2), local_hw); | |||||
| size_t local_w = std::min(local_hw / local_h, global[1]); | |||||
| std::vector<size_t> local = {local_h, local_w, local_c}; | |||||
| AlignGlobalLocal(global, local); | |||||
| size_t local_h = std::min(UP_DIV(global_size_[0], 2), local_hw); | |||||
| size_t local_w = std::min(local_hw / local_h, global_size_[1]); | |||||
| local_size_ = {local_h, local_w, local_c}; | |||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int StridedSliceOpenCLKernel::Run() { | int StridedSliceOpenCLKernel::Run() { | ||||
| MS_LOG(DEBUG) << this->name() << " Running! "; | MS_LOG(DEBUG) << this->name() << " Running! "; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -42,7 +42,6 @@ class StridedSliceOpenCLKernel : public OpenCLKernel { | |||||
| private: | private: | ||||
| int InitConstArgs(); | int InitConstArgs(); | ||||
| cl::Kernel kernel_; | |||||
| cl_int4 input_shape_{}; | cl_int4 input_shape_{}; | ||||
| cl_int4 output_shape_{}; | cl_int4 output_shape_{}; | ||||
| cl_int2 io_slices_{}; | cl_int2 io_slices_{}; | ||||
| @@ -51,13 +51,13 @@ void ToFormatOpenCLKernel::SetConstArgs() { | |||||
| } | } | ||||
| void ToFormatOpenCLKernel::SetGlobalLocal() { | void ToFormatOpenCLKernel::SetGlobalLocal() { | ||||
| std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||||
| std::vector<size_t> local = {8, 16, 3}; | |||||
| global_size_ = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||||
| local_size_ = {8, 16, 3}; | |||||
| size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | ||||
| if (max_work_group_size < 384) { | if (max_work_group_size < 384) { | ||||
| local[2] = 1; | |||||
| local_size_[2] = 1; | |||||
| } | } | ||||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int ToFormatOpenCLKernel::Prepare() { | int ToFormatOpenCLKernel::Prepare() { | ||||
| @@ -97,7 +97,7 @@ int ToFormatOpenCLKernel::Run() { | |||||
| auto dst_mem_type = out_mem_type_; | auto dst_mem_type = out_mem_type_; | ||||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type); | ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type); | ||||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type); | ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -38,7 +38,6 @@ class ToFormatOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| size_t N_{1}; | size_t N_{1}; | ||||
| size_t H_{1}; | size_t H_{1}; | ||||
| size_t W_{1}; | size_t W_{1}; | ||||
| @@ -96,11 +96,13 @@ void TransposeOpenCLKernel::SetGlobalLocal() { | |||||
| size_t w = shapex[2]; | size_t w = shapex[2]; | ||||
| size_t c = shapex[3]; | size_t c = shapex[3]; | ||||
| size_t c4 = UP_DIV(c, 4); | size_t c4 = UP_DIV(c, 4); | ||||
| local_size_ = {}; | |||||
| if (type == TransposeType::AXIS0312) { // NHWC -> NCHW | if (type == TransposeType::AXIS0312) { // NHWC -> NCHW | ||||
| global_range_ = {UP_DIV(h, C4NUM), w, c4}; | |||||
| global_size_ = {UP_DIV(h, C4NUM), w, c4}; | |||||
| } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC | } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC | ||||
| global_range_ = {h, UP_DIV(w, C4NUM), c4}; | |||||
| global_size_ = {h, UP_DIV(w, C4NUM), c4}; | |||||
| } | } | ||||
| AlignGlobalLocal(global_size_, local_size_); | |||||
| } | } | ||||
| int TransposeOpenCLKernel::Run() { | int TransposeOpenCLKernel::Run() { | ||||
| @@ -108,7 +110,7 @@ int TransposeOpenCLKernel::Run() { | |||||
| int arg_idx = 0; | int arg_idx = 0; | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | ||||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | ||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||||
| return mindspore::lite::RET_OK; | return mindspore::lite::RET_OK; | ||||
| } | } | ||||
| @@ -41,7 +41,6 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||||
| void SetGlobalLocal() override; | void SetGlobalLocal() override; | ||||
| private: | private: | ||||
| cl::Kernel kernel_; | |||||
| TransposeType type{TransposeType::AXIS0312}; | TransposeType type{TransposeType::AXIS0312}; | ||||
| }; | }; | ||||
| } // namespace mindspore::kernel | } // namespace mindspore::kernel | ||||
| @@ -16,8 +16,10 @@ | |||||
| #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | ||||
| #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | ||||
| #define MAX_PROFILING_TIME_MILLI_SECOND 10 * 1000 // 10 seconds | |||||
| #include <vector> | #include <vector> | ||||
| #include <set> | |||||
| #include "src/lite_kernel.h" | #include "src/lite_kernel.h" | ||||
| #include "include/errorcode.h" | #include "include/errorcode.h" | ||||
| #include "src/runtime/opencl/opencl_runtime.h" | #include "src/runtime/opencl/opencl_runtime.h" | ||||
| @@ -137,6 +139,16 @@ struct GpuTensorInfo { | |||||
| size_t NDim{}; | size_t NDim{}; | ||||
| }; | }; | ||||
| struct BaseTuningParameter { | |||||
| std::vector<size_t> local_size; | |||||
| friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) { | |||||
| ostrm << "LocalSize:"; | |||||
| for (auto i : a.local_size) { | |||||
| ostrm << i << ","; | |||||
| } | |||||
| return ostrm; | |||||
| } | |||||
| }; | |||||
| class OpenCLKernel : public LiteKernel { | class OpenCLKernel : public LiteKernel { | ||||
| public: | public: | ||||
| OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | ||||
| @@ -158,7 +170,9 @@ class OpenCLKernel : public LiteKernel { | |||||
| for (size_t i = 0; i < local.size(); i++) { | for (size_t i = 0; i < local.size(); i++) { | ||||
| MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i); | MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i); | ||||
| } | } | ||||
| if (local.empty()) { | |||||
| local_range_ = cl::NullRange; | |||||
| } | |||||
| if (global.size() == 1) { | if (global.size() == 1) { | ||||
| global_range_ = cl::NDRange(internal_global_ws.at(0)); | global_range_ = cl::NDRange(internal_global_ws.at(0)); | ||||
| if (!local.empty()) { | if (!local.empty()) { | ||||
| @@ -209,13 +223,135 @@ class OpenCLKernel : public LiteKernel { | |||||
| lite::opencl::MemType GetMemType() { return out_mem_type_; } | lite::opencl::MemType GetMemType() { return out_mem_type_; } | ||||
| void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | ||||
| virtual std::vector<BaseTuningParameter> GenerateTuningParam() { | |||||
| size_t ndim = global_size_.size(); | |||||
| std::vector<BaseTuningParameter> tuning_params = {}; | |||||
| if (ndim == 0) { | |||||
| MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null."; | |||||
| return tuning_params; | |||||
| } | |||||
| BaseTuningParameter default_tuning_param = BaseTuningParameter(); | |||||
| tuning_params.push_back(default_tuning_param); | |||||
| std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize(); | |||||
| size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); | |||||
| size_t MIN_WORKGROUP_SIZE = 8; | |||||
| std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]); | |||||
| std::set<size_t> candidate_y = {1}; | |||||
| std::set<size_t> candidate_z = {1}; | |||||
| if (ndim > 1) { | |||||
| candidate_y = GenerateLocalByGlobal(global_size_[1]); | |||||
| } | |||||
| if (ndim > 2) { | |||||
| candidate_z = GenerateLocalByGlobal(global_size_[2]); | |||||
| } | |||||
| for (auto x : candidate_x) { | |||||
| if (x < max_work_items[0]) { | |||||
| for (auto y : candidate_y) { | |||||
| if (y < max_work_items[1]) { | |||||
| for (auto z : candidate_z) { | |||||
| auto group_size = x * y * z; | |||||
| if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) { | |||||
| BaseTuningParameter tuning_param = BaseTuningParameter(); | |||||
| tuning_param.local_size = {x, y, z}; | |||||
| tuning_params.push_back(tuning_param); | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| } | |||||
| return tuning_params; | |||||
| } | |||||
| virtual int AssignTuningParam(const BaseTuningParameter param) { | |||||
| std::vector<size_t> local_size_tmp = param.local_size; | |||||
| if (local_size_tmp.size() > global_size_.size()) { | |||||
| local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); | |||||
| } | |||||
| AlignGlobalLocal(global_size_, local_size_tmp); | |||||
| return RET_OK; | |||||
| } | |||||
| virtual int Tune() { | |||||
| if (!ocl_runtime_->isProfiling()) { | |||||
| MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling."; | |||||
| return RET_OK; | |||||
| } | |||||
| lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode(); | |||||
| if (mode == lite::opencl::TuningMode::DEFAULT) { | |||||
| return RET_OK; | |||||
| } | |||||
| static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DepthwiseConv2D, | |||||
| schema::PrimitiveType_DeConv2D}; | |||||
| if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) { | |||||
| return RET_OK; | |||||
| } | |||||
| auto tuning_params = GenerateTuningParam(); | |||||
| if (tuning_params.empty()) { | |||||
| MS_LOG(WARNING) << "Tuning param size is 0."; | |||||
| return RET_OK; | |||||
| } | |||||
| int index = -1; | |||||
| double min_time = MAX_PROFILING_TIME_MILLI_SECOND; | |||||
| for (int i = 0; i < tuning_params.size(); i++) { | |||||
| AssignTuningParam(tuning_params[i]); | |||||
| auto ret = Run(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i]; | |||||
| return ret; | |||||
| } | |||||
| double current_time = GetProfilingTimeMs(); | |||||
| MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms"; | |||||
| if (current_time < min_time) { | |||||
| min_time = current_time; | |||||
| index = i; | |||||
| } | |||||
| } | |||||
| if (index != -1) { | |||||
| MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time | |||||
| << "ms"; | |||||
| AssignTuningParam(tuning_params[index]); | |||||
| } else { | |||||
| MS_LOG(WARNING) << "Cannot find suitable param."; | |||||
| } | |||||
| return RET_OK; | |||||
| } | |||||
| double GetProfilingTimeMs() { | |||||
| if (!ocl_runtime_->isProfiling()) { | |||||
| return MAX_PROFILING_TIME_MILLI_SECOND; | |||||
| } | |||||
| cl_ulong time_start; | |||||
| cl_ulong time_end; | |||||
| event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); | |||||
| event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); | |||||
| cl_ulong time_ns = time_end - time_start; | |||||
| return static_cast<double>(time_ns) * 1e-6; | |||||
| } | |||||
| protected: | protected: | ||||
| lite::opencl::OpenCLRuntime *ocl_runtime_; | lite::opencl::OpenCLRuntime *ocl_runtime_; | ||||
| lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; | lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; | ||||
| cl::NDRange global_range_{cl::NullRange}; | cl::NDRange global_range_{cl::NullRange}; | ||||
| cl::NDRange local_range_{cl::NullRange}; | cl::NDRange local_range_{cl::NullRange}; | ||||
| std::vector<size_t> global_size_; // !!!To be deleted | |||||
| std::vector<size_t> local_size_; // !!!To be deleted | |||||
| std::vector<size_t> global_size_; | |||||
| std::vector<size_t> local_size_; | |||||
| cl::Kernel kernel_; | |||||
| cl::Event event_; | |||||
| static std::set<size_t> GenerateLocalByGlobal(size_t global_i) { | |||||
| std::set<size_t> local_ = {}; | |||||
| int index = 1; | |||||
| while (index < global_i) { | |||||
| local_.insert(index); | |||||
| index *= 2; | |||||
| } | |||||
| for (size_t i = 1; i < 16; i++) { | |||||
| if (global_i % i == 0) { | |||||
| local_.insert(i); | |||||
| } | |||||
| } | |||||
| return local_; | |||||
| } | |||||
| private: | private: | ||||
| lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | ||||
| @@ -233,8 +369,8 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input | |||||
| } | } | ||||
| auto ret = kernel->CheckSpecs(); | auto ret = kernel->CheckSpecs(); | ||||
| if (ret != mindspore::lite::RET_OK) { | if (ret != mindspore::lite::RET_OK) { | ||||
| delete kernel; | |||||
| MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; | MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; | ||||
| delete kernel; | |||||
| return nullptr; | return nullptr; | ||||
| } | } | ||||
| return kernel; | return kernel; | ||||
| @@ -228,8 +228,17 @@ int OpenCLSubGraph::Init() { | |||||
| MS_LOG(ERROR) << "OpenCL prepare fail"; | MS_LOG(ERROR) << "OpenCL prepare fail"; | ||||
| return ret; | return ret; | ||||
| } | } | ||||
| MallocTensorWithReuse(); | |||||
| auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_); | |||||
| ocl_runtime_->SetProfiling(true); | |||||
| ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "Run opencl executor failed: " << ret; | |||||
| return ret; | |||||
| } | |||||
| ocl_runtime_->SetProfiling(false); | |||||
| #ifdef Debug | |||||
| ocl_runtime_->SetProfiling(true); | |||||
| #endif | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| @@ -24,6 +24,12 @@ namespace mindspore::lite::opencl { | |||||
| int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | ||||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before, | std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before, | ||||
| const KernelCallBack &after) { | const KernelCallBack &after) { | ||||
| return RunOrTune(inputs, outputs, kernels, allocator, before, after, false); | |||||
| } | |||||
| int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | |||||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | |||||
| const KernelCallBack &before, const KernelCallBack &after, bool is_tune) { | |||||
| int ret; | int ret; | ||||
| kernel::LiteKernelUtil::InitTensorRefCount(kernels); | kernel::LiteKernelUtil::InitTensorRefCount(kernels); | ||||
| for (auto *kernel : kernels) { | for (auto *kernel : kernels) { | ||||
| @@ -57,14 +63,26 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou | |||||
| return ret; | return ret; | ||||
| } | } | ||||
| } | } | ||||
| output->set_allocator(allocator_); | |||||
| } | } | ||||
| if (is_tune) { | |||||
| ret = op_kernel->Tune(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "tuning kernel failed, name: " << kernel->name(); | |||||
| return ret; | |||||
| } | |||||
| } else { | |||||
| ret = kernel->Run(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||||
| return ret; | |||||
| } | |||||
| #ifdef Debug | |||||
| MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str() | |||||
| << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms"; | |||||
| ret = kernel->Run(); | |||||
| if (ret != RET_OK) { | |||||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||||
| return ret; | |||||
| #endif | |||||
| } | } | ||||
| if (after != nullptr) { | if (after != nullptr) { | ||||
| if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) { | if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) { | ||||
| MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name(); | MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name(); | ||||
| @@ -34,6 +34,9 @@ class OpenCLExecutor : public Executor { | |||||
| int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels, | int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels, | ||||
| Allocator *allocator = nullptr, const KernelCallBack &before = nullptr, | Allocator *allocator = nullptr, const KernelCallBack &before = nullptr, | ||||
| const KernelCallBack &after = nullptr) override; | const KernelCallBack &after = nullptr) override; | ||||
| int RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | |||||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr, | |||||
| const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr, bool is_tune = false); | |||||
| protected: | protected: | ||||
| InnerContext *context = nullptr; | InnerContext *context = nullptr; | ||||
| @@ -230,12 +230,7 @@ int OpenCLRuntime::Init() { | |||||
| MS_LOG(INFO) << "Compute Unit: " << compute_units_; | MS_LOG(INFO) << "Compute Unit: " << compute_units_; | ||||
| MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; | MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; | ||||
| #ifdef Debug | |||||
| const cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; | |||||
| #else | |||||
| const cl_command_queue_properties properties = 0; | const cl_command_queue_properties properties = 0; | ||||
| #endif | |||||
| default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); | default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); | ||||
| if (ret != CL_SUCCESS) { | if (ret != CL_SUCCESS) { | ||||
| delete device_; | delete device_; | ||||
| @@ -244,6 +239,16 @@ int OpenCLRuntime::Init() { | |||||
| return RET_ERROR; | return RET_ERROR; | ||||
| } | } | ||||
| const cl_command_queue_properties profiling_properties = CL_QUEUE_PROFILING_ENABLE; | |||||
| profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, profiling_properties, &ret); | |||||
| if (ret != CL_SUCCESS) { | |||||
| delete device_; | |||||
| delete context_; | |||||
| delete default_command_queue_; | |||||
| MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret); | |||||
| return RET_ERROR; | |||||
| } | |||||
| allocator_ = new (std::nothrow) OpenCLAllocator(this); | allocator_ = new (std::nothrow) OpenCLAllocator(this); | ||||
| if (allocator_ == nullptr) { | if (allocator_ == nullptr) { | ||||
| delete device_; | delete device_; | ||||
| @@ -473,15 +478,17 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> | |||||
| } | } | ||||
| // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. | // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. | ||||
| int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | ||||
| cl::CommandQueue *command_queue) { | |||||
| cl::CommandQueue *command_queue, cl::Event *event) { | |||||
| if (command_queue == nullptr) { | if (command_queue == nullptr) { | ||||
| command_queue = default_command_queue_; | |||||
| if (profiling_) { | |||||
| command_queue = profiling_command_queue_; | |||||
| } else { | |||||
| command_queue = default_command_queue_; | |||||
| } | |||||
| } | } | ||||
| MS_ASSERT(local.size() == 0 || local.size() == global.size()); | MS_ASSERT(local.size() == 0 || local.size() == global.size()); | ||||
| cl::Event event; | |||||
| cl_int ret = CL_SUCCESS; | cl_int ret = CL_SUCCESS; | ||||
| ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event); | |||||
| ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event); | |||||
| if (ret != CL_SUCCESS) { | if (ret != CL_SUCCESS) { | ||||
| MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); | MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); | ||||
| return RET_ERROR; | return RET_ERROR; | ||||
| @@ -496,15 +503,9 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global | |||||
| } | } | ||||
| cnt++; | cnt++; | ||||
| MS_LOG(DEBUG) << "RunKernel success!"; | MS_LOG(DEBUG) << "RunKernel success!"; | ||||
| #ifdef Debug | |||||
| event.wait(); | |||||
| cl_ulong time_start; | |||||
| cl_ulong time_end; | |||||
| event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); | |||||
| event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); | |||||
| double nanoSeconds = time_end - time_start; | |||||
| MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms"; | |||||
| #endif | |||||
| if (profiling_) { | |||||
| event->wait(); | |||||
| } | |||||
| return RET_OK; | return RET_OK; | ||||
| } | } | ||||
| // get gpu divce type | // get gpu divce type | ||||
| @@ -32,6 +32,7 @@ j* you may not use this file except in compliance with the License. | |||||
| namespace mindspore::lite::opencl { | namespace mindspore::lite::opencl { | ||||
| enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; | enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; | ||||
| enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 }; | |||||
| struct GpuInfo { | struct GpuInfo { | ||||
| GpuType type = OTHER; | GpuType type = OTHER; | ||||
| @@ -117,7 +118,7 @@ class OpenCLRuntime { | |||||
| int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, | int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, | ||||
| cl::CommandQueue *command_queue = nullptr); // !!!To be deleted | cl::CommandQueue *command_queue = nullptr); // !!!To be deleted | ||||
| int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | ||||
| cl::CommandQueue *command_queue = nullptr); | |||||
| cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); | |||||
| bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | ||||
| bool sync = false) const; | bool sync = false) const; | ||||
| bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | ||||
| @@ -139,10 +140,14 @@ class OpenCLRuntime { | |||||
| * @return max_work_group_size | * @return max_work_group_size | ||||
| */ | */ | ||||
| int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); | int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); | ||||
| void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; } | |||||
| TuningMode GetTuningMode() const { return tuning_mode_; } | |||||
| void InitGpuCache(); | void InitGpuCache(); | ||||
| int LoadCache(const void *buf); | int LoadCache(const void *buf); | ||||
| void StoreCache(); | void StoreCache(); | ||||
| bool isProfiling() const { return profiling_; } | |||||
| void SetProfiling(bool profiling) { profiling_ = profiling; } | |||||
| private: | private: | ||||
| static OpenCLRuntime *GetInstance(); | static OpenCLRuntime *GetInstance(); | ||||
| @@ -158,6 +163,7 @@ class OpenCLRuntime { | |||||
| static size_t instance_count_; | static size_t instance_count_; | ||||
| static OpenCLRuntime *ocl_runtime_instance_; | static OpenCLRuntime *ocl_runtime_instance_; | ||||
| cl::CommandQueue *default_command_queue_{nullptr}; | cl::CommandQueue *default_command_queue_{nullptr}; | ||||
| cl::CommandQueue *profiling_command_queue_{nullptr}; | |||||
| cl::Context *context_{nullptr}; | cl::Context *context_{nullptr}; | ||||
| cl::Device *device_{nullptr}; | cl::Device *device_{nullptr}; | ||||
| OpenCLAllocator *allocator_{nullptr}; | OpenCLAllocator *allocator_{nullptr}; | ||||
| @@ -181,6 +187,8 @@ class OpenCLRuntime { | |||||
| const std::string version_{"V0.1"}; | const std::string version_{"V0.1"}; | ||||
| bool need_write_{false}; | bool need_write_{false}; | ||||
| bool enable_cache_{false}; | bool enable_cache_{false}; | ||||
| TuningMode tuning_mode_{TuningMode::DEFAULT}; | |||||
| bool profiling_{false}; | |||||
| }; | }; | ||||
| class OpenCLRuntimeWrapper { | class OpenCLRuntimeWrapper { | ||||