| @@ -92,8 +92,9 @@ void ActivationOpenCLKernel::SetConstArgs() { | |||
| } | |||
| void ActivationOpenCLKernel::SetGlobalLocal() { | |||
| local_range_ = cl::NullRange; | |||
| global_range_ = {outShape.width, outShape.height}; | |||
| local_size_ = {}; | |||
| global_size_ = {outShape.width, outShape.height}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ActivationOpenCLKernel::Run() { | |||
| @@ -101,7 +102,7 @@ int ActivationOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| auto ret = ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Run kernel:" << this->name() << " fail."; | |||
| return RET_ERROR; | |||
| @@ -42,7 +42,6 @@ class ActivationOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| static std::string GetActTypeString(int act_type); | |||
| cl::Kernel kernel_; | |||
| int type_; | |||
| float alpha_; | |||
| GpuTensorInfo outShape = GpuTensorInfo(nullptr); | |||
| @@ -115,9 +115,9 @@ void ArgMinMaxOpenCLKernel::SetGlobalLocal() { | |||
| default: // 3 | |||
| break; | |||
| } | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| std::vector<size_t> global = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; | |||
| global_size_ = {static_cast<size_t>(strides_.s[0]), static_cast<size_t>(src_size_.s[1]), 1}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ArgMinMaxOpenCLKernel::InitWeights() { | |||
| @@ -153,7 +153,7 @@ int ArgMinMaxOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::BUF); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -38,9 +38,9 @@ class ArgMinMaxOpenCLKernel : public OpenCLKernel { | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int InitWeights() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| private: | |||
| cl::Kernel kernel_; | |||
| void *buff_{nullptr}; | |||
| void *ids_{nullptr}; | |||
| GpuTensorInfo im_in_{GpuTensorInfo(nullptr)}; | |||
| @@ -124,23 +124,24 @@ int ArithmeticOpenCLKernel::CheckSpecs() { | |||
| void ArithmeticOpenCLKernel::SetGlobalLocal() { | |||
| if (element_flag_) { | |||
| local_range_ = {}; | |||
| local_size_ = {}; | |||
| auto out_shape = out_tensors_[0]->shape(); | |||
| if (out_shape.size() == 2) { | |||
| size_t H = out_shape[0]; | |||
| size_t W = UP_DIV(out_shape[1], C4NUM); | |||
| global_range_ = {W, H}; | |||
| global_size_ = {W, H}; | |||
| } else { | |||
| size_t H = out_shape[0] * out_shape[1]; | |||
| size_t W = out_shape[2] * UP_DIV(out_shape[3], C4NUM); | |||
| global_range_ = {W, H}; | |||
| global_size_ = {W, H}; | |||
| } | |||
| } else { | |||
| local_range_ = {}; | |||
| local_size_ = {}; | |||
| auto out_shape = GetNHWCShape(out_tensors_[0]->shape()); | |||
| global_range_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||
| global_size_ = {static_cast<size_t>(UP_DIV(out_shape[3], C4NUM)), static_cast<size_t>(out_shape[2]), | |||
| static_cast<size_t>(out_shape[1] * out_shape[0])}; | |||
| } | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ArithmeticOpenCLKernel::InitWeights() { | |||
| @@ -269,7 +270,7 @@ int ArithmeticOpenCLKernel::Run() { | |||
| auto input_1_ptr = inputs_weight_ptrs_[1] == nullptr ? in_tensors_[1]->data_c() : inputs_weight_ptrs_[1]; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, input_1_ptr); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -39,7 +39,6 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| bool element_flag_{true}; | |||
| float activation_min_{-FLT_MAX}; | |||
| float activation_max_{FLT_MAX}; | |||
| @@ -131,10 +131,10 @@ void ArithmeticSelfOpenCLKernel::SetGlobalLocal() { | |||
| OC = UP_DIV(output_shape[1], C4NUM); | |||
| } | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| ArithmeticSelfGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; // init local | |||
| global_size_ = {OH, OW, OC}; | |||
| ArithmeticSelfGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ArithmeticSelfOpenCLKernel::Prepare() { | |||
| @@ -159,7 +159,7 @@ int ArithmeticSelfOpenCLKernel::Run() { | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -43,7 +43,6 @@ class ArithmeticSelfOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| void GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param); | |||
| cl_int4 output_shape_ = {}; | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -75,9 +75,9 @@ void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() { | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||
| std::vector<int> out_shape = out_tensors_[0]->shape(); | |||
| cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]}; | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; | |||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int BatchToSpaceNDOpenCLKernel::Prepare() { | |||
| @@ -103,7 +103,7 @@ int BatchToSpaceNDOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -37,9 +37,9 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel { | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| private: | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -62,10 +62,10 @@ void BatchNormOpenCLKernel::SetGlobalLocal() { | |||
| uint32_t OC = UP_DIV(output_shape[3], C4NUM); | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| BatchNormGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; // init local | |||
| global_size_ = {OH, OW, OC}; | |||
| BatchNormGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int BatchNormOpenCLKernel::Prepare() { | |||
| @@ -91,7 +91,7 @@ int BatchNormOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[3]->data_c()); // mean | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[4]->data_c()); // variance | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -40,7 +40,6 @@ class BiasAddOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| cl_int4 GetGlobalshape(); | |||
| cl::Kernel kernel_; | |||
| void *BiasAdd_{nullptr}; | |||
| int in_size_{}; | |||
| int out_size_{}; | |||
| @@ -73,10 +73,10 @@ void CastOpenCLKernel::SetGlobalLocal() { | |||
| uint32_t OC = UP_DIV(input_shape[3], C4NUM); | |||
| const std::vector<size_t> &max_global = ocl_runtime_->GetWorkItemSize(); | |||
| std::vector<size_t> local = {1, 1, 1}; // init local | |||
| std::vector<size_t> global = {OH, OW, OC}; | |||
| CastGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; // init local | |||
| global_size_ = {OH, OW, OC}; | |||
| CastGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int CastOpenCLKernel::Prepare() { | |||
| @@ -100,7 +100,7 @@ int CastOpenCLKernel::Run() { | |||
| int arg_cn = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c()); // input tensor | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c()); // out tensor | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -41,8 +41,6 @@ class CastOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int GetKernelName(std::string *kernel_name, CastParameter *param); | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -132,17 +132,17 @@ void ConcatOpenCLKernel::SetGlobalLocal() { | |||
| if (axis_ == 3 && !Align_) { | |||
| OH = out_shape_.s[0] * out_shape_.s[1]; | |||
| OW = out_shape_.s[2]; | |||
| global = {OH, OW, 1}; | |||
| local = {1, 1, 1}; | |||
| global_size_ = {OH, OW, 1}; | |||
| local_size_ = {1, 1, 1}; | |||
| } else { | |||
| OH = out_shape_.s[0] * out_shape_.s[1]; | |||
| OW = out_shape_.s[2]; | |||
| OC = out_shape_.s[3]; | |||
| global = {OH, OW, OC}; | |||
| local = {1, 1, 1}; | |||
| global_size_ = {OH, OW, OC}; | |||
| local_size_ = {1, 1, 1}; | |||
| } | |||
| ConcatGetWorkGroup(global, &local, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| ConcatGetWorkGroup(global_size_, &local_size_, max_global[0]); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ConcatOpenCLKernel::Prepare() { | |||
| @@ -196,7 +196,7 @@ int ConcatOpenCLKernel::Run() { | |||
| MS_LOG(ERROR) << "unsupported input size :" << in_tensors_.size(); | |||
| return RET_ERROR; | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -53,7 +53,6 @@ class ConcatOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int RunAxis0(); | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -93,7 +93,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||
| std::string program_name = "winograd"; | |||
| ocl_runtime_->LoadSource(program_name, winograd_source); | |||
| ocl_runtime_->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36"); | |||
| ocl_runtime_->BuildKernel(kernel_conv_, program_name, "WinogradConvolution"); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, "WinogradConvolution"); | |||
| ocl_runtime_->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4"); | |||
| } else { | |||
| SetBlockSize(); | |||
| @@ -101,7 +101,7 @@ int Conv2DOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "Conv2D_H" + std::to_string(block_size_.H) + "W" + std::to_string(block_size_.W) + "C" + | |||
| std::to_string(block_size_.C); | |||
| ocl_runtime_->LoadSource(program_name, conv2d_source); | |||
| ocl_runtime_->BuildKernel(kernel_conv_, program_name, kernel_name); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name); | |||
| } | |||
| // allocate winograd memory | |||
| @@ -329,7 +329,9 @@ void Conv2DOpenCLKernel::SetGlobalLocal() { | |||
| local_h = std::min(global_h, local_hw); | |||
| local_w = std::min(local_hw / local_h, global_w); | |||
| } | |||
| AlignGlobalLocal({global_h, global_w, global_c}, {local_h, local_w, local_c}); | |||
| global_size_ = {global_h, global_w, global_c}; | |||
| local_size_ = {local_h, local_w, local_c}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| } | |||
| @@ -355,11 +357,11 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||
| arg_cn = 0; | |||
| cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_}; | |||
| cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_}; | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, conv_out_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem0_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, winograd_mem1_); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, conv_in_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, conv_out_shape); | |||
| arg_cn = 2; | |||
| cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_}; | |||
| @@ -373,30 +375,37 @@ void Conv2DOpenCLKernel::SetConstArgs() { | |||
| cl_int4 kernel_stride = {KH_, KW_, param->stride_h_, param->stride_w_}; | |||
| cl_int4 pad = {param->pad_u_, param->pad_d_, param->pad_l_, param->pad_r_}; | |||
| cl_int2 dilation = {param->dilation_h_, param->dilation_w_}; | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, input_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, output_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, kernel_stride); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, pad); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn++, dilation); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, arg_cn, act_type); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, input_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, output_shape); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, kernel_stride); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dilation); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, act_type); | |||
| } | |||
| } | |||
| int Conv2DOpenCLKernel::Tune() { | |||
| if (use_winograd_) { | |||
| return RET_OK; | |||
| } | |||
| return OpenCLKernel::Tune(); | |||
| } | |||
| int Conv2DOpenCLKernel::Run() { | |||
| if (use_winograd_) { | |||
| ocl_runtime_->SetKernelArg(kernel_4x4to36_, 0, in_tensors_.front()->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_); | |||
| ocl_runtime_->RunKernel(kernel_conv_, global_conv_, local_conv_); | |||
| ocl_runtime_->RunKernel(kernel_, global_conv_, local_conv_); | |||
| ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_); | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, 0, in_tensors_.front()->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_conv_, 1, out_tensors_.front()->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_conv_, global_range_, local_range_); | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -42,6 +42,7 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||
| void SetConstArgs() override; | |||
| int Run() override; | |||
| int Tune() override; | |||
| private: | |||
| void SetBlockSize(); | |||
| @@ -60,7 +61,6 @@ class Conv2DOpenCLKernel : public OpenCLKernel { | |||
| } | |||
| cl::Kernel kernel_4x4to36_; | |||
| cl::Kernel kernel_conv_; | |||
| cl::Kernel kernel_36to4x4_; | |||
| cl::NDRange global_4x4to36_, local_4x4to36_; | |||
| cl::NDRange global_conv_, local_conv_; | |||
| @@ -193,7 +193,7 @@ int Conv2dTransposeOpenCLKernel::Run() { | |||
| int arg_cnt = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -40,7 +40,6 @@ class Conv2dTransposeOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| void *bias_{nullptr}; | |||
| bool enable_fp16_{false}; | |||
| @@ -175,24 +175,23 @@ void DepthwiseConv2dOpenCLKernel::SetConstArgs() { | |||
| void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { | |||
| // set global | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); | |||
| std::vector<size_t> global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||
| global_size_ = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||
| // set local | |||
| const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | |||
| int z = global_size[0]; | |||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); | |||
| int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z))); | |||
| std::vector<size_t> local_size = | |||
| std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||
| int z = global_size_[0]; | |||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size_[2], 8)); | |||
| int x = std::max(1, std::min(static_cast<int>(global_size_[1]), max_group_size / (y * z))); | |||
| local_size_ = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||
| OpenCLKernel::AlignGlobalLocal(global_size, local_size); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -42,7 +42,6 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| void *packed_weight_{nullptr}; | |||
| void *bias_data_{nullptr}; | |||
| cl::Kernel kernel_; | |||
| std::vector<int> block_size_{2, 2, 1}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -43,9 +43,6 @@ class FillOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int RunFill(); | |||
| int RunShape(); | |||
| cl::Kernel kernel_; | |||
| private: | |||
| float default_{0.0f}; | |||
| }; | |||
| @@ -179,9 +179,9 @@ int FullConnectionOpenCLKernel::InitWeights() { | |||
| } | |||
| void FullConnectionOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> local = {32, 4, 1}; | |||
| std::vector<size_t> global = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; | |||
| AlignGlobalLocal(global, local); | |||
| local_size_ = {32, 4, 1}; | |||
| global_size_ = {UP_DIV(outShape.C, C4NUM), 4, outShape.N}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| void FullConnectionOpenCLKernel::SetConstArgs() { | |||
| @@ -202,7 +202,7 @@ int FullConnectionOpenCLKernel::Run() { | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -38,9 +38,9 @@ class FullConnectionOpenCLKernel : public OpenCLKernel { | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Init() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| private: | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| void *bias_{nullptr}; | |||
| bool enable_fp16_{false}; | |||
| @@ -93,9 +93,9 @@ void GatherOpenCLKernel::SetConstArgs() { | |||
| void GatherOpenCLKernel::SetGlobalLocal() { | |||
| auto output = GpuTensorInfo(out_tensors_.front()); | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| std::vector<size_t> global = {output.W, output.N * output.H, output.Slice}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; | |||
| global_size_ = {output.W, output.N * output.H, output.Slice}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int GatherOpenCLKernel::Prepare() { | |||
| @@ -155,7 +155,7 @@ int GatherOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_.front()->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_.front()->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -38,12 +38,12 @@ class GatherOpenCLKernel : public OpenCLKernel { | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| protected: | |||
| int UpdateWeights(); | |||
| private: | |||
| cl::Kernel kernel_; | |||
| int32_t *indices_data_{nullptr}; | |||
| int axis_ = {0}; | |||
| }; | |||
| @@ -1,50 +0,0 @@ | |||
| /** | |||
| * Copyright 2019 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_HSWISH_H_ | |||
| #include <vector> | |||
| #include "mindspore/lite/nnacl/fp32/activation_fp32.h" | |||
| #include "src/runtime/kernel/opencl/opencl_kernel.h" | |||
| namespace mindspore::kernel { | |||
| class HswishOpenCLKernel : public OpenCLKernel { | |||
| public: | |||
| HswishOpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs) | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~HswishOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| private: | |||
| int InferShapeTo4D(); | |||
| cl::Kernel kernel_; | |||
| private: | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| size_t W_{1}; | |||
| size_t C_{1}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -137,11 +137,11 @@ int MatMulOpenCLKernel::InitWeights() { | |||
| void MatMulOpenCLKernel::SetGlobalLocal() { | |||
| // local size should less than MAX_GROUP_SIZE | |||
| std::vector<size_t> local = {32, 4, 1}; | |||
| std::vector<size_t> global = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| AlignGlobalLocal(global, local); | |||
| local_size_ = {32, 4, 1}; | |||
| global_size_ = {UP_DIV(static_cast<size_t>(outShape[3]), C4NUM), | |||
| 4 * static_cast<size_t>(outShape[0]) * static_cast<size_t>(outShape[1]), | |||
| static_cast<size_t>(outShape[2])}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| void MatMulOpenCLKernel::SetConstArgs() { | |||
| @@ -158,7 +158,7 @@ int MatMulOpenCLKernel::Run() { | |||
| int arg_count = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_count++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -37,9 +37,9 @@ class MatMulOpenCLKernel : public OpenCLKernel { | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override { return lite::RET_OK; } | |||
| private: | |||
| cl::Kernel kernel_; | |||
| void *padWeight_{nullptr}; | |||
| bool enable_fp16_{false}; | |||
| bool transposeA{false}; | |||
| @@ -85,14 +85,16 @@ void OneHotOpenCLKernel::SetConstArgs() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx, static_cast<int>(out_shape_.C)); | |||
| } | |||
| void OneHotOpenCLKernel::SetGlobalLocal() { | |||
| global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||
| local_size_ = {}; | |||
| global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int OneHotOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -39,7 +39,6 @@ class OneHotOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| int depth_{0}; | |||
| float on_value_{1.0f}; | |||
| float off_value_{0.0f}; | |||
| @@ -97,14 +97,15 @@ void PadOpenCLKernel::SetConstArgs() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, io_slices); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, pad_before); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn, static_cast<cl_float>(param_->constant_value_)); | |||
| AlignGlobalLocal({output.N * output.H, output.W, output.Slice}, {8, 4, 1}); | |||
| local_size_ = {8, 4, 1}; | |||
| global_size_ = {output.N * output.H, output.W, output.Slice}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int PadOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -41,7 +41,6 @@ class PadOpenCLKernel : public OpenCLKernel { | |||
| int Run() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| PadParameter *param_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -86,8 +86,9 @@ void PoolingOpenCLKernel::SetGlobalLocal() { | |||
| const size_t global_x = out_tensors_[0]->shape()[1]; | |||
| const size_t global_y = out_tensors_[0]->shape()[2]; | |||
| const size_t global_z = UP_DIV(out_tensors_[0]->shape()[3], C4NUM); | |||
| global_range_ = {global_z, global_y, global_x}; | |||
| local_range_ = {}; | |||
| global_size_ = {global_z, global_y, global_x}; | |||
| local_size_ = {}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| void PoolingOpenCLKernel::SetConstArgs() { | |||
| @@ -111,7 +112,7 @@ int PoolingOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -39,7 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| PoolingParameter *parameter_; | |||
| cl::Kernel kernel_; | |||
| std::vector<size_t> local_size_; | |||
| std::vector<size_t> global_size_; | |||
| }; | |||
| @@ -37,7 +37,6 @@ class PowerOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int InferShapeTo4D(); | |||
| cl::Kernel kernel_; | |||
| private: | |||
| size_t N_{1}; | |||
| @@ -37,7 +37,6 @@ class PReluOpenCLKernel : public OpenCLKernel { | |||
| int InitWeights() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| bool enable_fp16_{false}; | |||
| int batch_size_{}; | |||
| int C_{}; | |||
| @@ -148,15 +148,22 @@ void ReduceOpenCLKernel::SetGlobalLocal() { | |||
| int h = shapex[1]; | |||
| int c = shapex[3]; | |||
| int c4 = UP_DIV(c, C4NUM); | |||
| std::vector<size_t> local = {}; | |||
| local_size_ = {}; | |||
| if (use_local_) { | |||
| local = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; | |||
| local_size_ = {1, LOCAL_CACHE_THREAD, LOCAL_CACHE_THREAD}; | |||
| } | |||
| std::vector<size_t> global = {static_cast<size_t>(c4), 1, 1}; | |||
| global_size_ = {static_cast<size_t>(c4), 1, 1}; | |||
| if (wc_reduce_) { | |||
| global = {static_cast<size_t>(h), 1, 1}; | |||
| global_size_ = {static_cast<size_t>(h), 1, 1}; | |||
| } | |||
| AlignGlobalLocal(global, local); | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ReduceOpenCLKernel::Tune() { | |||
| if (use_local_) { | |||
| return RET_OK; | |||
| } | |||
| return OpenCLKernel::Tune(); | |||
| } | |||
| int ReduceOpenCLKernel::Run() { | |||
| @@ -164,7 +171,7 @@ int ReduceOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -36,11 +36,11 @@ class ReduceOpenCLKernel : public OpenCLKernel { | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override; | |||
| private: | |||
| cl_float4 GenC4Mask(); | |||
| static std::string GetReduceTypeStr(int type); | |||
| cl::Kernel kernel_; | |||
| GpuTensorInfo outShape = GpuTensorInfo(nullptr); | |||
| bool use_local_{false}; | |||
| bool wc_reduce_{false}; | |||
| @@ -55,9 +55,9 @@ void ReshapeOpenCLKernel::SetConstArgs() { | |||
| void ReshapeOpenCLKernel::SetGlobalLocal() { | |||
| auto out = GpuTensorInfo(out_tensors_.front()); | |||
| std::vector<size_t> local = {}; | |||
| std::vector<size_t> global{out.width, out.height}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {}; | |||
| global_size_ = {out.width, out.height}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ReshapeOpenCLKernel::Prepare() { | |||
| @@ -81,7 +81,7 @@ int ReshapeOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -37,7 +37,6 @@ class ReshapeOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -100,9 +100,10 @@ void ResizeOpenCLKernel::SetConstArgs() { | |||
| } | |||
| void ResizeOpenCLKernel::SetGlobalLocal() { | |||
| local_range_ = {}; | |||
| local_size_ = {}; | |||
| auto out_shape = GpuTensorInfo(out_tensors_[0]); | |||
| global_range_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||
| global_size_ = {out_shape.Slice, out_shape.W, out_shape.H}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ResizeOpenCLKernel::Run() { | |||
| @@ -110,7 +111,7 @@ int ResizeOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -40,7 +40,6 @@ class ResizeOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| float getResizeScaleFactor(int input_size, int output_size); | |||
| cl::Kernel kernel_; | |||
| bool alignCorner{false}; | |||
| bool preserveAspectRatio{false}; | |||
| }; | |||
| @@ -37,7 +37,6 @@ class ScaleOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| void Image2dGetWorkGroupSize(); | |||
| cl::Kernel kernel_; | |||
| bool weight_vector_flag_{true}; | |||
| bool broadcast_flag_{false}; | |||
| bool broadcast_H_flag_{false}; | |||
| @@ -116,6 +116,13 @@ void SoftmaxOpenCLKernel::SetGlobalLocal() { | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int SoftmaxOpenCLKernel::Tune() { | |||
| if (onexone_flag_) { | |||
| return RET_OK; | |||
| } | |||
| return OpenCLKernel::Tune(); | |||
| } | |||
| void SoftmaxOpenCLKernel::SetConstArgs() { | |||
| int arg_idx = 2; | |||
| int channel = out_shape.C; | |||
| @@ -133,8 +140,7 @@ int SoftmaxOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| // run opengl kernel | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return lite::RET_OK; | |||
| } | |||
| @@ -38,6 +38,7 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| int Tune() override; | |||
| private: | |||
| int InitGlobalSize(); | |||
| @@ -45,7 +46,6 @@ class SoftmaxOpenCLKernel : public OpenCLKernel { | |||
| int SetWorkGroupSize(); | |||
| std::vector<float> GetMaskForLastChannel(int channels); | |||
| cl::Kernel kernel_; | |||
| SoftmaxParameter *parameter_; | |||
| bool onexone_flag_{false}; | |||
| std::vector<size_t> local_size_; | |||
| @@ -80,9 +80,9 @@ void SpaceToBatchNDOpenCLKernel::SetConstArgs() { | |||
| void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() { | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||
| cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()}; | |||
| std::vector<size_t> local = {1, 1, 1}; | |||
| std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| local_size_ = {1, 1, 1}; | |||
| global_size_ = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int SpaceToBatchNDOpenCLKernel::Prepare() { | |||
| @@ -109,7 +109,7 @@ int SpaceToBatchNDOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -39,7 +39,6 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| #endif | |||
| @@ -69,7 +69,9 @@ void SpaceToDepthOpenCLKernel::SetConstArgs() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, ci_size); | |||
| } | |||
| void SpaceToDepthOpenCLKernel::SetGlobalLocal() { | |||
| global_range_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||
| local_size_ = {}; | |||
| global_size_ = {out_shape_.Slice, out_shape_.W, out_shape_.H * out_shape_.N}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int SpaceToDepthOpenCLKernel::Run() { | |||
| @@ -77,7 +79,7 @@ int SpaceToDepthOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -38,7 +38,6 @@ class SpaceToDepthOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| GpuTensorInfo in_shape_ = GpuTensorInfo(nullptr); | |||
| GpuTensorInfo out_shape_ = GpuTensorInfo(nullptr); | |||
| }; | |||
| @@ -136,11 +136,11 @@ void SparseToDenseOpenCLKernel::SetConstArgs() { | |||
| } | |||
| void SparseToDenseOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> local = {1, 1}; | |||
| local_size_ = {1, 1}; | |||
| size_t OH = n_ * h_; | |||
| size_t OW = w_ * UP_DIV(c_, C4NUM); | |||
| std::vector<size_t> global = {OH, OW}; | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| global_size_ = {OH, OW}; | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int SparseToDenseOpenCLKernel::Prepare() { | |||
| @@ -209,7 +209,7 @@ int SparseToDenseOpenCLKernel::Run() { | |||
| } else { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cn++, weight_scalar_); | |||
| } | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -43,7 +43,6 @@ class SparseToDenseOpenCLKernel : public OpenCLKernel { | |||
| int InitOutputToDefault(); | |||
| private: | |||
| cl::Kernel kernel_; | |||
| // bool IndicesIsScalar{false}; | |||
| bool enable_fp16_{false}; | |||
| float default_{0.0f}; | |||
| @@ -44,7 +44,6 @@ class StackOpenCLKernel : public OpenCLKernel { | |||
| int InferOutTensorShapeTo4D(cl_int4 *output_shape); | |||
| cl::Kernel kernel_; | |||
| int axis_{0}; | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| @@ -164,24 +164,24 @@ void StridedSliceOpenCLKernel::SetConstArgs() { | |||
| void StridedSliceOpenCLKernel::SetGlobalLocal() { | |||
| auto output_info = GpuTensorInfo(out_tensors_.front()); | |||
| std::vector<size_t> global = {output_info.N * output_info.H, output_info.W, output_info.Slice}; | |||
| global_size_ = {output_info.N * output_info.H, output_info.W, output_info.Slice}; | |||
| const int max_divider = 8; | |||
| auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | |||
| size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider); | |||
| size_t local_c = GetMaxDivisorStrategy0(global_size_[2], max_divider); | |||
| local_c = std::max<size_t>(local_c, 1); | |||
| size_t local_hw = max_work_group_size / local_c; | |||
| size_t local_h = std::min(UP_DIV(global[0], 2), local_hw); | |||
| size_t local_w = std::min(local_hw / local_h, global[1]); | |||
| std::vector<size_t> local = {local_h, local_w, local_c}; | |||
| AlignGlobalLocal(global, local); | |||
| size_t local_h = std::min(UP_DIV(global_size_[0], 2), local_hw); | |||
| size_t local_w = std::min(local_hw / local_h, global_size_[1]); | |||
| local_size_ = {local_h, local_w, local_c}; | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int StridedSliceOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -42,7 +42,6 @@ class StridedSliceOpenCLKernel : public OpenCLKernel { | |||
| private: | |||
| int InitConstArgs(); | |||
| cl::Kernel kernel_; | |||
| cl_int4 input_shape_{}; | |||
| cl_int4 output_shape_{}; | |||
| cl_int2 io_slices_{}; | |||
| @@ -51,13 +51,13 @@ void ToFormatOpenCLKernel::SetConstArgs() { | |||
| } | |||
| void ToFormatOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||
| std::vector<size_t> local = {8, 16, 3}; | |||
| global_size_ = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||
| local_size_ = {8, 16, 3}; | |||
| size_t max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | |||
| if (max_work_group_size < 384) { | |||
| local[2] = 1; | |||
| local_size_[2] = 1; | |||
| } | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| OpenCLKernel::AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int ToFormatOpenCLKernel::Prepare() { | |||
| @@ -97,7 +97,7 @@ int ToFormatOpenCLKernel::Run() { | |||
| auto dst_mem_type = out_mem_type_; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_.front()->data_c(), src_mem_type); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_.front()->data_c(), dst_mem_type); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return RET_OK; | |||
| } | |||
| @@ -38,7 +38,6 @@ class ToFormatOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| size_t N_{1}; | |||
| size_t H_{1}; | |||
| size_t W_{1}; | |||
| @@ -96,11 +96,13 @@ void TransposeOpenCLKernel::SetGlobalLocal() { | |||
| size_t w = shapex[2]; | |||
| size_t c = shapex[3]; | |||
| size_t c4 = UP_DIV(c, 4); | |||
| local_size_ = {}; | |||
| if (type == TransposeType::AXIS0312) { // NHWC -> NCHW | |||
| global_range_ = {UP_DIV(h, C4NUM), w, c4}; | |||
| global_size_ = {UP_DIV(h, C4NUM), w, c4}; | |||
| } else if (type == TransposeType::AXIS0231) { // NCHW -> NHWC | |||
| global_range_ = {h, UP_DIV(w, C4NUM), c4}; | |||
| global_size_ = {h, UP_DIV(w, C4NUM), c4}; | |||
| } | |||
| AlignGlobalLocal(global_size_, local_size_); | |||
| } | |||
| int TransposeOpenCLKernel::Run() { | |||
| @@ -108,7 +110,7 @@ int TransposeOpenCLKernel::Run() { | |||
| int arg_idx = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -41,7 +41,6 @@ class TransposeOpenCLKernel : public OpenCLKernel { | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| TransposeType type{TransposeType::AXIS0312}; | |||
| }; | |||
| } // namespace mindspore::kernel | |||
| @@ -16,8 +16,10 @@ | |||
| #ifndef MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | |||
| #define MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | |||
| #define MAX_PROFILING_TIME_MILLI_SECOND 10 * 1000 // 10 seconds | |||
| #include <vector> | |||
| #include <set> | |||
| #include "src/lite_kernel.h" | |||
| #include "include/errorcode.h" | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| @@ -137,6 +139,16 @@ struct GpuTensorInfo { | |||
| size_t NDim{}; | |||
| }; | |||
| struct BaseTuningParameter { | |||
| std::vector<size_t> local_size; | |||
| friend std::ostream &operator<<(std::ostream &ostrm, const BaseTuningParameter &a) { | |||
| ostrm << "LocalSize:"; | |||
| for (auto i : a.local_size) { | |||
| ostrm << i << ","; | |||
| } | |||
| return ostrm; | |||
| } | |||
| }; | |||
| class OpenCLKernel : public LiteKernel { | |||
| public: | |||
| OpenCLKernel(OpParameter *parameter, const std::vector<lite::Tensor *> &inputs, | |||
| @@ -158,7 +170,9 @@ class OpenCLKernel : public LiteKernel { | |||
| for (size_t i = 0; i < local.size(); i++) { | |||
| MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i); | |||
| } | |||
| if (local.empty()) { | |||
| local_range_ = cl::NullRange; | |||
| } | |||
| if (global.size() == 1) { | |||
| global_range_ = cl::NDRange(internal_global_ws.at(0)); | |||
| if (!local.empty()) { | |||
| @@ -209,13 +223,135 @@ class OpenCLKernel : public LiteKernel { | |||
| lite::opencl::MemType GetMemType() { return out_mem_type_; } | |||
| void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | |||
| virtual std::vector<BaseTuningParameter> GenerateTuningParam() { | |||
| size_t ndim = global_size_.size(); | |||
| std::vector<BaseTuningParameter> tuning_params = {}; | |||
| if (ndim == 0) { | |||
| MS_LOG(ERROR) << "Generate tuning param failed, global_size_ is null."; | |||
| return tuning_params; | |||
| } | |||
| BaseTuningParameter default_tuning_param = BaseTuningParameter(); | |||
| tuning_params.push_back(default_tuning_param); | |||
| std::vector<size_t> max_work_items = ocl_runtime_->GetWorkItemSize(); | |||
| size_t max_workgroup_size = ocl_runtime_->GetMaxWorkGroupSize(kernel_); | |||
| size_t MIN_WORKGROUP_SIZE = 8; | |||
| std::set<size_t> candidate_x = GenerateLocalByGlobal(global_size_[0]); | |||
| std::set<size_t> candidate_y = {1}; | |||
| std::set<size_t> candidate_z = {1}; | |||
| if (ndim > 1) { | |||
| candidate_y = GenerateLocalByGlobal(global_size_[1]); | |||
| } | |||
| if (ndim > 2) { | |||
| candidate_z = GenerateLocalByGlobal(global_size_[2]); | |||
| } | |||
| for (auto x : candidate_x) { | |||
| if (x < max_work_items[0]) { | |||
| for (auto y : candidate_y) { | |||
| if (y < max_work_items[1]) { | |||
| for (auto z : candidate_z) { | |||
| auto group_size = x * y * z; | |||
| if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) { | |||
| BaseTuningParameter tuning_param = BaseTuningParameter(); | |||
| tuning_param.local_size = {x, y, z}; | |||
| tuning_params.push_back(tuning_param); | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| } | |||
| return tuning_params; | |||
| } | |||
| virtual int AssignTuningParam(const BaseTuningParameter param) { | |||
| std::vector<size_t> local_size_tmp = param.local_size; | |||
| if (local_size_tmp.size() > global_size_.size()) { | |||
| local_size_tmp = std::vector<size_t>(local_size_tmp.begin(), local_size_tmp.begin() + global_size_.size()); | |||
| } | |||
| AlignGlobalLocal(global_size_, local_size_tmp); | |||
| return RET_OK; | |||
| } | |||
| virtual int Tune() { | |||
| if (!ocl_runtime_->isProfiling()) { | |||
| MS_LOG(WARNING) << "Tuning mode require opencl runtime profiling."; | |||
| return RET_OK; | |||
| } | |||
| lite::opencl::TuningMode mode = ocl_runtime_->GetTuningMode(); | |||
| if (mode == lite::opencl::TuningMode::DEFAULT) { | |||
| return RET_OK; | |||
| } | |||
| static const std::set<int> FAST_MODE_OPS = {schema::PrimitiveType_Conv2D, schema::PrimitiveType_DepthwiseConv2D, | |||
| schema::PrimitiveType_DeConv2D}; | |||
| if (mode == lite::opencl::TuningMode::FAST && FAST_MODE_OPS.find(op_parameter_->type_) == FAST_MODE_OPS.end()) { | |||
| return RET_OK; | |||
| } | |||
| auto tuning_params = GenerateTuningParam(); | |||
| if (tuning_params.empty()) { | |||
| MS_LOG(WARNING) << "Tuning param size is 0."; | |||
| return RET_OK; | |||
| } | |||
| int index = -1; | |||
| double min_time = MAX_PROFILING_TIME_MILLI_SECOND; | |||
| for (int i = 0; i < tuning_params.size(); i++) { | |||
| AssignTuningParam(tuning_params[i]); | |||
| auto ret = Run(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Tuning " << name() << " failed for tuning param " << tuning_params[i]; | |||
| return ret; | |||
| } | |||
| double current_time = GetProfilingTimeMs(); | |||
| MS_LOG(DEBUG) << "Tuning " << name() << " param (" << tuning_params[i] << ") exectime " << current_time << "ms"; | |||
| if (current_time < min_time) { | |||
| min_time = current_time; | |||
| index = i; | |||
| } | |||
| } | |||
| if (index != -1) { | |||
| MS_LOG(INFO) << "Tuning " << name() << " result: param (" << tuning_params[index] << ") exectime " << min_time | |||
| << "ms"; | |||
| AssignTuningParam(tuning_params[index]); | |||
| } else { | |||
| MS_LOG(WARNING) << "Cannot find suitable param."; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| double GetProfilingTimeMs() { | |||
| if (!ocl_runtime_->isProfiling()) { | |||
| return MAX_PROFILING_TIME_MILLI_SECOND; | |||
| } | |||
| cl_ulong time_start; | |||
| cl_ulong time_end; | |||
| event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); | |||
| event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); | |||
| cl_ulong time_ns = time_end - time_start; | |||
| return static_cast<double>(time_ns) * 1e-6; | |||
| } | |||
| protected: | |||
| lite::opencl::OpenCLRuntime *ocl_runtime_; | |||
| lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; | |||
| cl::NDRange global_range_{cl::NullRange}; | |||
| cl::NDRange local_range_{cl::NullRange}; | |||
| std::vector<size_t> global_size_; // !!!To be deleted | |||
| std::vector<size_t> local_size_; // !!!To be deleted | |||
| std::vector<size_t> global_size_; | |||
| std::vector<size_t> local_size_; | |||
| cl::Kernel kernel_; | |||
| cl::Event event_; | |||
| static std::set<size_t> GenerateLocalByGlobal(size_t global_i) { | |||
| std::set<size_t> local_ = {}; | |||
| int index = 1; | |||
| while (index < global_i) { | |||
| local_.insert(index); | |||
| index *= 2; | |||
| } | |||
| for (size_t i = 1; i < 16; i++) { | |||
| if (global_i % i == 0) { | |||
| local_.insert(i); | |||
| } | |||
| } | |||
| return local_; | |||
| } | |||
| private: | |||
| lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | |||
| @@ -233,8 +369,8 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input | |||
| } | |||
| auto ret = kernel->CheckSpecs(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!"; | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| @@ -228,8 +228,17 @@ int OpenCLSubGraph::Init() { | |||
| MS_LOG(ERROR) << "OpenCL prepare fail"; | |||
| return ret; | |||
| } | |||
| MallocTensorWithReuse(); | |||
| auto opencl_exec = reinterpret_cast<lite::opencl::OpenCLExecutor *>(executor_); | |||
| ocl_runtime_->SetProfiling(true); | |||
| ret = opencl_exec->RunOrTune(in_tensors_, out_tensors_, nodes_, allocator_, nullptr, nullptr, true); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "Run opencl executor failed: " << ret; | |||
| return ret; | |||
| } | |||
| ocl_runtime_->SetProfiling(false); | |||
| #ifdef Debug | |||
| ocl_runtime_->SetProfiling(true); | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
| @@ -24,6 +24,12 @@ namespace mindspore::lite::opencl { | |||
| int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | |||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before, | |||
| const KernelCallBack &after) { | |||
| return RunOrTune(inputs, outputs, kernels, allocator, before, after, false); | |||
| } | |||
| int OpenCLExecutor::RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | |||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, | |||
| const KernelCallBack &before, const KernelCallBack &after, bool is_tune) { | |||
| int ret; | |||
| kernel::LiteKernelUtil::InitTensorRefCount(kernels); | |||
| for (auto *kernel : kernels) { | |||
| @@ -57,14 +63,26 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou | |||
| return ret; | |||
| } | |||
| } | |||
| output->set_allocator(allocator_); | |||
| } | |||
| if (is_tune) { | |||
| ret = op_kernel->Tune(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "tuning kernel failed, name: " << kernel->name(); | |||
| return ret; | |||
| } | |||
| } else { | |||
| ret = kernel->Run(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||
| return ret; | |||
| } | |||
| #ifdef Debug | |||
| MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str() | |||
| << ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms"; | |||
| ret = kernel->Run(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name(); | |||
| return ret; | |||
| #endif | |||
| } | |||
| if (after != nullptr) { | |||
| if (!after(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) { | |||
| MS_LOG(ERROR) << "run kernel after_callback failed, name: " << kernel->name(); | |||
| @@ -34,6 +34,9 @@ class OpenCLExecutor : public Executor { | |||
| int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels, | |||
| Allocator *allocator = nullptr, const KernelCallBack &before = nullptr, | |||
| const KernelCallBack &after = nullptr) override; | |||
| int RunOrTune(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, | |||
| std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator = nullptr, | |||
| const KernelCallBack &before = nullptr, const KernelCallBack &after = nullptr, bool is_tune = false); | |||
| protected: | |||
| InnerContext *context = nullptr; | |||
| @@ -230,12 +230,7 @@ int OpenCLRuntime::Init() { | |||
| MS_LOG(INFO) << "Compute Unit: " << compute_units_; | |||
| MS_LOG(INFO) << "Clock Frequency: " << max_freq_ << " MHz"; | |||
| #ifdef Debug | |||
| const cl_command_queue_properties properties = CL_QUEUE_PROFILING_ENABLE; | |||
| #else | |||
| const cl_command_queue_properties properties = 0; | |||
| #endif | |||
| default_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, properties, &ret); | |||
| if (ret != CL_SUCCESS) { | |||
| delete device_; | |||
| @@ -244,6 +239,16 @@ int OpenCLRuntime::Init() { | |||
| return RET_ERROR; | |||
| } | |||
| const cl_command_queue_properties profiling_properties = CL_QUEUE_PROFILING_ENABLE; | |||
| profiling_command_queue_ = new (std::nothrow) cl::CommandQueue(*context_, *device_, profiling_properties, &ret); | |||
| if (ret != CL_SUCCESS) { | |||
| delete device_; | |||
| delete context_; | |||
| delete default_command_queue_; | |||
| MS_LOG(ERROR) << "Profiling command Queue create failed: " << CLErrorCode(ret); | |||
| return RET_ERROR; | |||
| } | |||
| allocator_ = new (std::nothrow) OpenCLAllocator(this); | |||
| if (allocator_ == nullptr) { | |||
| delete device_; | |||
| @@ -473,15 +478,17 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> | |||
| } | |||
| // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. | |||
| int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | |||
| cl::CommandQueue *command_queue) { | |||
| cl::CommandQueue *command_queue, cl::Event *event) { | |||
| if (command_queue == nullptr) { | |||
| command_queue = default_command_queue_; | |||
| if (profiling_) { | |||
| command_queue = profiling_command_queue_; | |||
| } else { | |||
| command_queue = default_command_queue_; | |||
| } | |||
| } | |||
| MS_ASSERT(local.size() == 0 || local.size() == global.size()); | |||
| cl::Event event; | |||
| cl_int ret = CL_SUCCESS; | |||
| ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event); | |||
| ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, event); | |||
| if (ret != CL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); | |||
| return RET_ERROR; | |||
| @@ -496,15 +503,9 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global | |||
| } | |||
| cnt++; | |||
| MS_LOG(DEBUG) << "RunKernel success!"; | |||
| #ifdef Debug | |||
| event.wait(); | |||
| cl_ulong time_start; | |||
| cl_ulong time_end; | |||
| event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); | |||
| event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); | |||
| double nanoSeconds = time_end - time_start; | |||
| MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms"; | |||
| #endif | |||
| if (profiling_) { | |||
| event->wait(); | |||
| } | |||
| return RET_OK; | |||
| } | |||
| // get gpu divce type | |||
| @@ -32,6 +32,7 @@ j* you may not use this file except in compliance with the License. | |||
| namespace mindspore::lite::opencl { | |||
| enum GpuType { OTHER = 0, ADRENO = 1, MALI = 2, MALI_T = 3, MALI_G = 4 }; | |||
| enum TuningMode { DEFAULT = 0, FAST = 1, EXTREME = 2 }; | |||
| struct GpuInfo { | |||
| GpuType type = OTHER; | |||
| @@ -117,7 +118,7 @@ class OpenCLRuntime { | |||
| int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, | |||
| cl::CommandQueue *command_queue = nullptr); // !!!To be deleted | |||
| int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | |||
| cl::CommandQueue *command_queue = nullptr); | |||
| cl::CommandQueue *command_queue = nullptr, cl::Event *event = nullptr); | |||
| bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | |||
| bool sync = false) const; | |||
| bool CopyHostMemToDevice(const void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | |||
| @@ -139,10 +140,14 @@ class OpenCLRuntime { | |||
| * @return max_work_group_size | |||
| */ | |||
| int GetKernelMaxWorkGroupSize(cl_kernel kernel, cl_device_id device_id); | |||
| void SetTuningMode(TuningMode mode) { tuning_mode_ = mode; } | |||
| TuningMode GetTuningMode() const { return tuning_mode_; } | |||
| void InitGpuCache(); | |||
| int LoadCache(const void *buf); | |||
| void StoreCache(); | |||
| bool isProfiling() const { return profiling_; } | |||
| void SetProfiling(bool profiling) { profiling_ = profiling; } | |||
| private: | |||
| static OpenCLRuntime *GetInstance(); | |||
| @@ -158,6 +163,7 @@ class OpenCLRuntime { | |||
| static size_t instance_count_; | |||
| static OpenCLRuntime *ocl_runtime_instance_; | |||
| cl::CommandQueue *default_command_queue_{nullptr}; | |||
| cl::CommandQueue *profiling_command_queue_{nullptr}; | |||
| cl::Context *context_{nullptr}; | |||
| cl::Device *device_{nullptr}; | |||
| OpenCLAllocator *allocator_{nullptr}; | |||
| @@ -181,6 +187,8 @@ class OpenCLRuntime { | |||
| const std::string version_{"V0.1"}; | |||
| bool need_write_{false}; | |||
| bool enable_cache_{false}; | |||
| TuningMode tuning_mode_{TuningMode::DEFAULT}; | |||
| bool profiling_{false}; | |||
| }; | |||
| class OpenCLRuntimeWrapper { | |||