@@ -120,7 +120,10 @@ int Conv2DOpenCLKernel::Prepare() {
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
   }
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   return RET_OK;
@@ -256,8 +259,16 @@ int Conv2DOpenCLKernel::InitBias() {
 }
 
 int Conv2DOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2D doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   InitFilter();
   if (has_bias_) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "Conv2D doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     InitBias();
   }
   return RET_OK;
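
All of the kernels touched here follow the same two-part pattern: InitWeights() rejects filter and bias tensors that are not constant, and Prepare() checks the return value of InitWeights() instead of discarding it. A minimal standalone sketch of that pattern, using hypothetical stand-in types rather than the real MindSpore Lite classes (Tensor, ConvKernel and the tensor layout in the comments are made up for the example):

#include <iostream>
#include <vector>

constexpr int RET_OK = 0;
constexpr int RET_ERROR = -1;

// Hypothetical stand-in for a tensor that knows whether its data is constant.
struct Tensor {
  bool is_const = false;
  bool IsConst() const { return is_const; }
};

// Hypothetical stand-in for an OpenCL kernel wrapper.
struct ConvKernel {
  std::vector<Tensor> in_tensors_;  // [0] input, [1] weight, [2] optional bias

  int InitWeights() {
    // Weights are packed into device buffers once, at prepare time, so they
    // must already be constant here.
    if (!in_tensors_.at(1).IsConst()) {
      std::cerr << "non-constant filter is not supported yet" << std::endl;
      return RET_ERROR;
    }
    if (in_tensors_.size() > 2 && !in_tensors_.at(2).IsConst()) {
      std::cerr << "non-constant bias is not supported yet" << std::endl;
      return RET_ERROR;
    }
    return RET_OK;  // ...pack weight/bias into device buffers here...
  }

  int Prepare() {
    auto ret = InitWeights();
    if (ret != RET_OK) {
      return ret;  // propagate the failure instead of discarding the status
    }
    return RET_OK;  // ...set kernel args and work sizes here...
  }
};

int main() {
  ConvKernel k{{Tensor{}, Tensor{}}};     // in_tensors_[1] (the weight) is not constant
  std::cout << k.Prepare() << std::endl;  // prints -1: Prepare() fails cleanly
  return 0;
}

The guard sits before any packing work, presumably because the filter and bias are copied into OpenCL buffers exactly once during Prepare(); a tensor whose data only materializes at run time would otherwise never be re-packed.
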
@@ -52,7 +52,10 @@ int Conv2dTransposeOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -102,6 +105,10 @@ void Conv2dTransposeOpenCLKernel::SetConstArgs() {
 }
 
 int Conv2dTransposeOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "Conv2dTranspose doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   ConvParameter *param = reinterpret_cast<ConvParameter *>(op_parameter_);
   int ci = in_tensors_[0]->shape()[3];
   int co = out_tensors_[0]->shape()[3];
@@ -171,6 +178,10 @@ int Conv2dTransposeOpenCLKernel::InitWeights() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, div_co * C4NUM * data_size);
   if (in_tensors_.size() >= 3) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "Conv2dTranspose doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     auto bias_dtype = in_tensors_[2]->data_type();
     if (bias_dtype == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
@@ -73,7 +73,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetGlobalLocal();
   SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_);
@@ -81,6 +84,10 @@ int DepthwiseConv2dOpenCLKernel::Prepare() {
 }
 
 int DepthwiseConv2dOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(1)->IsConst()) {
+    MS_LOG(ERROR) << "DepthwiseConv2d doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_);
   auto allocator = ocl_runtime_->GetAllocator();
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
@@ -122,6 +129,10 @@ int DepthwiseConv2dOpenCLKernel::InitWeights() {
   allocator->UnmapBuffer(packed_weight_);
 
   if (in_tensors_.size() == kInputSize2) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "DepthwiseConv2d doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     size_t dtype_size = sizeof(float);
     if (is_fp16 && in_tensors_.at(kBiasIndex)->data_type() == kNumberTypeFloat16) {
       dtype_size = sizeof(int16_t);
@@ -81,7 +81,10 @@ int FullConnectionOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetConstArgs();
   SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -89,6 +92,10 @@ int FullConnectionOpenCLKernel::Prepare() {
 }
 
 int FullConnectionOpenCLKernel::InitWeights() {
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "FullConnection doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape.C;
   int ci4 = UP_DIV(ci, C4NUM);
@@ -96,7 +103,6 @@ int FullConnectionOpenCLKernel::InitWeights() {
   int co4 = UP_DIV(co, C4NUM);
   int h = inShape.H;
   int w = inShape.W;
   size_t dtype_size = enable_fp16_ ? sizeof(uint16_t) : sizeof(float);
   padWeight_ = allocator->Malloc(h * w * ci4 * co4 * C4NUM * C4NUM * dtype_size);
   padWeight_ = allocator->MapBuffer(padWeight_, CL_MAP_WRITE, nullptr, true);
@@ -162,6 +168,10 @@ int FullConnectionOpenCLKernel::InitWeights() {
   bias_ = allocator->MapBuffer(bias_, CL_MAP_WRITE, nullptr, true);
   memset(bias_, 0x00, co4 * C4NUM * dtype_size);
   if (in_tensors_.size() >= 3) {
+    if (!in_tensors_.at(2)->IsConst()) {
+      MS_LOG(ERROR) << "FullConnection doesn't support non-constant bias yet.";
+      return RET_ERROR;
+    }
     if (in_tensors_[2]->data_type() == kNumberTypeFloat32 && enable_fp16_) {
       for (int i = 0; i < co; i++) {
         reinterpret_cast<float16_t *>(bias_)[i] = reinterpret_cast<float *>(in_tensors_[2]->data_c())[i];
@@ -64,7 +64,10 @@ int MatMulOpenCLKernel::Prepare() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name);
 #endif
-  InitWeights();
+  auto ret = InitWeights();
+  if (ret != RET_OK) {
+    return ret;
+  }
   SetConstArgs();
   SetGlobalLocal();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
@@ -73,6 +76,10 @@ int MatMulOpenCLKernel::Prepare() {
 
 int MatMulOpenCLKernel::InitWeights() {
   // ABMCI @ ABCICO = ABMCO
+  if (!in_tensors_.at(kWeightIndex)->IsConst()) {
+    MS_LOG(ERROR) << "MatMul doesn't support non-constant filter yet.";
+    return RET_ERROR;
+  }
   auto allocator = ocl_runtime_->GetAllocator();
   int ci = inShape[3];
   int ci4 = UP_DIV(ci, C4NUM);
@@ -39,8 +39,6 @@ class PoolingOpenCLKernel : public OpenCLKernel {
 
  private:
   PoolingParameter *parameter_;
-  std::vector<size_t> local_size_;
-  std::vector<size_t> global_size_;
 };
 
 }  // namespace mindspore::kernel
@@ -245,12 +245,12 @@ class OpenCLKernel : public LiteKernel {
       candidate_z = GenerateLocalByGlobal(global_size_[2]);
     }
     for (auto x : candidate_x) {
-      if (x < max_work_items[0]) {
+      if (x <= max_work_items[0]) {
         for (auto y : candidate_y) {
-          if (y < max_work_items[1]) {
+          if (y <= max_work_items[1]) {
             for (auto z : candidate_z) {
               auto group_size = x * y * z;
-              if (z < max_work_items[2] && group_size < max_workgroup_size && group_size > MIN_WORKGROUP_SIZE) {
+              if (z <= max_work_items[2] && group_size <= max_workgroup_size && group_size >= MIN_WORKGROUP_SIZE) {
                 BaseTuningParameter tuning_param = BaseTuningParameter();
                 tuning_param.local_size = {x, y, z};
                 tuning_params.push_back(tuning_param);
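
These comparisons were previously strict, which silently dropped any tuning candidate sitting exactly on a device limit: a local size equal to the per-dimension maximum, a work-group exactly as large as max_workgroup_size, or one exactly at MIN_WORKGROUP_SIZE. A small standalone check illustrating the difference; the limits are hard-coded for the example rather than queried from the OpenCL runtime:

#include <array>
#include <cstddef>
#include <cstdio>

// Hypothetical standalone version of the candidate filter; the device limits
// are made up for illustration, not taken from a real device.
bool FitsDeviceLimits(std::array<std::size_t, 3> local, std::array<std::size_t, 3> max_items,
                      std::size_t max_group_size, std::size_t min_group_size) {
  std::size_t group_size = local[0] * local[1] * local[2];
  return local[0] <= max_items[0] && local[1] <= max_items[1] && local[2] <= max_items[2] &&
         group_size <= max_group_size && group_size >= min_group_size;
}

int main() {
  // {16, 16, 1} = 256 work-items: rejected by the old strict '<' comparison when
  // the device maximum work-group size is exactly 256, accepted with '<='.
  std::printf("%d\n", FitsDeviceLimits({16, 16, 1}, {256, 256, 64}, 256, 8));
  return 0;
}
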
@@ -341,11 +341,11 @@ class OpenCLKernel : public LiteKernel {
   static std::set<size_t> GenerateLocalByGlobal(size_t global_i) {
     std::set<size_t> local_ = {};
     int index = 1;
-    while (index < global_i) {
+    while (index <= global_i) {
      local_.insert(index);
      index *= 2;
    }
-    for (size_t i = 1; i < 16; i++) {
+    for (size_t i = 1; i <= 16; i++) {
      if (global_i % i == 0) {
        local_.insert(i);
      }