Merge pull request !8242 from wandongdong/devtags/v1.1.0
| @@ -1,8 +1,8 @@ | |||
| #pragma OPENCL EXTENSION cl_khr_fp16 : enable | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __write_only image2d_t dst_data, int2 kernel_size, int2 stride, int2 padding, | |||
| int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, | |||
| __kernel void DepthwiseConv2d_IMG_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, | |||
| __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, | |||
| float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| @@ -32,8 +32,8 @@ __kernel void DepthwiseConv2d_IMG_NC4HW4(__read_only image2d_t src_data, __globa | |||
| WRITE_IMAGE(dst_data, (int2)(X, (Z * dst_size.y + Y)), res); | |||
| } | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __write_only image2d_t dst_data, int2 kernel_size, int2 stride, | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__write_only image2d_t dst_data, __read_only image2d_t src_data, | |||
| __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(1) * 2; | |||
| @@ -126,8 +126,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b222(__read_only image2d_t src_data, __g | |||
| } | |||
| } | |||
| } | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __write_only image2d_t dst_data, int2 kernel_size, int2 stride, | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__write_only image2d_t dst_data, __read_only image2d_t src_data, | |||
| __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(1) * 2; | |||
| @@ -182,8 +182,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_b221(__read_only image2d_t src_data, __g | |||
| WRITE_IMAGE(dst_data, (int2)((X + 1) * dst_size.z + Z, Y + 1), r[3]); | |||
| } | |||
| } | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __write_only image2d_t dst_data, int2 kernel_size, int2 stride, | |||
| __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__write_only image2d_t dst_data, __read_only image2d_t src_data, | |||
| __global FLT4 *filter, __global FLT4 *bias, int2 kernel_size, int2 stride, | |||
| int2 padding, int2 dilation, int4 src_size, int4 dst_size, | |||
| float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| @@ -215,8 +215,8 @@ __kernel void DepthwiseConv2d_IMG_NHWC4_1x1(__read_only image2d_t src_data, __gl | |||
| WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z + 1, Y), r[1]); | |||
| } | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, | |||
| __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, | |||
| __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, | |||
| int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, | |||
| float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| @@ -247,10 +247,9 @@ __kernel void DepthwiseConv2d_BUF_NC4HW4(__global FLT4 *src_data, __global FLT4 | |||
| dst_data[(((Z)*dst_size.y + (Y)) * dst_size.x + (X))] = res; | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, | |||
| int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, | |||
| float relu_clip_max) { | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, | |||
| __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, int2 dilation, | |||
| int4 src_size, int4 dst_size, float relu_clip_min, float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| int Z = get_global_id(2); | |||
| @@ -279,8 +278,8 @@ __kernel void DepthwiseConv2d_BUF_NHWC4(__global FLT4 *src_data, __global FLT4 * | |||
| dst_data[((Y * dst_size.x + X) * dst_size.z + Z)] = res; | |||
| } | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *src_data, __global FLT4 *filter, __global FLT4 *bias, | |||
| __global FLT4 *dst_data, int2 kernel_size, int2 stride, int2 padding, | |||
| __kernel void DepthwiseConv2d_BUF_NHWC4_1x1(__global FLT4 *dst_data, __global FLT4 *src_data, __global FLT4 *filter, | |||
| __global FLT4 *bias, int2 kernel_size, int2 stride, int2 padding, | |||
| int2 dilation, int4 src_size, int4 dst_size, float relu_clip_min, | |||
| float relu_clip_max) { | |||
| int X = get_global_id(0); | |||
| @@ -1,62 +0,0 @@ | |||
| /** | |||
| * Copyright 2020 Huawei Technologies Co., Ltd | |||
| * | |||
| * Licensed under the Apache License, Version 2.0 (the "License"); | |||
| * you may not use this file except in compliance with the License. | |||
| * You may obtain a copy of the License at | |||
| * | |||
| * http://www.apache.org/licenses/LICENSE-2.0 | |||
| * | |||
| * Unless required by applicable law or agreed to in writing, software | |||
| * distributed under the License is distributed on an "AS IS" BASIS, | |||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
| * See the License for the specific language governing permissions and | |||
| * limitations under the License. | |||
| */ | |||
| #ifndef MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ | |||
| #define MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ | |||
| #include "src/runtime/opencl/opencl_runtime.h" | |||
| namespace mindspore { | |||
| namespace kernel { | |||
| /** | |||
| * MindSpore to OpenCL channel order. | |||
| * @param num_channels | |||
| * @return opencl_channels | |||
| */ | |||
| cl_channel_order ToChannelOrder(int num_channels) { | |||
| switch (num_channels) { | |||
| case 1: | |||
| return CL_R; | |||
| case 2: | |||
| return CL_RG; | |||
| case 3: | |||
| return CL_RGB; | |||
| case 4: | |||
| return CL_RGBA; | |||
| default: | |||
| return -1; | |||
| } | |||
| } | |||
| /** | |||
| * MindSpore image channel type to OpenCL channel data type. | |||
| * @param data_type | |||
| * @return opencl_data_type | |||
| */ | |||
| cl_channel_type ToImageChannelType(TypeId data_type) { | |||
| switch (data_type) { | |||
| case kNumberTypeFloat32: | |||
| return CL_FLOAT; | |||
| case kNumberTypeFloat16: | |||
| return CL_HALF_FLOAT; | |||
| default: | |||
| return -1; | |||
| } | |||
| } | |||
| } // namespace kernel | |||
| } // namespace mindspore | |||
| #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_OPENCL_KERNEL_IMAGE_FORMAT_H_ | |||
| @@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| using mindspore::schema::PrimitiveType_Eltwise; | |||
| namespace mindspore::kernel { | |||
| @@ -72,7 +73,7 @@ void ArithmeticOpenCLKernel::Image2dGetWorkGroupSize() { | |||
| } | |||
| } | |||
| int ArithmeticOpenCLKernel::InitBuffer() { | |||
| int ArithmeticOpenCLKernel::InitWeights() { | |||
| auto fp16_enable = ocl_runtime_->GetFp16Enable(); | |||
| auto data_size = fp16_enable ? sizeof(float16_t) : sizeof(float); | |||
| for (auto in_tensor_ : in_tensors_) { | |||
| @@ -255,7 +256,7 @@ int ArithmeticOpenCLKernel::Init() { | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| if (out_mem_type_ == OpenCLMemType::IMG) { | |||
| if (out_mem_type_ == MemType::IMG) { | |||
| kernel_name += "_IMG"; | |||
| } else { | |||
| kernel_name += "_BUF"; | |||
| @@ -271,7 +272,7 @@ int ArithmeticOpenCLKernel::Init() { | |||
| } | |||
| Image2dGetWorkGroupSize(); | |||
| InitBuffer(); | |||
| InitWeights(); | |||
| SetArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| @@ -32,7 +32,7 @@ class ArithmeticOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| int SetArgs(); | |||
| private: | |||
| @@ -35,7 +35,7 @@ using mindspore::schema::PrimitiveType_BiasAdd; | |||
| namespace mindspore::kernel { | |||
| int BiasAddOpenCLKernel::InitBuffer() { | |||
| int BiasAddOpenCLKernel::InitWeights() { | |||
| int C = in_tensors_[1]->shape()[0]; | |||
| int div_ci = UP_DIV(C, C4NUM); | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| @@ -70,7 +70,7 @@ int BiasAddOpenCLKernel::Init() { | |||
| MS_LOG(ERROR) << "BiasAdd weight channel size:" << Bias_Size << " must be equal with in_teneors channel size:" << C; | |||
| return mindspore::lite::RET_ERROR; | |||
| } | |||
| InitBuffer(); | |||
| InitWeights(); | |||
| std::set<std::string> build_options; | |||
| std::string source = biasadd_source; | |||
| std::string program_name = "BiasAdd"; | |||
| @@ -35,7 +35,7 @@ class BiasAddOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| private: | |||
| cl_int4 GetGlobalshape(); | |||
| @@ -101,7 +101,7 @@ int ConvolutionOpenCLKernel::Init() { | |||
| winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); | |||
| } | |||
| InitBuffer(); | |||
| InitWeights(); | |||
| MS_LOG(DEBUG) << "Convolution Init Done!"; | |||
| return RET_OK; | |||
| @@ -236,7 +236,7 @@ int ConvolutionOpenCLKernel::InitBias() { | |||
| return RET_OK; | |||
| } | |||
| int ConvolutionOpenCLKernel::InitBuffer() { | |||
| int ConvolutionOpenCLKernel::InitWeights() { | |||
| InitWeight(); | |||
| if (has_bias_) { | |||
| InitBias(); | |||
| @@ -360,10 +360,13 @@ int ConvolutionOpenCLKernel::Run() { | |||
| } | |||
| if (use_winograd_) { | |||
| ocl_runtime_->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_conv_, {size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}, {8, 6, 2}, | |||
| nullptr); | |||
| ocl_runtime_->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_4x4to36_, std::vector<size_t>({size_t(TILES_XY_), 6, size_t(CI_SLICES_)}), | |||
| std::vector<size_t>({8, 6, 4}), nullptr); | |||
| ocl_runtime_->RunKernel(kernel_conv_, | |||
| std::vector<size_t>({size_t(UP_DIV(TILES_XY_, 2)), 36, size_t(UP_DIV(CO_SLICES_, 2))}), | |||
| std::vector<size_t>({8, 6, 2}), nullptr); | |||
| ocl_runtime_->RunKernel(kernel_36to4x4_, std::vector<size_t>({size_t(TILES_XY_), 4, size_t(CO_SLICES_)}), | |||
| std::vector<size_t>({32, 4, 2}), nullptr); | |||
| } else { | |||
| ocl_runtime_->RunKernel(kernel_conv_, global_, local_, nullptr); | |||
| } | |||
| @@ -36,11 +36,11 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| void SetBlockSize(); | |||
| void SetGlobalLocal(); | |||
| int InitWeight(); | |||
| int InitBias(); | |||
| int GenerateWinogradWeight(); | |||
| @@ -37,13 +37,21 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| using mindspore::schema::PrimitiveType_DepthwiseConv2D; | |||
| namespace mindspore::kernel { | |||
| int DepthwiseConv2dOpenCLKernel::Init() { | |||
| int DepthwiseConv2dOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { | |||
| MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::Prepare() { | |||
| std::string kernel_name = "DepthwiseConv2d"; | |||
| if (out_mem_type_ == OpenCLMemType::BUF) { | |||
| if (out_mem_type_ == MemType::BUF) { | |||
| kernel_name += "_BUF"; | |||
| } else { | |||
| kernel_name += "_IMG"; | |||
| @@ -66,14 +74,14 @@ int DepthwiseConv2dOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| #endif | |||
| InitBuffer(); | |||
| GetGlobalSize(0, &global_size_); | |||
| GetLocalSize(0, global_size_, &local_size_); | |||
| InitWeights(); | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done! mem type=" << static_cast<int>(out_mem_type_); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::InitBuffer() { | |||
| int DepthwiseConv2dOpenCLKernel::InitWeights() { | |||
| auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| bool is_fp16 = ocl_runtime_->GetFp16Enable(); | |||
| @@ -138,28 +146,7 @@ int DepthwiseConv2dOpenCLKernel::InitBuffer() { | |||
| } | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); | |||
| std::vector<size_t> global = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||
| *global_size = std::move(global); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::GetLocalSize(size_t idx, const std::vector<size_t> &global_size, | |||
| std::vector<size_t> *local_size) { | |||
| const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | |||
| int z = global_size[0]; | |||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); | |||
| int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z))); | |||
| local_size->clear(); | |||
| *local_size = std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| int DepthwiseConv2dOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| void DepthwiseConv2dOpenCLKernel::SetConstArgs() { | |||
| auto parameter = reinterpret_cast<ConvParameter *>(op_parameter_); | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM); | |||
| size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM); | |||
| @@ -174,11 +161,9 @@ int DepthwiseConv2dOpenCLKernel::Run() { | |||
| cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4, | |||
| (cl_int)out_tensors_[0]->Batch()}; | |||
| int arg_cnt = 0; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, in_tensors_[0]->data_c()); | |||
| int arg_cnt = 2; | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, packed_weight_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, bias_data_, lite::opencl::MemType::BUF); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, kernel_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, stride); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, padding); | |||
| @@ -187,31 +172,31 @@ int DepthwiseConv2dOpenCLKernel::Run() { | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].first); | |||
| ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, relu_clips[parameter->act_type_].second); | |||
| ocl_runtime_->RunKernel(kernel_, global_size_, local_size_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| void DepthwiseConv2dOpenCLKernel::SetGlobalLocal() { | |||
| // set global | |||
| size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM * block_size_[2]); | |||
| std::vector<size_t> global_size = {CO4, (size_t)UP_DIV(out_tensors_[0]->Width(), block_size_[1]), | |||
| (size_t)UP_DIV(out_tensors_[0]->Height(), block_size_[0])}; | |||
| // set local | |||
| const int max_group_size = ocl_runtime_->DeviceMaxWorkGroupSize(); | |||
| int z = global_size[0]; | |||
| int y = std::min(max_group_size / z, GetMaxDivisorStrategy0(global_size[2], 8)); | |||
| int x = std::max(1, std::min(static_cast<int>(global_size[1]), max_group_size / (y * z))); | |||
| std::vector<size_t> local_size = | |||
| std::vector<size_t>({static_cast<size_t>(z), static_cast<size_t>(x), static_cast<size_t>(y)}); | |||
| OpenCLKernel::AlignGlobalLocal(global_size, local_size); | |||
| } | |||
| kernel::LiteKernel *OpenCLDepthwiseConv2dKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, | |||
| OpParameter *opParameter, const lite::InnerContext *ctx, | |||
| const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = | |||
| new (std::nothrow) DepthwiseConv2dOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| MS_LOG(ERROR) << "Init DepthwiseConv2dOpenCLKernel failed!"; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| int DepthwiseConv2dOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c()); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c()); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLDepthwiseConv2dKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator<DepthwiseConv2dOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_DepthwiseConv2D, OpenCLKernelCreator<DepthwiseConv2dOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -31,15 +31,13 @@ class DepthwiseConv2dOpenCLKernel : public OpenCLKernel { | |||
| ~DepthwiseConv2dOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int InitBuffer() override; | |||
| int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) override; | |||
| int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) override; | |||
| int CheckSpecs() override; | |||
| int InitWeights() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| void *packed_weight_{nullptr}; | |||
| @@ -57,7 +57,7 @@ int GatherOpenCLKernel::Init() { | |||
| return RET_OK; | |||
| } | |||
| int GatherOpenCLKernel::InitBuffer() { | |||
| int GatherOpenCLKernel::InitWeights() { | |||
| auto indices_tensor = in_tensors_.at(1); | |||
| int indices_num = indices_tensor->ElementsNum(); | |||
| bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32; | |||
| @@ -88,7 +88,7 @@ int GatherOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running! "; | |||
| auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_); | |||
| if (InitBuffer() != RET_OK) { | |||
| if (InitWeights() != RET_OK) { | |||
| return RET_ERROR; | |||
| } | |||
| auto input_shape = in_tensors_[0]->shape(); | |||
| @@ -33,7 +33,7 @@ class GatherOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| @@ -20,7 +20,6 @@ | |||
| #include "include/errorcode.h" | |||
| #include "src/kernel_registry.h" | |||
| #include "src/runtime/kernel/opencl/utils.h" | |||
| #include "src/runtime/kernel/opencl/image_format.h" | |||
| #ifndef PROGRAM_WITH_IL | |||
| #include "src/runtime/kernel/opencl/cl/avg_pool2d.cl.inc" | |||
| #include "src/runtime/kernel/opencl/cl/max_pool2d.cl.inc" | |||
| @@ -32,6 +31,7 @@ using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_INVALID_OP_NAME; | |||
| using mindspore::lite::RET_MEMORY_FAILED; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| using mindspore::schema::PrimitiveType_Pooling; | |||
| namespace mindspore { | |||
| @@ -73,7 +73,7 @@ int PoolingOpenCLKernel::Init() { | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| kernel_name += "_NHWC4"; | |||
| if (out_mem_type_ == OpenCLMemType::BUF) { | |||
| if (out_mem_type_ == MemType::BUF) { | |||
| MS_LOG(ERROR) << "buffer output not support yet."; | |||
| return mindspore::lite::RET_ERROR; | |||
| } else { | |||
| @@ -33,7 +33,7 @@ using mindspore::schema::PrimitiveType_PReLU; | |||
| namespace mindspore::kernel { | |||
| int PReluOpenCLKernel::InitBuffer() { | |||
| int PReluOpenCLKernel::InitWeights() { | |||
| auto allocator = ocl_runtime_->GetAllocator(); | |||
| auto weight_tensor = in_tensors_[1]; | |||
| if (weight_is_scalar) { | |||
| @@ -110,7 +110,7 @@ int PReluOpenCLKernel::Init() { | |||
| ocl_runtime_->LoadSource(program_name, source); | |||
| ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options); | |||
| InitBuffer(); | |||
| InitWeights(); | |||
| MS_LOG(DEBUG) << program_name << " init Done!"; | |||
| return mindspore::lite::RET_OK; | |||
| } | |||
| @@ -34,7 +34,7 @@ class PReluOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| private: | |||
| cl::Kernel kernel_; | |||
| @@ -30,6 +30,7 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| using mindspore::schema::PrimitiveType_Scale; | |||
| namespace mindspore::kernel { | |||
| @@ -52,7 +53,7 @@ void ScaleOpenCLKernel::Image2dGetWorkGroupSize() { | |||
| global_size_ = {image2d_info.width, image2d_info.height}; | |||
| } | |||
| int ScaleOpenCLKernel::InitBuffer() { | |||
| int ScaleOpenCLKernel::InitWeights() { | |||
| if (!weight_vector_flag_) { | |||
| return RET_OK; | |||
| } | |||
| @@ -178,7 +179,7 @@ int ScaleOpenCLKernel::Init() { | |||
| #ifdef PROGRAM_WITH_IL | |||
| kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name); | |||
| #else | |||
| if (out_mem_type_ == OpenCLMemType::IMG) { | |||
| if (out_mem_type_ == MemType::IMG) { | |||
| kernel_name += "_IMG"; | |||
| } else { | |||
| kernel_name += "_BUF"; | |||
| @@ -194,7 +195,7 @@ int ScaleOpenCLKernel::Init() { | |||
| } | |||
| Image2dGetWorkGroupSize(); | |||
| InitBuffer(); | |||
| InitWeights(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -32,7 +32,7 @@ class ScaleOpenCLKernel : public OpenCLKernel { | |||
| int Init() override; | |||
| int Run() override; | |||
| int InitBuffer() override; | |||
| int InitWeights() override; | |||
| private: | |||
| void Image2dGetWorkGroupSize(); | |||
| @@ -27,16 +27,40 @@ using mindspore::kernel::KERNEL_ARCH::kGPU; | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| using mindspore::schema::PrimitiveType_ToFormat; | |||
| namespace mindspore::kernel { | |||
| int ToFormatOpenCLKernel::Init() { | |||
| int ToFormatOpenCLKernel::CheckSpecs() { | |||
| if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) { | |||
| MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type(); | |||
| return RET_ERROR; | |||
| } | |||
| auto parameter = reinterpret_cast<OpenCLToFormatParameter *>(op_parameter_); | |||
| out_mem_type_ = parameter->out_mem_type; | |||
| return RET_OK; | |||
| } | |||
| void ToFormatOpenCLKernel::SetConstArgs() { | |||
| cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; | |||
| cl_int4 gsize{(cl_int)(N_ * H_), (cl_int)W_, (cl_int)UP_DIV(C_, C4NUM), 1}; | |||
| ocl_runtime_->SetKernelArg(kernel_, 2, gsize); | |||
| ocl_runtime_->SetKernelArg(kernel_, 3, shape); | |||
| } | |||
| void ToFormatOpenCLKernel::SetGlobalLocal() { | |||
| std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||
| std::vector<size_t> local = {8, 16, 3}; | |||
| size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| if (max_work_group_size < 384) { | |||
| local[2] = 1; | |||
| } | |||
| OpenCLKernel::AlignGlobalLocal(global, local); | |||
| } | |||
| int ToFormatOpenCLKernel::Prepare() { | |||
| std::map<TypeId, std::string> dtype_str{{kNumberTypeFloat32, "float"}, {kNumberTypeFloat16, "half"}}; | |||
| std::string kernel_name; | |||
| if (parameter->out_mem_type == OpenCLMemType::IMG) { | |||
| if (out_mem_type_ == MemType::IMG) { | |||
| kernel_name = "to_format_NHWC_to_NHWC4_IMG_" + dtype_str[in_tensors_[0]->data_type()]; | |||
| } else { | |||
| kernel_name = "to_format_NHWC4_to_NHWC_BUF_" + dtype_str[out_tensors_[0]->data_type()]; | |||
| @@ -54,6 +78,8 @@ int ToFormatOpenCLKernel::Init() { | |||
| #endif | |||
| InitNHWC(); | |||
| SetGlobalLocal(); | |||
| SetConstArgs(); | |||
| MS_LOG(DEBUG) << kernel_name << " Init Done!"; | |||
| return RET_OK; | |||
| } | |||
| @@ -86,43 +112,14 @@ int ToFormatOpenCLKernel::InitNHWC() { | |||
| int ToFormatOpenCLKernel::Run() { | |||
| MS_LOG(DEBUG) << this->name() << " Running!"; | |||
| std::vector<size_t> global = {N_ * H_, W_, UP_DIV(C_, C4NUM)}; | |||
| std::vector<size_t> local = {8, 16, 3}; | |||
| size_t max_work_group_size = ocl_runtime_->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime_->Device())()); | |||
| if (max_work_group_size < 384) { | |||
| local[2] = 1; | |||
| } | |||
| cl_int4 shape{(cl_int)N_, (cl_int)H_, (cl_int)W_, (cl_int)C_}; | |||
| cl_int4 gsize{(cl_int)global[0], (cl_int)global[1], (cl_int)global[2], 1}; | |||
| auto src_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; | |||
| auto dst_mem_type = (out_mem_type_ == OpenCLMemType::IMG) ? lite::opencl::MemType::IMG : lite::opencl::MemType::BUF; | |||
| auto src_mem_type = (out_mem_type_ == MemType::IMG) ? lite::opencl::MemType::BUF : lite::opencl::MemType::IMG; | |||
| auto dst_mem_type = out_mem_type_; | |||
| ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), src_mem_type); | |||
| ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), dst_mem_type); | |||
| ocl_runtime_->SetKernelArg(kernel_, 2, gsize); | |||
| ocl_runtime_->SetKernelArg(kernel_, 3, shape); | |||
| ocl_runtime_->RunKernel(kernel_, global, local, nullptr); | |||
| ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr); | |||
| return RET_OK; | |||
| } | |||
| kernel::LiteKernel *OpenCLToFormatKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) ToFormatOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->Init(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLToFormatKernelCreator) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_ToFormat, OpenCLKernelCreator<ToFormatOpenCLKernel>) | |||
| REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_ToFormat, OpenCLKernelCreator<ToFormatOpenCLKernel>) | |||
| } // namespace mindspore::kernel | |||
| @@ -30,8 +30,12 @@ class ToFormatOpenCLKernel : public OpenCLKernel { | |||
| : OpenCLKernel(parameter, inputs, outputs) {} | |||
| ~ToFormatOpenCLKernel() override = default; | |||
| int Init() override; | |||
| int Run() override; | |||
| int Prepare() override; | |||
| int CheckSpecs() override; | |||
| void SetConstArgs() override; | |||
| void SetGlobalLocal() override; | |||
| private: | |||
| int InitNHWC(); | |||
| @@ -27,13 +27,11 @@ using mindspore::lite::RET_OK; | |||
| namespace mindspore::kernel { | |||
| enum class OpenCLMemType { BUF, IMG }; | |||
| struct OpenCLToFormatParameter { | |||
| OpParameter op_parameter{}; | |||
| schema::Format src_format{schema::Format::Format_NHWC}; | |||
| schema::Format dst_format{schema::Format::Format_NHWC4}; | |||
| OpenCLMemType out_mem_type{OpenCLMemType::IMG}; | |||
| lite::opencl::MemType out_mem_type{lite::opencl::MemType::IMG}; | |||
| }; | |||
| struct Image2DInfo { | |||
| @@ -107,13 +105,52 @@ class OpenCLKernel : public LiteKernel { | |||
| ocl_runtime_ = ocl_runtime_wrap_.GetInstance(); | |||
| } | |||
| ~OpenCLKernel() override = default; | |||
| int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) { | |||
| std::vector<size_t> internal_global_ws = global; | |||
| for (size_t i = 0; i < local.size(); ++i) { | |||
| internal_global_ws[i] = UP_ROUND(global[i], local[i]); | |||
| } | |||
| int Init() override { return RET_ERROR; } | |||
| MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size(); | |||
| for (size_t i = 0; i < global.size(); i++) { | |||
| MS_LOG(DEBUG) << "global[" << i << "] = " << global[i]; | |||
| } | |||
| for (size_t i = 0; i < local.size(); i++) { | |||
| MS_LOG(DEBUG) << "local[" << i << "] = " << local[i]; | |||
| } | |||
| if (global.size() == 1) { | |||
| global_range_ = cl::NDRange(internal_global_ws[0]); | |||
| if (!local.empty()) { | |||
| local_range_ = cl::NDRange(local[0]); | |||
| } | |||
| } else if (global.size() == 2) { | |||
| global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1]); | |||
| if (!local.empty()) { | |||
| local_range_ = cl::NDRange(local[0], local[1]); | |||
| } | |||
| } else if (global.size() == 3) { | |||
| global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]); | |||
| if (!local.empty()) { | |||
| local_range_ = cl::NDRange(local[0], local[1], local[2]); | |||
| } | |||
| } else { | |||
| MS_LOG(ERROR) << "Not supported NDRange!"; | |||
| return RET_ERROR; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| int Init() override { return RET_ERROR; } // !!!To be deleted | |||
| int Prepare() override { return RET_OK; } | |||
| int PreProcess() override { return RET_ERROR; } | |||
| int ReSize() override { return RET_ERROR; } | |||
| int Run() override { return RET_ERROR; } | |||
| virtual int InitBuffer() { return RET_OK; } | |||
| virtual int CheckSpecs() { return RET_ERROR; } | |||
| virtual int InitWeights() { return RET_OK; } | |||
| virtual void SetConstArgs() {} | |||
| virtual void SetGlobalLocal() {} | |||
| virtual int GetGlobalSize(size_t idx, std::vector<size_t> *global_size) { return RET_ERROR; } | |||
| virtual int GetLocalSize(size_t idx, const std::vector<size_t> &global_size, std::vector<size_t> *local_size) { | |||
| return RET_ERROR; | |||
| @@ -128,18 +165,39 @@ class OpenCLKernel : public LiteKernel { | |||
| return RET_OK; | |||
| } | |||
| OpenCLMemType GetMemType() { return out_mem_type_; } | |||
| void SetMemType(OpenCLMemType mem_type) { out_mem_type_ = mem_type; } | |||
| lite::opencl::MemType GetMemType() { return out_mem_type_; } | |||
| void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; } | |||
| protected: | |||
| lite::opencl::OpenCLRuntime *ocl_runtime_; | |||
| OpenCLMemType out_mem_type_{OpenCLMemType::IMG}; | |||
| std::vector<size_t> global_size_; | |||
| std::vector<size_t> local_size_; | |||
| lite::opencl::MemType out_mem_type_{lite::opencl::MemType::IMG}; | |||
| cl::NDRange global_range_{cl::NullRange}; | |||
| cl::NDRange local_range_{cl::NullRange}; | |||
| std::vector<size_t> global_size_; // !!!To be deleted | |||
| std::vector<size_t> local_size_; // !!!To be deleted | |||
| private: | |||
| lite::opencl::OpenCLRuntimeWrapper ocl_runtime_wrap_; | |||
| }; | |||
| template <class T> | |||
| kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &inputs, | |||
| const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter, | |||
| const lite::InnerContext *ctx, const kernel::KernelKey &desc, | |||
| const mindspore::lite::PrimitiveC *primitive) { | |||
| auto *kernel = new (std::nothrow) T(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs); | |||
| if (kernel == nullptr) { | |||
| MS_LOG(ERROR) << "kernel " << opParameter->name_ << "is nullptr."; | |||
| free(opParameter); | |||
| return nullptr; | |||
| } | |||
| auto ret = kernel->CheckSpecs(); | |||
| if (ret != mindspore::lite::RET_OK) { | |||
| delete kernel; | |||
| MS_LOG(ERROR) << "Init " << opParameter->name_ << " failed!"; | |||
| return nullptr; | |||
| } | |||
| return kernel; | |||
| } | |||
| } // namespace mindspore::kernel | |||
| #endif // MINDSPORE_LITE_SRC_OPENCL_KERNEL_H_ | |||
| @@ -24,20 +24,22 @@ | |||
| namespace mindspore::kernel { | |||
| using mindspore::lite::RET_ERROR; | |||
| using mindspore::lite::RET_OK; | |||
| using mindspore::lite::opencl::MemType; | |||
| SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); } | |||
| int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( | |||
| const std::vector<lite::Tensor *> &in_tensors, const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels, | |||
| OpenCLMemType mem_type) { | |||
| MemType mem_type) { | |||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||
| for (auto &jv : in_kernels.at(i)) { | |||
| auto tensors = (mem_type == OpenCLMemType::IMG) ? jv->in_tensors() : jv->out_tensors(); | |||
| auto tensors = (mem_type == MemType::IMG) ? jv->in_tensors() : jv->out_tensors(); | |||
| auto ft = std::find_if(tensors.begin(), tensors.end(), | |||
| [&in_tensors, &i](lite::Tensor *kv) { return kv == in_tensors.at(i); }); | |||
| if (ft != tensors.end()) { | |||
| *ft = nullptr; | |||
| } | |||
| auto kernels = (mem_type == OpenCLMemType::IMG) ? jv->in_kernels() : jv->out_kernels(); | |||
| auto kernels = (mem_type == MemType::IMG) ? jv->in_kernels() : jv->out_kernels(); | |||
| std::replace_if( | |||
| kernels.begin(), kernels.end(), | |||
| [this, &in_tensors, &i](kernel::LiteKernel *kv) { | |||
| @@ -47,7 +49,7 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( | |||
| this->nodes_set_.count(kv) == 0; | |||
| }, | |||
| nullptr); | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| if (mem_type == MemType::IMG) { | |||
| jv->set_in_tensors(tensors); | |||
| jv->SetInKernel(kernels); | |||
| } else { | |||
| @@ -61,25 +63,24 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull( | |||
| int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor, | |||
| const std::vector<kernel::LiteKernel *> &in_kernels, | |||
| lite::Tensor *new_tensor, | |||
| kernel::LiteKernel *in_convert_op, | |||
| OpenCLMemType mem_type) { | |||
| kernel::LiteKernel *in_convert_op, MemType mem_type) { | |||
| auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op); | |||
| for (auto &iv : in_kernels) { | |||
| auto kernels = (mem_type == OpenCLMemType::IMG) ? iv->in_kernels() : iv->out_kernels(); | |||
| auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels(); | |||
| auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == nullptr; }); | |||
| if (fk != kernels.end()) { | |||
| *fk = in_convert_op; | |||
| } else { | |||
| kernels.emplace_back(in_convert_op); | |||
| } | |||
| auto tensors = (mem_type == OpenCLMemType::IMG) ? iv->in_tensors() : iv->out_tensors(); | |||
| auto tensors = (mem_type == MemType::IMG) ? iv->in_tensors() : iv->out_tensors(); | |||
| auto ft = std::find_if(tensors.begin(), tensors.end(), [&](lite::Tensor *kv) { return kv == nullptr; }); | |||
| if (ft != tensors.end()) { | |||
| *ft = new_tensor; | |||
| } else { | |||
| tensors.emplace_back(new_tensor); | |||
| } | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| if (mem_type == MemType::IMG) { | |||
| iv->SetInKernel(kernels); | |||
| iv->set_in_tensors(tensors); | |||
| in_opencl_op->AddOutKernel(iv); | |||
| @@ -95,22 +96,22 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels, | |||
| std::vector<lite::Tensor *> *out_tensors, | |||
| std::vector<OpenCLToFormatParameter *> *out_parameters, | |||
| std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type) { | |||
| std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) { | |||
| out_tensors->clear(); | |||
| out_parameters->clear(); | |||
| out_convert_ops->clear(); | |||
| MS_ASSERT(in_tensors.size() == to_kernels.size()); | |||
| MS_ASSERT(in_tensors.size() == from_kernels.size()); | |||
| std::vector<std::vector<kernel::LiteKernel *>> loop_kernels; | |||
| if (mem_type == OpenCLMemType::BUF) { | |||
| if (mem_type == MemType::BUF) { | |||
| GetKernelFromToTensor(in_tensors, nodes_, &loop_kernels, true); | |||
| } | |||
| ReplaceOutTensorAndKernelToNull(in_tensors, in_kernels, mem_type); | |||
| for (size_t i = 0; i < in_tensors.size(); ++i) { | |||
| auto dst_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; | |||
| auto src_format = (mem_type == OpenCLMemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; | |||
| auto dst_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC4 : schema::Format::Format_NHWC; | |||
| auto src_format = (mem_type == MemType::IMG) ? schema::Format::Format_NHWC : schema::Format::Format_NHWC4; | |||
| auto *new_tensor = new (std::nothrow) lite::Tensor(); | |||
| MS_ASSERT(new_tensor); | |||
| if (new_tensor == nullptr) { | |||
| @@ -118,7 +119,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| return RET_ERROR; | |||
| } | |||
| new_tensor->CopyTensor(*in_tensors[i]); | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| if (mem_type == MemType::IMG) { | |||
| new_tensor->SetFormat(dst_format); | |||
| in_tensors[i]->SetFormat(src_format); | |||
| } else { | |||
| @@ -128,7 +129,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| out_tensors->emplace_back(new_tensor); | |||
| KernelKey desc{kGPU, kNumberTypeFloat32, schema::PrimitiveType_ToFormat}; | |||
| if (mem_type == OpenCLMemType::IMG && ocl_runtime_->GetFp16Enable()) { | |||
| if (mem_type == MemType::IMG && ocl_runtime_->GetFp16Enable()) { | |||
| desc.data_type = kNumberTypeFloat16; | |||
| new_tensor->set_data_type(kNumberTypeFloat16); | |||
| } | |||
| @@ -146,7 +147,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| parameter->out_mem_type = mem_type; | |||
| out_parameters->emplace_back(parameter); | |||
| LiteKernel *in_convert_op = nullptr; | |||
| if (mem_type == OpenCLMemType::IMG) { | |||
| if (mem_type == MemType::IMG) { | |||
| in_convert_op = lite::GetOpenCLKernel({in_tensors[i]}, {new_tensor}, reinterpret_cast<OpParameter *>(parameter), | |||
| context_, desc); | |||
| } else { | |||
| @@ -166,7 +167,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te | |||
| ReplaceOutTensorAndKernelToConvert(in_tensors.at(i), in_kernels.at(i), new_tensor, in_convert_op, mem_type); | |||
| // replace in_tensor of inner kernel which use out tensor | |||
| if (mem_type == OpenCLMemType::BUF) { | |||
| if (mem_type == MemType::BUF) { | |||
| for (auto &iv : loop_kernels[i]) { | |||
| auto tensors = iv->in_tensors(); | |||
| auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i)); | |||
| @@ -196,8 +197,8 @@ int SubGraphOpenCLKernel::Init() { | |||
| std::vector<std::vector<kernel::LiteKernel *>> from_kernels_; | |||
| GetKernelFromToTensor(in_tensors_, in_nodes_, &from_kernels_, true); | |||
| int ret = GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, | |||
| OpenCLMemType::IMG); | |||
| int ret = | |||
| GenToFormatOp(in_tensors_, from_kernels_, &in_convert_tensors_, &in_parameters_, &in_convert_ops_, MemType::IMG); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| @@ -205,8 +206,8 @@ int SubGraphOpenCLKernel::Init() { | |||
| std::vector<std::vector<kernel::LiteKernel *>> to_kernels_; | |||
| GetKernelFromToTensor(out_tensors_, out_nodes_, &to_kernels_, false); | |||
| ret = GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, | |||
| OpenCLMemType::BUF); | |||
| ret = | |||
| GenToFormatOp(out_tensors_, to_kernels_, &out_convert_tensors_, &out_parameters_, &out_convert_ops_, MemType::BUF); | |||
| if (ret != RET_OK) { | |||
| return ret; | |||
| } | |||
| @@ -216,6 +217,11 @@ int SubGraphOpenCLKernel::Init() { | |||
| MallocTensorWithReuse(); | |||
| ret = SubGraphKernel::Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "OpenCL prepare fail"; | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -246,7 +252,7 @@ int SubGraphOpenCLKernel::MallocTensorWithReuse() { | |||
| for (auto i = 0; i < outputs.size(); ++i) { | |||
| auto *output = outputs.at(i); | |||
| MS_ASSERT(nullptr != output); | |||
| if (op_kernel->GetMemType() == OpenCLMemType::IMG) { | |||
| if (op_kernel->GetMemType() == MemType::IMG) { | |||
| std::vector<size_t> img_size; | |||
| op_kernel->GetImageSize(i, &img_size); | |||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||
| @@ -328,11 +334,6 @@ int SubGraphOpenCLKernel::Prepare() { | |||
| MS_LOG(ERROR) << "OpenCL subgraph init fail"; | |||
| return ret; | |||
| } | |||
| ret = SubGraphKernel::Prepare(); | |||
| if (ret != RET_OK) { | |||
| MS_LOG(ERROR) << "OpenCL prepare fail"; | |||
| return ret; | |||
| } | |||
| return RET_OK; | |||
| } | |||
| @@ -61,15 +61,15 @@ class SubGraphOpenCLKernel : public SubGraphKernel { | |||
| int MallocTensorWithReuse(); | |||
| int ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors, | |||
| const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels, | |||
| OpenCLMemType mem_type); | |||
| lite::opencl::MemType mem_type); | |||
| int ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor, | |||
| const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor, | |||
| kernel::LiteKernel *in_convert_op, OpenCLMemType mem_type); | |||
| kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type); | |||
| int GetInOutNodes(); | |||
| int GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors, | |||
| const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels, | |||
| std::vector<lite::Tensor *> *out_tensors, std::vector<OpenCLToFormatParameter *> *out_parameters, | |||
| std::vector<LiteKernel *> *out_convert_ops, OpenCLMemType mem_type); | |||
| std::vector<LiteKernel *> *out_convert_ops, lite::opencl::MemType mem_type); | |||
| int GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors, | |||
| const std::vector<kernel::LiteKernel *> &in_kernels, | |||
| std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from); | |||
| @@ -24,6 +24,7 @@ | |||
| #include "src/common/file_utils.h" | |||
| using mindspore::lite::KernelRegistrar; | |||
| using mindspore::lite::opencl::MemType; | |||
| namespace mindspore::lite { | |||
| kernel::LiteKernel *GetOpenCLKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors, | |||
| @@ -243,20 +244,20 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) { | |||
| return 0; | |||
| } | |||
| void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, const std::string &out_file) { | |||
| void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) { | |||
| if (tensor->data_c() == nullptr) { | |||
| return; | |||
| } | |||
| Image2DInfo img_info(tensor); | |||
| auto size = mem_type == OpenCLMemType::BUF ? img_info.OriginSize : img_info.Image2DSize; | |||
| auto size = mem_type == MemType::BUF ? img_info.OriginSize : img_info.Image2DSize; | |||
| std::vector<char> data(size); | |||
| auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper(); | |||
| auto runtime = runtime_wrapper.GetInstance(); | |||
| auto allocator = runtime->GetAllocator(); | |||
| runtime->SyncCommandQueue(); | |||
| allocator->MapBuffer(tensor->data_c(), CL_MAP_READ, nullptr, true); | |||
| if (mem_type == OpenCLMemType::BUF) { | |||
| if (mem_type == MemType::BUF) { | |||
| memcpy(data.data(), tensor->data_c(), img_info.OriginSize); | |||
| } else { | |||
| auto row_size = img_info.width * img_info.FLT4_size; | |||
| @@ -277,7 +278,7 @@ void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n, cons | |||
| } | |||
| printf(") "); | |||
| auto num = mem_type == OpenCLMemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; | |||
| auto num = mem_type == MemType::BUF ? img_info.ElementsNum : img_info.ElementsC4Num; | |||
| for (int i = 0; i < n && i < num; ++i) { | |||
| if (tensor->data_type() == kNumberTypeFloat16) { | |||
| printf("%d %7.3f | ", i, reinterpret_cast<float16_t *>(data.data())[i]); | |||
| @@ -48,7 +48,8 @@ std::string CLErrorCode(cl_int error_code); | |||
| int WriteToBin(const std::string &file_path, void *data, size_t size); | |||
| void PrintTensor(const lite::Tensor *tensor, OpenCLMemType mem_type, int n = 10, const std::string &out_file = ""); | |||
| void PrintTensor(const lite::Tensor *tensor, lite::opencl::MemType mem_type, int n = 10, | |||
| const std::string &out_file = ""); | |||
| void PrintKernelOutput(OpenCLKernel *kernel, int n = 10, const std::string &out_file = ""); | |||
| @@ -41,7 +41,7 @@ struct OpenclMemory { | |||
| }; | |||
| class OpenCLRuntime; | |||
| enum class MemType : char { SVM, BUF, IMG }; | |||
| enum class MemType : char { BUF, IMG }; | |||
| class OpenCLAllocator : public Allocator { | |||
| public: | |||
| @@ -42,7 +42,7 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou | |||
| for (auto i = 0; i < cur_outputs.size(); ++i) { | |||
| auto *output = cur_outputs.at(i); | |||
| MS_ASSERT(nullptr != output); | |||
| if (op_kernel->GetMemType() == kernel::OpenCLMemType::IMG) { | |||
| if (op_kernel->GetMemType() == lite::opencl::MemType::IMG) { | |||
| std::vector<size_t> img_size; | |||
| op_kernel->GetImageSize(i, &img_size); | |||
| auto data_ptr = allocator_->Malloc(output->Size(), img_size); | |||
| @@ -416,7 +416,39 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t> | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
| // Run Kernel with 1D, 2D, 3D group size, and local size can be empty. | |||
| int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | |||
| cl::CommandQueue *command_queue) { | |||
| if (command_queue == nullptr) { | |||
| command_queue = default_command_queue_; | |||
| } | |||
| MS_ASSERT(local.size() == 0 || local.size() == global.size()); | |||
| cl::Event event; | |||
| cl_int ret = CL_SUCCESS; | |||
| ret = command_queue->enqueueNDRangeKernel(kernel, cl::NullRange, global, local, nullptr, &event); | |||
| if (ret != CL_SUCCESS) { | |||
| MS_LOG(ERROR) << "Kernel execute failed:" << CLErrorCode(ret); | |||
| return RET_ERROR; | |||
| } | |||
| static int cnt = 0; | |||
| const int flush_period = 10; | |||
| if (cnt % flush_period == 0) { | |||
| command_queue->flush(); | |||
| } | |||
| cnt++; | |||
| MS_LOG(DEBUG) << "RunKernel success!"; | |||
| #if MS_OPENCL_PROFILE | |||
| event.wait(); | |||
| cl_ulong time_start; | |||
| cl_ulong time_end; | |||
| event.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start); | |||
| event.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end); | |||
| double nanoSeconds = time_end - time_start; | |||
| MS_LOG(INFO) << "OpenCl Execution time is: " << nanoSeconds / 1000000.0 << "ms"; | |||
| #endif | |||
| return RET_OK; | |||
| } | |||
| // get gpu divce type | |||
| GpuInfo OpenCLRuntime::ParseGpuInfo(std::string device_name, std::string device_version) { | |||
| GpuInfo info; | |||
| @@ -73,11 +73,12 @@ class OpenCLRuntime { | |||
| const T value, | |||
| const MemType mem_type = MemType::IMG) { | |||
| switch (mem_type) { | |||
| case MemType::SVM: { | |||
| MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; | |||
| return kernel.setArg(index, value); | |||
| } | |||
| case MemType::BUF: { | |||
| auto svm_capabilities = GetSVMCapabilities(); | |||
| if (svm_capabilities) { | |||
| MS_LOG(DEBUG) << "Set kernel arg[" << index << "] SVM pointer " << value; | |||
| return kernel.setArg(index, value); | |||
| } | |||
| cl::Buffer *buffer = reinterpret_cast<cl::Buffer *>(allocator_->GetBuffer(value)); | |||
| MS_LOG(DEBUG) << "Set kernel arg[" << index << "] OpenCL Buffer " << buffer << ", host_ptr: " << value; | |||
| return kernel.setArg(index, *buffer); | |||
| @@ -113,6 +114,8 @@ class OpenCLRuntime { | |||
| int BuildKernel(cl::Kernel &kernel, const std::string &program_name, const std::string &kernel_name, | |||
| const std::set<std::string> &build_options); | |||
| int RunKernel(const cl::Kernel &kernel, const std::vector<size_t> &global, const std::vector<size_t> &local, | |||
| cl::CommandQueue *command_queue); // !!!To be deleted | |||
| int RunKernel(const cl::Kernel &kernel, const cl::NDRange &global, const cl::NDRange &local, | |||
| cl::CommandQueue *command_queue); | |||
| bool CopyDeviceMemToHost(void *dst, const void *src, size_t size, cl::CommandQueue *command_queue = nullptr, | |||
| bool sync = false) const; | |||