diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
index 83388d1d83..cfd4c7d1ca 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
@@ -1,6 +1,6 @@
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
-__kernel void gather_NHWC4(__read_only image2d_t src_data, __global int *indices, __write_only image2d_t dst_data,
+__kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
                            int4 src_size, int4 dst_size, int indices_num, int axis) {
   int X = get_global_id(0);  // w
   int Y = get_global_id(1);  // n*h
@@ -41,7 +41,7 @@ __kernel void gather_NHWC4(__read_only image2d_t src_data, __global int *indices
   WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);
 }
 
-__kernel void gather_NC4HW4(__read_only image2d_t src_data, __global int *indices, __write_only image2d_t dst_data,
+__kernel void gather_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
                             int4 src_size, int4 dst_size, int indices_num, int axis) {
   int X = get_global_id(0);  // w
   int Y = get_global_id(1);  // n*h
diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
index 18a69dfcf8..dfa8e25f8f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
@@ -27,7 +27,7 @@ __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d
   int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y);
   gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM;
   start = start % src_size.x % C4NUM;
-  for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {
+  for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {
     int X_src = (gcnt + n) % in_img_x;
     res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x));
     tmp[0] = res.x;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
index 754df35b31..a8720ad60f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
@@ -31,8 +31,11 @@ using mindspore::schema::PrimitiveType_BatchToSpaceND;
 
 namespace mindspore::kernel {
 
-int BatchToSpaceNDOpenCLKernel::Init() {
-  std::string kernel_name = "batch_to_space_nd_NHWC4";
+int BatchToSpaceNDOpenCLKernel::CheckSpecs() {
+  if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
+    return RET_ERROR;
+  }
   if (in_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 4) {
     MS_LOG(ERROR) << "input/output shape size must be 4, actual: " << in_tensors_[0]->shape().size() << ", "
                   << out_tensors_[0]->shape().size();
@@ -48,6 +51,37 @@ int BatchToSpaceNDOpenCLKernel::Init() {
     MS_LOG(ERROR) << "crop shape error!";
     return RET_ERROR;
   }
+  return RET_OK;
+}
+
+void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
+  auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
+  cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height() * out_tensors_[0]->Batch(), 1};
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
+  cl_int2 block_size = {param->block_shape_[0], param->block_shape_[1]};
+  cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};
+
+  int arg_cnt = 2;
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
+}
+
+void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  std::vector<int> out_shape = out_tensors_[0]->shape();
+  cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
+  std::vector<size_t> local = {1, 1, 1};
+  std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
+int BatchToSpaceNDOpenCLKernel::Prepare() {
+  std::string kernel_name = "batch_to_space_nd_NHWC4";
 
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -59,58 +93,25 @@ int BatchToSpaceNDOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+
+  SetGlobalLocal();
+  SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
 int BatchToSpaceNDOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);
-
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
-  cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height() * out_tensors_[0]->Batch(), 1};
-  std::vector<int> out_shape = out_tensors_[0]->shape();
-  cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
-  cl_int2 block_size = {param->block_shape_[0], param->block_shape_[1]};
-  cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};
-  std::vector<size_t> local = {1, 1, 1};
-  std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
-  int arg_cn = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, block_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, paddings);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLBatchToSpaceNDKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                      const std::vector<lite::Tensor *> &outputs,
-                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                      const kernel::KernelKey &desc,
-                                                      const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) BatchToSpaceNDOpenCLKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpaceND, OpenCLBatchToSpaceNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpaceND, OpenCLBatchToSpaceNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpace, OpenCLBatchToSpaceNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpace, OpenCLBatchToSpaceNDKernelCreator);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpaceND, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpaceND, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpace, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpace, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h
index f710c4d436..6c80c9db96 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h
@@ -31,9 +31,12 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel {
 
   ~BatchToSpaceNDOpenCLKernel() override = default;
 
-  int Init() override;
-
   int Run() override;
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
  private:
   cl::Kernel kernel_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
index 24ec31f732..0de2158999 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
@@ -30,7 +30,35 @@ using mindspore::schema::PrimitiveType_Gather;
 
 namespace mindspore::kernel {
 
-int GatherOpenCLKernel::Init() {
+int GatherOpenCLKernel::CheckSpecs() { return RET_OK; }
+
+void GatherOpenCLKernel::SetConstArgs() {
+  auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
+  param->axis_ = (param->axis_ + in_tensors_[0]->shape().size()) % in_tensors_[0]->shape().size();
+  auto input_shape = in_tensors_[0]->shape();
+  auto output_shape = out_tensors_[0]->shape();
+  int indices_num = in_tensors_[1]->ElementsNum();
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
+  cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
+  cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
+                      (cl_int)out_tensors_[0]->Batch()};
+  int arg_cnt = 3;
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, param->axis_);
+}
+
+void GatherOpenCLKernel::SetGlobalLocal() {
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  std::vector<size_t> local = {1, 1, 1};
+  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
+                                (size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
+int GatherOpenCLKernel::Prepare() {
   std::string kernel_name = "gather_NHWC4";
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -41,7 +69,15 @@ int GatherOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
-  // init indices_data_
+
+  InitWeights();
+  SetGlobalLocal();
+  SetConstArgs();
+  MS_LOG(DEBUG) << kernel_name << " Init Done!";
+  return RET_OK;
+}
+
+int GatherOpenCLKernel::InitWeights() {
   auto indices_tensor = in_tensors_.at(1);
   int indices_num = indices_tensor->ElementsNum();
   bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
@@ -53,11 +89,10 @@ int GatherOpenCLKernel::Init() {
       return RET_ERROR;
     }
   }
-  MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
-int GatherOpenCLKernel::InitWeights() {
+int GatherOpenCLKernel::UpdateWeights() {
   auto indices_tensor = in_tensors_.at(1);
   int indices_num = indices_tensor->ElementsNum();
   bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
@@ -86,55 +121,20 @@ int GatherOpenCLKernel::InitWeights() {
 
 int GatherOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
-  if (InitWeights() != RET_OK) {
+  if (UpdateWeights() != RET_OK) {
     return RET_ERROR;
   }
-  auto input_shape = in_tensors_[0]->shape();
-  auto output_shape = out_tensors_[0]->shape();
-  int indices_num = in_tensors_[1]->ElementsNum();
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
-  cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
-  cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
-                      (cl_int)out_tensors_[0]->Batch()};
-  std::vector<size_t> local = {1, 1, 1};
-  std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
-                                (size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
-  int arg_cn = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_num);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
-  return RET_OK;
-}
+  ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
 
-kernel::LiteKernel *OpenCLGatherKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                              const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                              const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                              const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) GatherOpenCLKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
+  return RET_OK;
 }
 
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Gather, OpenCLGatherKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Gather, OpenCLGatherKernelCreator);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Gather, OpenCLKernelCreator<GatherOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Gather, OpenCLKernelCreator<GatherOpenCLKernel>);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
index 236c2b6a9c..b1a3f5a04f 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
@@ -31,9 +31,16 @@ class GatherOpenCLKernel : public OpenCLKernel {
 
   ~GatherOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
   int InitWeights() override;
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
+
+ protected:
+  int UpdateWeights();
 
  private:
   cl::Kernel kernel_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
index 9da40a502d..5ef6b4162e 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
@@ -30,12 +30,38 @@ using mindspore::schema::PrimitiveType_Squeeze;
 
 namespace mindspore::kernel {
 
-int ReshapeOpenCLKernel::Init() {
-  std::string kernel_name = "reshape_NHWC4";
+int ReshapeOpenCLKernel::CheckSpecs() {
+  if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
+    return RET_ERROR;
+  }
   if (out_tensors_[0]->shape().size() != 2 && out_tensors_[0]->shape().size() != 4) {
     MS_LOG(ERROR) << "Reshape output size should in 2,4";
     return RET_ERROR;
   }
+  return RET_OK;
+}
+
+void ReshapeOpenCLKernel::SetConstArgs() {
+  auto in = Image2DInfo(in_tensors_.front());
+  auto out = Image2DInfo(out_tensors_.front());
+  cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
+  cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};
+
+  int arg_idx = 2;
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
+}
+
+void ReshapeOpenCLKernel::SetGlobalLocal() {
+  auto out = Image2DInfo(out_tensors_.front());
+  std::vector<size_t> local = {};
+  std::vector<size_t> global{out.width, out.height};
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
+int ReshapeOpenCLKernel::Prepare() {
+  std::string kernel_name = "reshape_NHWC4";
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
 #else
@@ -45,49 +71,23 @@ int ReshapeOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+
+  SetGlobalLocal();
+  SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
 int ReshapeOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running!";
-  auto in = Image2DInfo(in_tensors_.front());
-  auto out = Image2DInfo(out_tensors_.front());
-
-  std::vector<size_t> local = {};
-  std::vector<size_t> global{out.width, out.height};
-  cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
-  cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};
-
-  int arg_idx = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                               const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
-                                               const lite::InnerContext *ctx, const kernel::KernelKey &desc,
-                                               const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) ReshapeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLKernelCreator<ReshapeOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLKernelCreator<ReshapeOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLKernelCreator<ReshapeOpenCLKernel>)
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLKernelCreator<ReshapeOpenCLKernel>)
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
index 31c8e95d07..9c8d97823c 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
@@ -30,8 +30,12 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
       : OpenCLKernel(parameter, inputs, outputs) {}
   ~ReshapeOpenCLKernel() override = default;
 
-  int Init() override;
   int Run() override;
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
  private:
   cl::Kernel kernel_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
index cc3691255d..9fffb80b43 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
@@ -31,8 +31,11 @@ using mindspore::schema::PrimitiveType_SpaceToBatchND;
 
 namespace mindspore::kernel {
 
-int SpaceToBatchNDOpenCLKernel::Init() {
-  std::string kernel_name = "space_to_batch_nd_NHWC4";
+int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
+  if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
+    MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
+    return RET_ERROR;
+  }
   if (in_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 4) {
     MS_LOG(ERROR) << "input/output shape size must be 4, actual: " << in_tensors_[0]->shape().size() << ", "
                   << out_tensors_[0]->shape().size();
@@ -53,6 +56,35 @@ int SpaceToBatchNDOpenCLKernel::Init() {
     MS_LOG(ERROR) << "padded shape must be multiple of block!";
     return RET_ERROR;
   }
+  return RET_OK;
+}
+
+void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
+  auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
+  cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height(), in_tensors_[0]->Batch()};
+  cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
+  cl_int2 block_size = {param->block_sizes_[0], param->block_sizes_[1]};
+  cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};
+
+  int arg_cnt = 2;
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
+  ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
+}
+
+void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
+  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
+  cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
+  std::vector<size_t> local = {1, 1, 1};
+  std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
+  OpenCLKernel::AlignGlobalLocal(global, local);
+}
+
+int SpaceToBatchNDOpenCLKernel::Prepare() {
+  std::string kernel_name = "space_to_batch_nd_NHWC4";
 
 #ifdef PROGRAM_WITH_IL
   kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -64,57 +96,26 @@ int SpaceToBatchNDOpenCLKernel::Init() {
   ocl_runtime_->LoadSource(program_name, source);
   ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
 #endif
+
+  SetGlobalLocal();
+  SetConstArgs();
   MS_LOG(DEBUG) << kernel_name << " Init Done!";
   return RET_OK;
 }
 
 int SpaceToBatchNDOpenCLKernel::Run() {
   MS_LOG(DEBUG) << this->name() << " Running! ";
-  auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);
-  size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
-  size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
-  cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height(), in_tensors_[0]->Batch()};
-  cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
-  cl_int2 block_size = {param->block_sizes_[0], param->block_sizes_[1]};
-  cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};
-  std::vector<size_t> local = {1, 1, 1};
-  std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
-  int arg_cn = 0;
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, block_size);
-  ocl_runtime_->SetKernelArg(kernel_, arg_cn++, paddings);
-  ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
+  ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+  ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
   return RET_OK;
 }
 
-kernel::LiteKernel *OpenCLSpaceToBatchNDKernelCreator(const std::vector<lite::Tensor *> &inputs,
-                                                      const std::vector<lite::Tensor *> &outputs,
-                                                      OpParameter *opParameter, const lite::InnerContext *ctx,
-                                                      const kernel::KernelKey &desc,
-                                                      const mindspore::lite::PrimitiveC *primitive) {
-  auto *kernel = new (std::nothrow) SpaceToBatchNDOpenCLKernel(opParameter, inputs, outputs);
-  if (kernel == nullptr) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
-    free(opParameter);
-    return nullptr;
-  }
-  auto ret = kernel->Init();
-  if (ret != RET_OK) {
-    MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
-    delete kernel;
-    return nullptr;
-  }
-  return kernel;
-}
-
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatchND, OpenCLSpaceToBatchNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatchND, OpenCLSpaceToBatchNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatch, OpenCLSpaceToBatchNDKernelCreator);
-REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatch, OpenCLSpaceToBatchNDKernelCreator);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatchND, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatchND, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatch, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
+REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatch, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
 }  // namespace mindspore::kernel
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h
index 06a4ebb520..46362e4fac 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h
@@ -31,9 +31,12 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel {
 
   ~SpaceToBatchNDOpenCLKernel() override = default;
 
-  int Init() override;
-
   int Run() override;
+  int Prepare() override;
+
+  int CheckSpecs() override;
+  void SetConstArgs() override;
+  void SetGlobalLocal() override;
 
  private:
   cl::Kernel kernel_;
diff --git a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
index 295369daf9..8ab9120485 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
@@ -193,7 +193,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
   auto ret = kernel->CheckSpecs();
   if (ret != mindspore::lite::RET_OK) {
     delete kernel;
-    MS_LOG(ERROR) << "Init " << opParameter->name_ << " failed!";
+    MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
     return nullptr;
   }
   return kernel;
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
index 8aa3f70fa9..c8643c5176 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
@@ -127,14 +127,19 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H2W2Pad2020) {
     67, 103, 252, 35, 114, 30, 29, 241, 33, 146, 17, 221, 84, 253, 2, 69,
     101, 140, 44, 117, 253, 66, 111, 91, 85, 167, 39, 203, 150, 158, 145, 198,
   };
-  float correct_data[] = {88, 81, 165, 25, 85, 48, 49, 69, 77, 72, 9, 148, 169, 163, 192, 95, 115, 208,
-                          243, 197, 197, 94, 0, 113, 237, 139, 252, 86, 218, 178, 108, 3, 205, 121, 109, 75,
-                          31, 9, 138, 27, 184, 16, 152, 157, 173, 199, 167, 61, 243, 29, 147, 147, 205, 112,
-                          231, 149, 142, 167, 32, 193, 201, 127, 0, 138, 9, 185, 127, 32, 114, 43, 186, 127,
-                          189, 83, 161, 104, 232, 36, 0, 203, 160, 228, 251, 251, 34, 197, 126, 181, 121, 70,
-                          213, 31, 254, 80, 190, 136, 183, 28, 34, 128, 123, 195, 82, 174, 128, 164, 53, 133,
-                          227, 148, 209, 50, 38, 232, 244, 17, 155, 14, 41, 58, 182, 207, 11, 166, 116, 36,
-                          176, 25, 111, 93, 249, 129, 67, 103, 252, 35, 223, 118, 44, 216, 114, 30, 29, 241};
+  float correct_data[] = {
+    88, 81, 165, 25, 85, 48, 49, 69, 77, 72, 9, 148, 169, 163, 192, 95, 115, 208, 243, 197, 197, 94,
+    0, 113, 254, 79, 175, 192, 178, 36, 162, 48, 237, 139, 252, 86, 218, 178, 108, 3, 205, 121, 109, 75,
+    31, 9, 138, 27, 184, 16, 152, 157, 173, 199, 167, 61, 149, 110, 25, 208, 85, 97, 44, 34, 243, 29,
+    147, 147, 205, 112, 231, 149, 142, 167, 32, 193, 201, 127, 0, 138, 9, 185, 127, 32, 114, 43, 186, 127,
+    31, 202, 244, 151, 23, 187, 130, 121, 189, 83, 161, 104, 232, 36, 0, 203, 160, 228, 251, 251, 34, 197,
+    126, 181, 121, 70, 213, 31, 254, 80, 190, 136, 13, 71, 184, 152, 189, 129, 209, 112, 183, 28, 34, 128,
+    123, 195, 82, 174, 128, 164, 53, 133, 227, 148, 209, 50, 38, 232, 244, 17, 155, 14, 41, 58, 79, 132,
+    105, 42, 193, 36, 10, 86, 182, 207, 11, 166, 116, 36, 176, 25, 111, 93, 249, 129, 67, 103, 252, 35,
+    223, 118, 44, 216, 114, 30, 29, 241, 125, 24, 67, 210, 33, 146, 17, 221, 65, 231, 169, 57, 51, 80,
+    32, 182, 35, 102, 119, 11, 128, 38, 19, 174, 174, 82, 91, 128, 42, 115, 184, 188, 142, 99, 53, 140,
+    232, 77, 30, 24, 230, 35, 214, 254, 101, 140, 44, 117, 189, 197, 215, 43, 253, 66, 111, 91, 32, 11,
+    104, 212, 85, 167, 39, 203, 138, 182, 235, 165, 150, 158, 145, 198};
   TypeId data_type = kNumberTypeFloat32;
   schema::Format format = schema::Format_NHWC;
   test_main_batch_to_space_nd(input_data, correct_data, input_shape, param, data_type, format);
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
index 0eb9a75d90..cacfec82e1 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc
@@ -34,7 +34,8 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<int>
                       const std::vector<int> &indices, GatherParameter *param, TypeId data_type,
                       schema::Format format) {
   MS_LOG(INFO) << " begin test ";
-  auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
+  auto ocl_wrp = lite::opencl::OpenCLRuntimeWrapper();
+  auto ocl_runtime = ocl_wrp.GetInstance();
   ocl_runtime->Init();
   auto allocator = ocl_runtime->GetAllocator();
@@ -64,7 +65,7 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<int>
   std::vector<kernel::LiteKernel *> kernels{pkernel};
-  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
+  auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&tensor_a}, outputs, kernels, kernels, kernels);
   if (sub_graph == nullptr) {
     delete pkernel;
     MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
@@ -82,6 +83,15 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<int>
   auto *output_data = static_cast<float *>(outputs[0]->data_c());
+  for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
+    std::cout << output_data[i] << " ";
+  }
+  std::cout << std::endl;
+  std::cout << "==================expected data================" << std::endl;
+  for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
+    std::cout << static_cast<float *>(correct_data)[i] << " ";
+  }
+  std::cout << std::endl;
   CommonTest::CompareOutputData(output_data, static_cast<float *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
 }
 
 TEST_F(TestGatherOpenCL, Axis0Fp16) {