Browse Source

!8211 optimize opencl runtime

Merge pull request !8211 from wandongdong/master
tags/v1.1.0
mindspore-ci-bot Gitee 5 years ago
parent
commit
2ed8ed69e4
13 changed files with 227 additions and 193 deletions
  1. +2
    -2
      mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl
  2. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl
  3. +46
    -45
      mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc
  4. +5
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h
  5. +47
    -47
      mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc
  6. +8
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h
  7. +38
    -38
      mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc
  8. +5
    -1
      mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h
  9. +44
    -43
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
  10. +5
    -2
      mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h
  11. +1
    -1
      mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  12. +13
    -8
      mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc
  13. +12
    -2
      mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc

+ 2
- 2
mindspore/lite/src/runtime/kernel/opencl/cl/gather.cl View File

@@ -1,6 +1,6 @@
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
__kernel void gather_NHWC4(__read_only image2d_t src_data, __global int *indices, __write_only image2d_t dst_data,
__kernel void gather_NHWC4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
int4 src_size, int4 dst_size, int indices_num, int axis) {
int X = get_global_id(0); // w
int Y = get_global_id(1); // n*h
@@ -41,7 +41,7 @@ __kernel void gather_NHWC4(__read_only image2d_t src_data, __global int *indices
WRITE_IMAGE(dst_data, (int2)(X * dst_size.z + Z, batch * dst_size.y + height), res_data);
}

__kernel void gather_NC4HW4(__read_only image2d_t src_data, __global int *indices, __write_only image2d_t dst_data,
__kernel void gather_NC4HW4(__write_only image2d_t dst_data, __read_only image2d_t src_data, __global int *indices,
int4 src_size, int4 dst_size, int indices_num, int axis) {
int X = get_global_id(0); // w
int Y = get_global_id(1); // n*h


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/cl/reshape.cl View File

@@ -27,7 +27,7 @@ __kernel void reshape_NHWC4(__read_only image2d_t src_data, __write_only image2d
int start = ((X / CO4 * dst_size.z + min(dst_size.z, (X % CO4) * C4NUM)) + dst_size.w * Y);
gcnt = start / src_size.x * CI4 + (start % src_size.x) / C4NUM;
start = start % src_size.x % C4NUM;
for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {
for (int i = 0, n = 0, j = start; i < C4NUM; ++n, j = 0) {
int X_src = (gcnt + n) % in_img_x;
res = READ_IMAGE(src_data, smp_zero, (int2)(X_src, (gcnt + n) / in_img_x));
tmp[0] = res.x;


+ 46
- 45
mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.cc View File

@@ -31,8 +31,11 @@ using mindspore::schema::PrimitiveType_BatchToSpaceND;

namespace mindspore::kernel {

int BatchToSpaceNDOpenCLKernel::Init() {
std::string kernel_name = "batch_to_space_nd_NHWC4";
int BatchToSpaceNDOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}
if (in_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << "input/output shape size must be 4, actual: " << in_tensors_[0]->shape().size() << ", "
<< out_tensors_[0]->shape().size();
@@ -48,6 +51,37 @@ int BatchToSpaceNDOpenCLKernel::Init() {
MS_LOG(ERROR) << "crop shape error!";
return RET_ERROR;
}
return RET_OK;
}

void BatchToSpaceNDOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height() * out_tensors_[0]->Batch(), 1};
std::vector<int> out_shape = out_tensors_[0]->shape();
cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
cl_int2 block_size = {param->block_shape_[0], param->block_shape_[1]};
cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};

int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
}

void BatchToSpaceNDOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
std::vector<int> out_shape = out_tensors_[0]->shape();
cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global, local);
}

int BatchToSpaceNDOpenCLKernel::Prepare() {
std::string kernel_name = "batch_to_space_nd_NHWC4";

#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -59,58 +93,25 @@ int BatchToSpaceNDOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif

SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

int BatchToSpaceNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<BatchToSpaceParameter *>(this->op_parameter_);

size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height() * out_tensors_[0]->Batch(), 1};
std::vector<int> out_shape = out_tensors_[0]->shape();
cl_int4 dst_size = {(cl_int)CO4, out_shape[2], out_shape[1], out_shape[0]};
cl_int2 block_size = {param->block_shape_[0], param->block_shape_[1]};
cl_int4 paddings = {param->crops_[0], param->crops_[1], param->crops_[2], param->crops_[3]};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, paddings);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

return RET_OK;
}

kernel::LiteKernel *OpenCLBatchToSpaceNDKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) BatchToSpaceNDOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
delete kernel;
return nullptr;
}
return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpaceND, OpenCLBatchToSpaceNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpaceND, OpenCLBatchToSpaceNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpace, OpenCLBatchToSpaceNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpace, OpenCLBatchToSpaceNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpaceND, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpaceND, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_BatchToSpace, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_BatchToSpace, OpenCLKernelCreator<BatchToSpaceNDOpenCLKernel>);

} // namespace mindspore::kernel

+ 5
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/batch_to_space_nd.h View File

@@ -31,9 +31,12 @@ class BatchToSpaceNDOpenCLKernel : public OpenCLKernel {

~BatchToSpaceNDOpenCLKernel() override = default;

int Init() override;

int Run() override;
int Prepare() override;

int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;

private:
cl::Kernel kernel_;


+ 47
- 47
mindspore/lite/src/runtime/kernel/opencl/kernel/gather.cc View File

@@ -30,7 +30,35 @@ using mindspore::schema::PrimitiveType_Gather;

namespace mindspore::kernel {

int GatherOpenCLKernel::Init() {
int GatherOpenCLKernel::CheckSpecs() { return RET_OK; }

void GatherOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);
param->axis_ = (param->axis_ + in_tensors_[0]->shape().size()) % in_tensors_[0]->shape().size();
auto input_shape = in_tensors_[0]->shape();
auto output_shape = out_tensors_[0]->shape();
int indices_num = in_tensors_[1]->ElementsNum();
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
(cl_int)out_tensors_[0]->Batch()};
int arg_cnt = 3;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, indices_num);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, param->axis_);
}

void GatherOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
(size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
OpenCLKernel::AlignGlobalLocal(global, local);
}

int GatherOpenCLKernel::Prepare() {
std::string kernel_name = "gather_NHWC4";
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -41,7 +69,15 @@ int GatherOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif
// init indices_data_

InitWeights();
SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

int GatherOpenCLKernel::InitWeights() {
auto indices_tensor = in_tensors_.at(1);
int indices_num = indices_tensor->ElementsNum();
bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
@@ -53,11 +89,10 @@ int GatherOpenCLKernel::Init() {
return RET_ERROR;
}
}
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

int GatherOpenCLKernel::InitWeights() {
int GatherOpenCLKernel::UpdateWeights() {
auto indices_tensor = in_tensors_.at(1);
int indices_num = indices_tensor->ElementsNum();
bool isIndicesInt32 = indices_tensor->data_type() == kNumberTypeInt32;
@@ -86,55 +121,20 @@ int GatherOpenCLKernel::InitWeights() {

int GatherOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<GatherParameter *>(this->op_parameter_);

if (InitWeights() != RET_OK) {
if (UpdateWeights() != RET_OK) {
return RET_ERROR;
}
auto input_shape = in_tensors_[0]->shape();
auto output_shape = out_tensors_[0]->shape();
int indices_num = in_tensors_[1]->ElementsNum();
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {in_tensors_[0]->Width(), in_tensors_[0]->Height(), (cl_int)CI4, in_tensors_[0]->Batch()};
cl_int4 dst_size = {(cl_int)out_tensors_[0]->Width(), (cl_int)out_tensors_[0]->Height(), (cl_int)CO4,
(cl_int)out_tensors_[0]->Batch()};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)out_tensors_[0]->Width(),
(size_t)out_tensors_[0]->Batch() * (size_t)out_tensors_[0]->Height(), CO4};
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_data_, lite::opencl::MemType::BUF);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, indices_num);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, param->axis_);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);

return RET_OK;
}
ocl_runtime_->SetKernelArg(kernel_, 0, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 1, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 2, indices_data_, lite::opencl::MemType::BUF);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

kernel::LiteKernel *OpenCLGatherKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) GatherOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
delete kernel;
return nullptr;
}
return kernel;
return RET_OK;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Gather, OpenCLGatherKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Gather, OpenCLGatherKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Gather, OpenCLKernelCreator<GatherOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Gather, OpenCLKernelCreator<GatherOpenCLKernel>);

} // namespace mindspore::kernel

+ 8
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/gather.h View File

@@ -31,9 +31,16 @@ class GatherOpenCLKernel : public OpenCLKernel {

~GatherOpenCLKernel() override = default;

int Init() override;
int Run() override;
int InitWeights() override;
int Prepare() override;

int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;

protected:
int UpdateWeights();

private:
cl::Kernel kernel_;


+ 38
- 38
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.cc View File

@@ -30,12 +30,38 @@ using mindspore::schema::PrimitiveType_Squeeze;

namespace mindspore::kernel {

int ReshapeOpenCLKernel::Init() {
std::string kernel_name = "reshape_NHWC4";
int ReshapeOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}
if (out_tensors_[0]->shape().size() != 2 && out_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << "Reshape output size should in 2,4";
return RET_ERROR;
}
return RET_OK;
}

void ReshapeOpenCLKernel::SetConstArgs() {
auto in = Image2DInfo(in_tensors_.front());
auto out = Image2DInfo(out_tensors_.front());
cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};

int arg_idx = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
}

void ReshapeOpenCLKernel::SetGlobalLocal() {
auto out = Image2DInfo(out_tensors_.front());
std::vector<size_t> local = {};
std::vector<size_t> global{out.width, out.height};
OpenCLKernel::AlignGlobalLocal(global, local);
}

int ReshapeOpenCLKernel::Prepare() {
std::string kernel_name = "reshape_NHWC4";
#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
#else
@@ -45,49 +71,23 @@ int ReshapeOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif

SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

int ReshapeOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!";
auto in = Image2DInfo(in_tensors_.front());
auto out = Image2DInfo(out_tensors_.front());

std::vector<size_t> local = {};
std::vector<size_t> global{out.width, out.height};
cl_int4 src_size = {cl_int(in.C), cl_int(in.W), cl_int(in.H), cl_int(in.N)};
cl_int4 dst_size = {cl_int(out.width), cl_int(out.height), cl_int(out.C), cl_int(out.C * out.W)};

int arg_idx = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, out_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_idx++, dst_size);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c());
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c());
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);
return RET_OK;
}

kernel::LiteKernel *OpenCLReshapeKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs, OpParameter *opParameter,
const lite::InnerContext *ctx, const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) ReshapeOpenCLKernel(reinterpret_cast<OpParameter *>(opParameter), inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "kernel " << opParameter->name_ << " create failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
delete kernel;
return nullptr;
}
return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLReshapeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLReshapeKernelCreator)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Reshape, OpenCLKernelCreator<ReshapeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Reshape, OpenCLKernelCreator<ReshapeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_Squeeze, OpenCLKernelCreator<ReshapeOpenCLKernel>)
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_Squeeze, OpenCLKernelCreator<ReshapeOpenCLKernel>)
} // namespace mindspore::kernel

+ 5
- 1
mindspore/lite/src/runtime/kernel/opencl/kernel/reshape.h View File

@@ -30,8 +30,12 @@ class ReshapeOpenCLKernel : public OpenCLKernel {
: OpenCLKernel(parameter, inputs, outputs) {}
~ReshapeOpenCLKernel() override = default;

int Init() override;
int Run() override;
int Prepare() override;

int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;

private:
cl::Kernel kernel_;


+ 44
- 43
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc View File

@@ -31,8 +31,11 @@ using mindspore::schema::PrimitiveType_SpaceToBatchND;

namespace mindspore::kernel {

int SpaceToBatchNDOpenCLKernel::Init() {
std::string kernel_name = "space_to_batch_nd_NHWC4";
int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
if (in_tensors_[0]->data_type() != kNumberTypeFloat32 && in_tensors_[0]->data_type() != kNumberTypeFloat16) {
MS_LOG(ERROR) << "Unsupported data type " << in_tensors_[0]->data_type();
return RET_ERROR;
}
if (in_tensors_[0]->shape().size() != 4 && out_tensors_[0]->shape().size() != 4) {
MS_LOG(ERROR) << "input/output shape size must be 4, actual: " << in_tensors_[0]->shape().size() << ", "
<< out_tensors_[0]->shape().size();
@@ -53,6 +56,35 @@ int SpaceToBatchNDOpenCLKernel::Init() {
MS_LOG(ERROR) << "padded shape must be multiple of block!";
return RET_ERROR;
}
return RET_OK;
}

void SpaceToBatchNDOpenCLKernel::SetConstArgs() {
auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height(), in_tensors_[0]->Batch()};
cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
cl_int2 block_size = {param->block_sizes_[0], param->block_sizes_[1]};
cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};

int arg_cnt = 2;
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cnt++, paddings);
}

void SpaceToBatchNDOpenCLKernel::SetGlobalLocal() {
size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
OpenCLKernel::AlignGlobalLocal(global, local);
}

int SpaceToBatchNDOpenCLKernel::Prepare() {
std::string kernel_name = "space_to_batch_nd_NHWC4";

#ifdef PROGRAM_WITH_IL
kernel_ = ocl_runtime_->GetKernelFromBinary(kernel_name);
@@ -64,57 +96,26 @@ int SpaceToBatchNDOpenCLKernel::Init() {
ocl_runtime_->LoadSource(program_name, source);
ocl_runtime_->BuildKernel(kernel_, program_name, kernel_name, build_options);
#endif

SetGlobalLocal();
SetConstArgs();
MS_LOG(DEBUG) << kernel_name << " Init Done!";
return RET_OK;
}

int SpaceToBatchNDOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running! ";
auto param = reinterpret_cast<SpaceToBatchParameter *>(this->op_parameter_);

size_t CO4 = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t CI4 = UP_DIV(in_tensors_[0]->Channel(), C4NUM);
cl_int4 src_size = {(cl_int)CI4, in_tensors_[0]->Width(), in_tensors_[0]->Height(), in_tensors_[0]->Batch()};
cl_int4 dst_size = {(cl_int)CO4, out_tensors_[0]->Width(), out_tensors_[0]->Height(), out_tensors_[0]->Batch()};
cl_int2 block_size = {param->block_sizes_[0], param->block_sizes_[1]};
cl_int4 paddings = {param->paddings_[0], param->paddings_[1], param->paddings_[2], param->paddings_[3]};
std::vector<size_t> local = {1, 1, 1};
std::vector<size_t> global = {(size_t)dst_size.s[0], (size_t)dst_size.s[1], (size_t)dst_size.s[2]};
int arg_cn = 0;
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, src_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, dst_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, block_size);
ocl_runtime_->SetKernelArg(kernel_, arg_cn++, paddings);
ocl_runtime_->RunKernel(kernel_, global, local, nullptr);
ocl_runtime_->SetKernelArg(kernel_, 0, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->SetKernelArg(kernel_, 1, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr);

return RET_OK;
}

kernel::LiteKernel *OpenCLSpaceToBatchNDKernelCreator(const std::vector<lite::Tensor *> &inputs,
const std::vector<lite::Tensor *> &outputs,
OpParameter *opParameter, const lite::InnerContext *ctx,
const kernel::KernelKey &desc,
const mindspore::lite::PrimitiveC *primitive) {
auto *kernel = new (std::nothrow) SpaceToBatchNDOpenCLKernel(opParameter, inputs, outputs);
if (kernel == nullptr) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " new failed.";
free(opParameter);
return nullptr;
}
auto ret = kernel->Init();
if (ret != RET_OK) {
MS_LOG(ERROR) << "Kernel " << opParameter->name_ << " init failed.";
delete kernel;
return nullptr;
}
return kernel;
}

REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatchND, OpenCLSpaceToBatchNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatchND, OpenCLSpaceToBatchNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatch, OpenCLSpaceToBatchNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatch, OpenCLSpaceToBatchNDKernelCreator);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatchND, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatchND, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat32, PrimitiveType_SpaceToBatch, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);
REG_KERNEL(kGPU, kNumberTypeFloat16, PrimitiveType_SpaceToBatch, OpenCLKernelCreator<SpaceToBatchNDOpenCLKernel>);

} // namespace mindspore::kernel

+ 5
- 2
mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.h View File

@@ -31,9 +31,12 @@ class SpaceToBatchNDOpenCLKernel : public OpenCLKernel {

~SpaceToBatchNDOpenCLKernel() override = default;

int Init() override;

int Run() override;
int Prepare() override;

int CheckSpecs() override;
void SetConstArgs() override;
void SetGlobalLocal() override;

private:
cl::Kernel kernel_;


+ 1
- 1
mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h View File

@@ -193,7 +193,7 @@ kernel::LiteKernel *OpenCLKernelCreator(const std::vector<lite::Tensor *> &input
auto ret = kernel->CheckSpecs();
if (ret != mindspore::lite::RET_OK) {
delete kernel;
MS_LOG(ERROR) << "Init " << opParameter->name_ << " failed!";
MS_LOG(ERROR) << "Check " << opParameter->name_ << " specification failed!";
return nullptr;
}
return kernel;


+ 13
- 8
mindspore/lite/test/ut/src/runtime/kernel/opencl/batch_to_space_nd_tests.cc View File

@@ -127,14 +127,19 @@ TEST_F(TestBatchToSpaceNDOpenCL, NHWC4H2W2Pad2020) {
67, 103, 252, 35, 114, 30, 29, 241, 33, 146, 17, 221, 84, 253, 2, 69, 101, 140, 44, 117, 253, 66, 111,
91, 85, 167, 39, 203, 150, 158, 145, 198,
};
float correct_data[] = {88, 81, 165, 25, 85, 48, 49, 69, 77, 72, 9, 148, 169, 163, 192, 95, 115, 208,
243, 197, 197, 94, 0, 113, 237, 139, 252, 86, 218, 178, 108, 3, 205, 121, 109, 75,
31, 9, 138, 27, 184, 16, 152, 157, 173, 199, 167, 61, 243, 29, 147, 147, 205, 112,
231, 149, 142, 167, 32, 193, 201, 127, 0, 138, 9, 185, 127, 32, 114, 43, 186, 127,
189, 83, 161, 104, 232, 36, 0, 203, 160, 228, 251, 251, 34, 197, 126, 181, 121, 70,
213, 31, 254, 80, 190, 136, 183, 28, 34, 128, 123, 195, 82, 174, 128, 164, 53, 133,
227, 148, 209, 50, 38, 232, 244, 17, 155, 14, 41, 58, 182, 207, 11, 166, 116, 36,
176, 25, 111, 93, 249, 129, 67, 103, 252, 35, 223, 118, 44, 216, 114, 30, 29, 241};
float correct_data[] = {
88, 81, 165, 25, 85, 48, 49, 69, 77, 72, 9, 148, 169, 163, 192, 95, 115, 208, 243, 197, 197, 94,
0, 113, 254, 79, 175, 192, 178, 36, 162, 48, 237, 139, 252, 86, 218, 178, 108, 3, 205, 121, 109, 75,
31, 9, 138, 27, 184, 16, 152, 157, 173, 199, 167, 61, 149, 110, 25, 208, 85, 97, 44, 34, 243, 29,
147, 147, 205, 112, 231, 149, 142, 167, 32, 193, 201, 127, 0, 138, 9, 185, 127, 32, 114, 43, 186, 127,
31, 202, 244, 151, 23, 187, 130, 121, 189, 83, 161, 104, 232, 36, 0, 203, 160, 228, 251, 251, 34, 197,
126, 181, 121, 70, 213, 31, 254, 80, 190, 136, 13, 71, 184, 152, 189, 129, 209, 112, 183, 28, 34, 128,
123, 195, 82, 174, 128, 164, 53, 133, 227, 148, 209, 50, 38, 232, 244, 17, 155, 14, 41, 58, 79, 132,
105, 42, 193, 36, 10, 86, 182, 207, 11, 166, 116, 36, 176, 25, 111, 93, 249, 129, 67, 103, 252, 35,
223, 118, 44, 216, 114, 30, 29, 241, 125, 24, 67, 210, 33, 146, 17, 221, 65, 231, 169, 57, 51, 80,
32, 182, 35, 102, 119, 11, 128, 38, 19, 174, 174, 82, 91, 128, 42, 115, 184, 188, 142, 99, 53, 140,
232, 77, 30, 24, 230, 35, 214, 254, 101, 140, 44, 117, 189, 197, 215, 43, 253, 66, 111, 91, 32, 11,
104, 212, 85, 167, 39, 203, 138, 182, 235, 165, 150, 158, 145, 198};
TypeId data_type = kNumberTypeFloat32;
schema::Format format = schema::Format_NHWC;
test_main_batch_to_space_nd<float>(input_data, correct_data, input_shape, param, data_type, format);


+ 12
- 2
mindspore/lite/test/ut/src/runtime/kernel/opencl/gather_tests.cc View File

@@ -34,7 +34,8 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<in
const std::vector<int> &indices, GatherParameter *param, TypeId data_type,
schema::Format format) {
MS_LOG(INFO) << " begin test ";
auto ocl_runtime = lite::opencl::OpenCLRuntimeWrapper().GetInstance();
auto ocl_wrp = lite::opencl::OpenCLRuntimeWrapper();
auto ocl_runtime = ocl_wrp.GetInstance();
ocl_runtime->Init();
auto allocator = ocl_runtime->GetAllocator();

@@ -64,7 +65,7 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<in

MS_LOG(INFO) << " initialize sub_graph ";
std::vector<kernel::LiteKernel *> kernels{pkernel};
auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel(inputs, outputs, kernels, kernels, kernels);
auto *sub_graph = new (std::nothrow) kernel::SubGraphOpenCLKernel({&tensor_a}, outputs, kernels, kernels, kernels);
if (sub_graph == nullptr) {
delete pkernel;
MS_LOG(INFO) << " new SubGraphOpenCLKernel failed ";
@@ -82,6 +83,15 @@ void test_main_gather(void *input_data, void *correct_data, const std::vector<in

std::cout << "==================output data================" << std::endl;
auto *output_data = reinterpret_cast<T *>(outputs[0]->data_c());
for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
std::cout << output_data[i] << " ";
}
std::cout << std::endl;
std::cout << "==================expected data================" << std::endl;
for (size_t i = 0; i < outputs[0]->ElementsNum(); ++i) {
std::cout << static_cast<T *>(correct_data)[i] << " ";
}
std::cout << std::endl;
CommonTest::CompareOutputData(output_data, static_cast<T *>(correct_data), outputs[0]->ElementsNum(), 0.0001);
}
TEST_F(TestGatherOpenCL, Axis0Fp16) {


Loading…
Cancel
Save