
!8858 【MSLITE】code clean

From: @wangdongxu6
Reviewed-by: @ddwsky,@HilbertDavid
Signed-off-by: @ddwsky
tags/v1.1.0
mindspore-ci-bot (Gitee), 5 years ago
commit 2d89e7f306
16 changed files with 207 additions and 104 deletions
  1. +4  -4   mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl
  2. +2  -0   mindspore/lite/src/runtime/kernel/opencl/cl/space_to_depth.cl
  3. +1  -1   mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl
  4. +2  -0   mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc
  5. +2  -0   mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc
  6. +1  -0   mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc
  7. +16 -10  mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h
  8. +64 -32  mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc
  9. +12 -13  mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h
  10. +7  -1   mindspore/lite/src/runtime/kernel/opencl/utils.cc
  11. +11 -0   mindspore/lite/src/runtime/kernel/opencl/utils.h
  12. +31 -8   mindspore/lite/src/runtime/opencl/opencl_allocator.cc
  13. +3  -14  mindspore/lite/src/runtime/opencl/opencl_allocator.h
  14. +17 -10  mindspore/lite/src/runtime/opencl/opencl_executor.cc
  15. +3  -3   mindspore/lite/src/runtime/opencl/opencl_executor.h
  16. +31 -8   mindspore/lite/src/runtime/opencl/opencl_runtime.cc

+4 -4   mindspore/lite/src/runtime/kernel/opencl/cl/conv2d.cl

@@ -67,7 +67,7 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
     }
   }

-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
   }

@@ -135,7 +135,7 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
     }
   }

-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];
   }

@@ -224,7 +224,7 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
     }
   }

-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];
     out_h0_w0_c1 += bias[co_slice1];

@@ -357,7 +357,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
     }
   }

-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h0_w1_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];


+2 -0   mindspore/lite/src/runtime/kernel/opencl/cl/space_to_depth.cl

@@ -10,6 +10,7 @@ __kernel void SpaceToDepth(__read_only image2d_t src_data, __write_only image2d_
   int Y = get_global_id(1);  // W
   int Z = get_global_id(2);  // H * N
   if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return;
+  if (out_shape.y == 0 || ci_size == 0 || block_size == 0) return;
   int N = Z / out_shape.y;
   int H = Z % out_shape.y;
   int co_base = X * C4NUM;

@@ -43,6 +44,7 @@ __kernel void SpaceToDepthAlign(__read_only image2d_t src_data, __write_only ima
   int Y = get_global_id(1);  // W
   int Z = get_global_id(2);  // H * N
   if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return;
+  if (out_shape.y == 0 || in_shape.w == 0 || block_size == 0) return;

   int N = Z / out_shape.y;
   int H = Z % out_shape.y;


+1 -1   mindspore/lite/src/runtime/kernel/opencl/cl/winograd.cl

@@ -171,7 +171,7 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
     acc += AtM_row[y] * At[idx];
   }

-  if (bias) {
+  if (bias != 0) {
     acc += bias[slice];
   }




+2 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/arithmetic_self.cc

@@ -42,6 +42,8 @@ using mindspore::schema::PrimitiveType_Square;
 namespace mindspore::kernel {

 void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param) {
+  MS_ASSERT(kernel_name);
+  MS_ASSERT(param);
   switch (param->op_parameter_.type_) {
     case PrimitiveType_Abs:
       kernel_name[0] += "_ElementAbs";
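Many of the hunks in this commit add MS_ASSERT guards on pointer parameters before they are dereferenced. As a rough illustration only (the real MS_ASSERT is defined elsewhere in the MindSpore sources and may differ), a debug-only assert macro of this kind usually compiles down to nothing in release builds, so the checks cost nothing in shipped binaries:

// Hypothetical stand-in for a macro such as MS_ASSERT; names here are illustrative.
#include <cassert>
#include <string>

#ifndef NDEBUG
#define MY_ASSERT(cond) assert(cond)  // debug build: abort if the condition is false
#else
#define MY_ASSERT(cond) ((void)0)     // release build: expands to a no-op
#endif

// Mirrors the shape of GetKernelName above: validate the out-parameter before use.
static void AppendKernelSuffix(std::string *kernel_name, int op_type) {
  MY_ASSERT(kernel_name != nullptr);
  if (op_type == 0) {
    kernel_name[0] += "_ElementAbs";
  }
}

int main() {
  std::string name = "ArithmeticSelf";
  AppendKernelSuffix(&name, 0);
  return 0;
}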


+2 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/space_to_batch_nd.cc

@@ -51,6 +51,8 @@ int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "block_sizes_ must > 1, actual " << param->block_sizes_[0] << ", " << param->block_sizes_[1];
     return RET_ERROR;
   }
+  MS_ASSERT(param->block_sizes_[0]);
+  MS_ASSERT(param->block_sizes_[1]);
   if (param->padded_in_shape_[kNHWC_H] % param->block_sizes_[0] ||
       param->padded_in_shape_[kNHWC_W] % param->block_sizes_[1]) {
     MS_LOG(ERROR) << "padded shape must be multiple of block!";


+1 -0   mindspore/lite/src/runtime/kernel/opencl/kernel/strided_slice.cc

@@ -169,6 +169,7 @@ void StridedSliceOpenCLKernel::SetGlobalLocal() {
   const int max_divider = 8;
   auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
   size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
+  local_c = std::max<size_t>(local_c, 1);
   size_t local_hw = max_work_group_size / local_c;
   size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
   size_t local_w = std::min(local_hw / local_h, global[1]);
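The strided_slice change clamps the computed local size to at least 1 before it is used as a divisor. A minimal sketch of that pattern, with illustrative names rather than the project's helpers:

#include <algorithm>
#include <cstddef>
#include <iostream>

// A divisor-search helper in the spirit of GetMaxDivisorStrategy0 can return 0
// when no divisor up to the cap evenly divides x; clamping to 1 keeps the
// subsequent division well defined.
size_t LargestDivisorUpTo(size_t x, size_t cap) {
  for (size_t d = cap; d >= 2; --d) {
    if (x % d == 0) return d;
  }
  return 0;  // "no divisor found" case
}

int main() {
  size_t global_c = 13;               // e.g. a prime channel-slice count
  size_t max_work_group_size = 256;
  size_t local_c = LargestDivisorUpTo(global_c, 8);
  local_c = std::max<size_t>(local_c, 1);           // the guard added in the diff
  size_t local_hw = max_work_group_size / local_c;  // safe: local_c >= 1
  std::cout << local_c << " " << local_hw << "\n";
  return 0;
}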


+16 -10  mindspore/lite/src/runtime/kernel/opencl/opencl_kernel.h

@@ -36,6 +36,8 @@ struct OpenCLToFormatParameter {

 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
+  MS_ASSERT(dst);
+  MS_ASSERT(src);
   auto *N = dst;
   auto *H = dst + 1;
   auto *W = dst + 2;

@@ -54,13 +56,15 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
     *H = src[1];
     *W = src[2];
     *C = src[3];
-  } else if (src_num >= 5) {
+  } else if (src_num > 4) {
     MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
   }
 }

 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
+  MS_ASSERT(dst);
+  MS_ASSERT(src);
   for (int i = 0; i < 4; ++i) {
     dst[i] = default_value;
   }

@@ -101,6 +105,7 @@ struct GpuTensorInfo {
   size_t RowPitch() const {
     auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
     int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
+    MS_ASSERT(alignment);
     size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
     return row_pitch;
   }

@@ -143,31 +148,31 @@ class OpenCLKernel : public LiteKernel {
   int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
     std::vector<size_t> internal_global_ws = global;
     for (size_t i = 0; i < local.size(); ++i) {
-      internal_global_ws[i] = UP_ROUND(global[i], local[i]);
+      internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
     }

     MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
     for (size_t i = 0; i < global.size(); i++) {
-      MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
+      MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
     }
     for (size_t i = 0; i < local.size(); i++) {
-      MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
+      MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
     }

     if (global.size() == 1) {
-      global_range_ = cl::NDRange(internal_global_ws[0]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0]);
+        local_range_ = cl::NDRange(local.at(0));
       }
     } else if (global.size() == 2) {
-      global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0], local[1]);
+        local_range_ = cl::NDRange(local.at(0), local.at(1));
       }
     } else if (global.size() == 3) {
-      global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0], local[1], local[2]);
+        local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
       }
     } else {
       MS_LOG(ERROR) << "Not supported NDRange!";

@@ -191,6 +196,7 @@ class OpenCLKernel : public LiteKernel {
     return RET_ERROR;
   }
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+    MS_ASSERT(img_size);
     if (idx >= out_tensors_.size()) {
       return RET_ERROR;
     }
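The AlignGlobalLocal hunk switches plain operator[] indexing to the bounds-checked std::vector::at. A small standalone example of the difference, using only the standard library:

#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

int main() {
  std::vector<size_t> local = {16, 8};
  // operator[] performs no range check: local[2] here would be undefined behaviour.
  // at() throws std::out_of_range instead, which is much easier to diagnose.
  try {
    size_t v = local.at(2);
    std::cout << v << "\n";
  } catch (const std::out_of_range &e) {
    std::cout << "out of range: " << e.what() << "\n";
  }
  return 0;
}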


+64 -32  mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.cc

@@ -28,11 +28,12 @@ using mindspore::lite::opencl::MemType;

 SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }

-int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
+void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
   const std::vector<lite::Tensor *> &in_tensors, const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
   MemType mem_type) {
   for (size_t i = 0; i < in_tensors.size(); ++i) {
     for (auto &jv : in_kernels.at(i)) {
+      MS_ASSERT(jv);
       auto tensors = (mem_type == MemType::IMG) ? jv->in_tensors() : jv->out_tensors();
       auto ft = std::find_if(tensors.begin(), tensors.end(),
                              [&in_tensors, &i](lite::Tensor *kv) { return kv == in_tensors.at(i); });

@@ -43,6 +44,7 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
       std::replace_if(
         kernels.begin(), kernels.end(),
         [this, &in_tensors, &i](kernel::LiteKernel *kv) {
+          MS_ASSERT(kv);
           return std::find_if(kv->in_tensors().begin(), kv->in_tensors().end(),
                               [&in_tensors, &i](lite::Tensor *xv) { return xv == in_tensors.at(i); }) !=
                    kv->in_tensors().end() &&

@@ -58,14 +60,16 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
       }
     }
   }
-  return RET_OK;
 }
-int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
-                                                             const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                             lite::Tensor *new_tensor,
-                                                             kernel::LiteKernel *in_convert_op, MemType mem_type) {
+
+void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
+                                                              const std::vector<kernel::LiteKernel *> &in_kernels,
+                                                              lite::Tensor *new_tensor,
+                                                              kernel::LiteKernel *in_convert_op, MemType mem_type) {
+  MS_ASSERT(in_convert_op);
   auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op);
   for (auto &iv : in_kernels) {
+    MS_ASSERT(iv);
     auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels();
     auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == nullptr; });
     if (fk != kernels.end()) {

@@ -90,13 +94,16 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor
       in_convert_op->AddInKernel(iv);
     }
   }
-  return RET_OK;
 }
+
 int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
                                         const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
                                         std::vector<lite::Tensor *> *out_tensors,
                                         std::vector<OpenCLToFormatParameter *> *out_parameters,
                                         std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
+  MS_ASSERT(out_tensors);
+  MS_ASSERT(out_parameters);
+  MS_ASSERT(out_convert_ops);
   out_tensors->clear();
   out_parameters->clear();
   out_convert_ops->clear();

@@ -167,6 +174,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     // replace in_tensor of inner kernel which use out tensor
     if (mem_type == MemType::BUF) {
       for (auto &iv : loop_kernels[i]) {
+        MS_ASSERT(iv);
         auto tensors = iv->in_tensors();
         auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i));
         if (jv != tensors.end()) {

@@ -185,9 +193,11 @@ int SubGraphOpenCLKernel::Init() {
   allocator_ = ocl_runtime_->GetAllocator();
   MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
   for (const auto tensor : in_tensors_) {
+    MS_ASSERT(tensor);
     tensor->set_allocator(allocator_);
   }
   for (const auto tensor : out_tensors_) {
+    MS_ASSERT(tensor);
     tensor->set_allocator(allocator_);
   }

@@ -223,72 +233,83 @@ int SubGraphOpenCLKernel::Init() {
   return RET_OK;
 }

-int SubGraphOpenCLKernel::UpdateTensorDataType() {
+void SubGraphOpenCLKernel::UpdateTensorDataType() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
+  MS_ASSERT(in_tensors_[0]);
   if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
     std::set<lite::Tensor *> out_set;
     out_set.insert(in_tensors_.begin(), in_tensors_.end());
     out_set.insert(out_tensors_.begin(), out_tensors_.end());
     for (auto iv : nodes_) {
+      MS_ASSERT(iv);
       auto cur_outs = iv->out_tensors();
       for (auto jv : cur_outs) {
         if (out_set.count(jv) == 0) {
+          MS_ASSERT(jv);
           jv->set_data_type(kNumberTypeFloat16);
         }
       }
     }
   }
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::MallocTensorWithReuse() {
+  int ret;
   kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
   for (auto *kernel : nodes_) {
-    MS_ASSERT(nullptr != kernel);
+    MS_ASSERT(kernel);
     auto *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
     auto outputs = kernel->out_tensors();
     for (auto i = 0; i < outputs.size(); ++i) {
       auto *output = outputs.at(i);
-      MS_ASSERT(nullptr != output);
+      MS_ASSERT(output);
       if (op_kernel->GetMemType() == MemType::IMG) {
         std::vector<size_t> img_size;
-        op_kernel->GetImageSize(i, &img_size);
+        ret = op_kernel->GetImageSize(i, &img_size);
+        if (ret != RET_OK) {
+          MS_LOG(WARNING) << "GetImageSize failed";
+        }
         auto data_ptr = allocator_->Malloc(output->Size(), img_size);
         output->set_data(data_ptr);
       } else {
-        output->MallocData(allocator_);
+        ret = output->MallocData(allocator_);
+        if (ret != RET_OK) {
+          MS_LOG(WARNING) << "MallocData failed";
+        }
       }
       output->set_allocator(allocator_);
     }
     for (auto input_kernel : kernel->in_kernels()) {
-      MS_ASSERT(nullptr != input_kernel);
-      auto ret = input_kernel->DecOutTensorRefCount();
-      if (0 != ret) {
+      MS_ASSERT(input_kernel);
+      ret = input_kernel->DecOutTensorRefCount();
+      if (ret != RET_OK) {
        MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
      }
    }
  }
  for (auto kernel : out_kernels_) {
-    MS_ASSERT(nullptr != kernel);
-    auto ret = kernel->DecOutTensorRefCount();
-    if (0 != ret) {
+    MS_ASSERT(kernel);
+    ret = kernel->DecOutTensorRefCount();
+    if (ret != RET_OK) {
      MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
    }
  }
  return RET_OK;
 }

-int SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
-                                                const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                std::vector<std::vector<kernel::LiteKernel *>> *out_kernels,
-                                                bool is_from) {
+void SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                                                 const std::vector<kernel::LiteKernel *> &in_kernels,
+                                                 std::vector<std::vector<kernel::LiteKernel *>> *out_kernels,
+                                                 bool is_from) {
   std::vector<std::set<lite::Tensor *>> ksets;
   for (auto jv : in_kernels) {
+    MS_ASSERT(jv);
     auto tens = is_from ? jv->in_tensors() : jv->out_tensors();
     std::set<lite::Tensor *> kset;
     kset.insert(tens.begin(), tens.end());
     ksets.emplace_back(kset);
   }
+  MS_ASSERT(out_kernels);
   for (auto in_tensor : in_tensors) {
     std::vector<kernel::LiteKernel *> kvec;
     for (size_t j = 0; j < in_kernels.size(); ++j) {

@@ -298,13 +319,13 @@ int SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *
     }
     out_kernels->emplace_back(kvec);
   }
-  return RET_OK;
 }

-int SubGraphOpenCLKernel::GetInOutNodes() {
+void SubGraphOpenCLKernel::GetInOutNodes() {
   std::vector<std::set<lite::Tensor *>> ksets_in;
   std::vector<std::set<lite::Tensor *>> ksets_out;
   for (auto jv : nodes_) {
+    MS_ASSERT(jv);
     std::set<lite::Tensor *> kset;
     kset.insert(jv->in_tensors().begin(), jv->in_tensors().end());
     ksets_in.emplace_back(kset);

@@ -323,10 +344,15 @@ int SubGraphOpenCLKernel::GetInOutNodes() {
       out_nodes_.emplace_back(nodes_.at(j));
     }
   }
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::Prepare() {
+  executor_ = new (std::nothrow) lite::opencl::OpenCLExecutor();
+  if (executor_ == nullptr) {
+    MS_LOG(ERROR) << "Create OpenCLExecutor fail";
+    return RET_ERROR;
+  }
+
   auto ret = Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "OpenCL subgraph init fail";

@@ -335,7 +361,7 @@ int SubGraphOpenCLKernel::Prepare() {
   return RET_OK;
 }

-int SubGraphOpenCLKernel::UnInit() {
+void SubGraphOpenCLKernel::UnInit() {
   for (const auto &tensor : in_convert_tensors_) {
     delete tensor;
   }

@@ -351,7 +377,6 @@ int SubGraphOpenCLKernel::UnInit() {
   in_convert_ops_.clear();
   out_convert_ops_.clear();
   delete this->executor_;
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::InferShape() { return RET_OK; }

@@ -363,21 +388,28 @@ int SubGraphOpenCLKernel::Run() {
     MS_LOG(ERROR) << "executor is nullptr";
     return RET_ERROR;
   }
+  int ret;
   for (auto &tensor : in_tensors_) {
+    MS_ASSERT(tensor);
     if (tensor->data_c() == nullptr) {
       MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
       return RET_ERROR;
     }
-    allocator_->UnmapBuffer(tensor->data_c());
+    ret = allocator_->UnmapBuffer(tensor->data_c());
+    if (ret != RET_OK) {
+      return ret;
+    }
   }

-  auto ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_);
-  if (RET_OK != ret) {
+  ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_);
+  if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
     return ret;
   }
-  ocl_runtime_->SyncCommandQueue();
+  if (!ocl_runtime_->SyncCommandQueue()) {
+    return RET_ERROR;
+  }
   return RET_OK;
 }
 }  // namespace mindspore::kernel
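Prepare now allocates the executor with new (std::nothrow) and checks for nullptr, rather than allocating it unconditionally in the constructor. A minimal sketch of that pattern, using placeholder types instead of the real OpenCLExecutor:

#include <iostream>
#include <new>

struct Executor {        // placeholder for the real executor type
  int Run() { return 0; }
};

int Prepare(Executor **out) {
  // new (std::nothrow) returns nullptr on failure instead of throwing std::bad_alloc,
  // so the error can be reported through the usual int return-code path.
  *out = new (std::nothrow) Executor();
  if (*out == nullptr) {
    std::cerr << "Create executor fail\n";
    return -1;  // stands in for RET_ERROR
  }
  return 0;     // stands in for RET_OK
}

int main() {
  Executor *executor = nullptr;
  if (Prepare(&executor) == 0) {
    executor->Run();
  }
  delete executor;
  return 0;
}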

+12 -13  mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h

@@ -41,7 +41,6 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
     subgraph_type_ = kGpuSubGraph;
     this->name_ = "GpuSubGraph";
-    this->executor_ = new lite::opencl::OpenCLExecutor();
     nodes_set_.insert(nodes.begin(), nodes.end());
   }
   ~SubGraphOpenCLKernel() override;

@@ -56,23 +55,23 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
   int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };

  private:
-  int UnInit();
-  int UpdateTensorDataType();
+  void UnInit();
+  void UpdateTensorDataType();
   int MallocTensorWithReuse();
-  int ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
-                                      const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
-                                      lite::opencl::MemType mem_type);
-  int ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
-                                         const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor,
-                                         kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type);
-  int GetInOutNodes();
+  void ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
+                                       const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
+                                       lite::opencl::MemType mem_type);
+  void ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
+                                          const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor,
+                                          kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type);
+  void GetInOutNodes();
   int GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
                     const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
                     std::vector<lite::Tensor *> *out_tensors, std::vector<OpenCLToFormatParameter *> *out_parameters,
                     std::vector<LiteKernel *> *out_convert_ops, lite::opencl::MemType mem_type);
-  int GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
-                            const std::vector<kernel::LiteKernel *> &in_kernels,
-                            std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
+  void GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                             const std::vector<kernel::LiteKernel *> &in_kernels,
+                             std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
   lite::opencl::OpenCLAllocator *allocator_{nullptr};
   std::vector<lite::Tensor *> in_convert_tensors_;
   std::vector<lite::Tensor *> out_convert_tensors_;


+7 -1   mindspore/lite/src/runtime/kernel/opencl/utils.cc

@@ -87,6 +87,7 @@ int GetMaxDivisorStrategy1(int x, int divisor) {
 }

 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
+  MS_ASSERT(local.size() == global.size() && local.size() == 3);
   std::vector<size_t> result(3);
   for (int i = 0; i < 3; ++i) {
     result[i] = UP_ROUND(global[i], local[i]);

@@ -95,6 +96,7 @@ std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const
 }

 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
+  MS_ASSERT(global.size() == 3);
   size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
   if (local_z == 0) {
     MS_LOG(ERROR) << "Divide by zero";

@@ -239,6 +241,7 @@ std::string CLErrorCode(cl_int error_code) {
 }

 int WriteToBin(const std::string &file_path, void *data, size_t size) {
+  MS_ASSERT(data);
   std::ofstream out_file;

   out_file.open(file_path.c_str(), std::ios::binary);

@@ -256,7 +259,7 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) {
 }

 void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) {
-  if (tensor->data_c() == nullptr) {
+  if (tensor == nullptr || tensor->data_c() == nullptr) {
     return;
   }

@@ -305,6 +308,9 @@ void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std:
 }

 void PrintKernelOutput(OpenCLKernel *kernel, int n, const std::string &out_file) {
+  if (kernel == nullptr) {
+    return;
+  }
   printf("%-30s", kernel->name().c_str());
   if (!kernel->out_tensors().empty()) {
     PrintTensor(kernel->out_tensors()[0], kernel->GetMemType(), n, out_file);


+11 -0   mindspore/lite/src/runtime/kernel/opencl/utils.h

@@ -61,6 +61,8 @@ std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape

 template <class T1, class T2>
 void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
     int src_offset = b * plane * channel;

@@ -81,6 +83,8 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c

 template <class T1, class T2>
 void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   int nhwc4_batch_unit_offset = c4 * C4NUM * plane;
   int ic_remainder_ = channel % C4NUM;

@@ -106,6 +110,8 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co

 template <class T1, class T2>
 void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
     int src_oc_offset = b * plane * channel;

@@ -142,6 +148,11 @@ std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
 template <typename SRC_T, typename DST_T>
 void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
                              const size_t CI_TILE = 4, const size_t CO_TILE = 4) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
+  MS_ASSERT(CI_TILE);
+  MS_ASSERT(CO_TILE);
+  MS_ASSERT(OGroup);
   if (CO_TILE == 0 || CI_TILE == 0) return;
   auto origin_weight = reinterpret_cast<SRC_T *>(src);
   auto packed_weight = reinterpret_cast<DST_T *>(dst);


+31 -8   mindspore/lite/src/runtime/opencl/opencl_allocator.cc

@@ -66,6 +66,7 @@ void *OpenCLAllocator::MinimumFit(size_t size, const std::vector<size_t> &img_si

 void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
   cl_int ret = CL_SUCCESS;
+  MS_ASSERT(buffer);
   *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), flags, size, data, &ret);
   if (*buffer == nullptr) {
     MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";

@@ -78,7 +79,11 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B
     return nullptr;
   }
   cl::Memory *mem = *buffer;
-  ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+  MS_ASSERT(mem);
+  ret = ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+  if (ret != RET_OK) {
+    MS_LOG(WARNING) << "UnmapBuffer failed.";
+  }
   return host_ptr;
 }

@@ -110,7 +115,10 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img
       return nullptr;
     }
     cl::Memory *mem = *image;
-    ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+    ret = ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "UnmapBuffer failed.";
+    }
   }
   return host_ptr;
 }

@@ -131,7 +139,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
   }
   Lock();
   void *host_ptr = MinimumFit(size, img_size);
-  if ((host_ptr != nullptr) && (data == nullptr)) {
+  if (host_ptr != nullptr && data == nullptr) {
     UnLock();
     return host_ptr;
   }

@@ -188,7 +196,10 @@ void OpenCLAllocator::Free(void *buf) {
   auto iter = allocated_list_.find(buf);
   if (iter != allocated_list_.end()) {
     if (iter->second->map_flags) {
-      UnmapBuffer(buf);
+      int ret = UnmapBuffer(buf);
+      if (ret != RET_OK) {
+        MS_LOG(WARNING) << "UnmapBuffer failed.";
+      }
       iter->second->map_flags = false;
     }
     auto mem_buf = iter->second;

@@ -240,7 +251,10 @@ void OpenCLAllocator::Clear() {
   auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
   for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
     if (it->second->map_flags) {
-      UnmapBuffer(it->second->host_ptr_);
+      int ret = UnmapBuffer(it->second->host_ptr_);
+      if (ret != RET_OK) {
+        MS_LOG(WARNING) << "UnmapBuffer failed.";
+      }
     }
     if (svm_capabilities) {
       clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);

@@ -295,7 +309,11 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
       MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << host_ptr;
       return nullptr;
     }
-    ocl_runtime_->MapBuffer(host_ptr, flags, it->second->size_, static_cast<cl::CommandQueue *>(command_queue), sync);
+    int ret = ocl_runtime_->MapBuffer(host_ptr, flags, it->second->size_,
+                                      static_cast<cl::CommandQueue *>(command_queue), sync);
+    if (ret != RET_OK) {
+      MS_LOG(WARNING) << "MapBuffer failed.";
+    }
   }
   return host_ptr;
 }

@@ -313,14 +331,16 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
     return host_ptr;
   }
   MemBuf *mem_buf = it->second;
-  void *new_host_ptr{nullptr};
+  MS_ASSERT(mem_buf);
+  void *new_host_ptr;
   if (mem_buf->img_size.empty()) {
     cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_);
+    MS_ASSERT(buffer);
     new_host_ptr = ocl_runtime_->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync);
   } else {
-    cl::ImageFormat image_format(CL_RGBA, mem_buf->img_size[2]);
     std::vector<size_t> region{mem_buf->img_size[0], mem_buf->img_size[1], 1};
     cl::Image2D *image = static_cast<cl::Image2D *>(mem_buf->image_ptr_);
+    MS_ASSERT(image);
     new_host_ptr = ocl_runtime_->MapBuffer(*image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
   }
   if (new_host_ptr == nullptr) {

@@ -373,6 +393,7 @@ MemType OpenCLAllocator::GetMemType(void *host_ptr) {
     return mem_type;
   }
   MemBuf *mem_buf = it->second;
+  MS_ASSERT(mem_buf);
   if (mem_buf->img_size.empty()) {
     mem_type = MemType::BUF;
   } else {

@@ -383,6 +404,7 @@ MemType OpenCLAllocator::GetMemType(void *host_ptr) {
 }

 int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t> *img_size) {
+  MS_ASSERT(img_size);
   Lock();
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {

@@ -391,6 +413,7 @@ int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t> *img_size)
     return RET_OK;
   }
   MemBuf *mem_buf = it->second;
+  MS_ASSERT(mem_buf);
   if (!mem_buf->img_size.empty()) {
     *img_size = mem_buf->img_size;
   }


+3 -14   mindspore/lite/src/runtime/opencl/opencl_allocator.h

@@ -14,8 +14,8 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
-#define MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_

 #include <memory>
 #include <string>

@@ -29,17 +29,6 @@

 namespace mindspore::lite::opencl {

-#define MS_HOST_BUFFER 0
-#define MS_CL_BUFFER (1 << 1)
-#define MS_CL_IMAGE2D (1 << 2)
-typedef int32_t OpenCLMemoryType;
-
-struct OpenclMemory {
-  void *host_ptr{nullptr};
-  void *device_ptr{nullptr};
-  OpenCLMemoryType mem_type{MS_HOST_BUFFER | MS_CL_BUFFER};
-};
-
 class OpenCLRuntime;
 enum class MemType : char { BUF, IMG };

@@ -95,4 +84,4 @@ class OpenCLAllocator : public Allocator {

 }  // namespace mindspore::lite::opencl

-#endif  // MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_

+17 -10  mindspore/lite/src/runtime/opencl/opencl_executor.cc

@@ -21,14 +21,13 @@

 namespace mindspore::lite::opencl {

-int OpenCLExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { return RET_OK; }
-
 int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
                         std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before,
                         const KernelCallBack &after) {
+  int ret;
   kernel::LiteKernelUtil::InitTensorRefCount(kernels);
   for (auto *kernel : kernels) {
-    MS_ASSERT(nullptr != kernel);
+    MS_ASSERT(kernel);
     CallBackParam callbackParam;
     callbackParam.node_name = kernel->name();

@@ -41,19 +40,27 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
     auto cur_outputs = kernel->out_tensors();
     for (auto i = 0; i < cur_outputs.size(); ++i) {
       auto *output = cur_outputs.at(i);
-      MS_ASSERT(nullptr != output);
+      MS_ASSERT(output);
       if (op_kernel->GetMemType() == lite::opencl::MemType::IMG) {
         std::vector<size_t> img_size;
-        op_kernel->GetImageSize(i, &img_size);
+        ret = op_kernel->GetImageSize(i, &img_size);
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "GetImageSize failed";
+          return ret;
+        }
         auto data_ptr = allocator_->Malloc(output->Size(), img_size);
         output->set_data(data_ptr);
       } else {
-        output->MallocData(allocator_);
+        ret = output->MallocData(allocator_);
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "MallocData failed";
+          return ret;
+        }
       }
     }

-    auto ret = kernel->Run();
-    if (0 != ret) {
+    ret = kernel->Run();
+    if (ret != RET_OK) {
      MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
      return ret;
    }

@@ -64,9 +71,9 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
      }
    }
    for (auto input_kernel : kernel->in_kernels()) {
-      MS_ASSERT(nullptr != input_kernel);
+      MS_ASSERT(input_kernel);
       ret = input_kernel->DecOutTensorRefCount();
-      if (0 != ret) {
+      if (ret != RET_OK) {
        MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
      }
    }


+3 -3   mindspore/lite/src/runtime/opencl/opencl_executor.h

@@ -14,8 +14,8 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_OPENCL_EXECUTOR_H_
-#define MINDSPORE_LITE_SRC_OPENCL_EXECUTOR_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_OPENCL_EXECUTOR_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_OPENCL_EXECUTOR_H_

 #include <vector>
 #include "src/runtime/opencl/opencl_runtime.h"

@@ -29,7 +29,7 @@ class OpenCLExecutor : public Executor {
  public:
   OpenCLExecutor() : Executor() { allocator_ = ocl_runtime.GetInstance()->GetAllocator(); }

-  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override;
+  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override { return RET_OK; }

   int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels,
           Allocator *allocator = nullptr, const KernelCallBack &before = nullptr,


+31 -8   mindspore/lite/src/runtime/opencl/opencl_runtime.cc

@@ -100,8 +100,14 @@ int OpenCLRuntime::Init() {
   std::vector<cl::Device> devices;
   for (auto it = platforms.begin(); it != platforms.end(); ++it) {
     std::string platform_name;
-    it->getInfo(CL_PLATFORM_NAME, &platform_name);
-    it->getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    ret = it->getInfo(CL_PLATFORM_NAME, &platform_name);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << CLErrorCode(ret);
+    }
+    ret = it->getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << CLErrorCode(ret);
+    }
     MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";

     if (devices.size() > 0) {

@@ -178,9 +184,18 @@ int OpenCLRuntime::Init() {
   }

   // get cache size, compute units and frequency.
-  device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
-  device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
-  device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
+  ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
+  ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
+  ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
   cl_device_fp_config fp_config;
   auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
   support_fp16_ = CL_SUCCESS == success && fp_config > 0;

@@ -281,7 +296,9 @@ uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }
 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t max_workgroup_size = 0;
   int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
-  if (ret != 0) max_workgroup_size = 0;
+  if (ret != CL_SUCCESS) {
+    max_workgroup_size = 0;
+  }
   return max_workgroup_size;
 }

@@ -421,7 +438,10 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t>
   static int cnt = 0;
   const int flush_period = 10;
   if (cnt % flush_period == 0) {
-    command_queue->flush();
+    auto flush_ret = command_queue->flush();
+    if (flush_ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
+    }
   }
   cnt++;
   MS_LOG(DEBUG) << "RunKernel success!";

@@ -454,7 +474,10 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global
   static int cnt = 0;
   const int flush_period = 10;
   if (cnt % flush_period == 0) {
-    command_queue->flush();
+    auto flush_ret = command_queue->flush();
+    if (flush_ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(ret);
+    }
   }
   cnt++;
   MS_LOG(DEBUG) << "RunKernel success!";
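The runtime hunks stop discarding the cl_int status returned by the OpenCL C++ wrapper calls (getInfo, getDevices, flush) and log a warning on failure. A stripped-down sketch of the same check-and-warn pattern; this assumes the Khronos OpenCL C++ bindings (CL/opencl.hpp) and an OpenCL ICD are installed, and is not the project's own code:

#define CL_HPP_TARGET_OPENCL_VERSION 200
#include <CL/opencl.hpp>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<cl::Platform> platforms;
  cl_int ret = cl::Platform::get(&platforms);
  if (ret != CL_SUCCESS) {
    std::cerr << "Platform::get failed: " << ret << "\n";
    return 1;
  }
  for (auto &platform : platforms) {
    std::string name;
    ret = platform.getInfo(CL_PLATFORM_NAME, &name);
    if (ret != CL_SUCCESS) {
      std::cerr << "getInfo failed: " << ret << "\n";  // warn and continue, as in the diff
      continue;
    }
    std::vector<cl::Device> devices;
    ret = platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    if (ret != CL_SUCCESS) {
      std::cerr << "getDevices failed: " << ret << "\n";
      continue;
    }
    std::cout << name << " has " << devices.size() << " GPU(s)\n";
  }
  return 0;
}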

