From: @wangdongxu6
Reviewed-by: @ddwsky, @HilbertDavid
Signed-off-by: @ddwsky
@@ -67,7 +67,7 @@ __kernel void Conv2D_H1W1C1(__read_only image2d_t input, __write_only image2d_t
     }
   }
-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
   }
@@ -135,7 +135,7 @@ __kernel void Conv2D_H2W1C1(__read_only image2d_t input, __write_only image2d_t
     }
   }
-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];
   }
@@ -224,7 +224,7 @@ __kernel void Conv2D_H2W1C2(__read_only image2d_t input, __write_only image2d_t
     }
   }
-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];
     out_h0_w0_c1 += bias[co_slice1];
@@ -357,7 +357,7 @@ __kernel void Conv2D_H2W2C2(__read_only image2d_t input, __write_only image2d_t
     }
   }
-  if (bias) {
+  if (bias != 0) {
     out_h0_w0_c0 += bias[co_slice0];
     out_h0_w1_c0 += bias[co_slice0];
     out_h1_w0_c0 += bias[co_slice0];
@@ -10,6 +10,7 @@ __kernel void SpaceToDepth(__read_only image2d_t src_data, __write_only image2d_
   int Y = get_global_id(1);  // W
   int Z = get_global_id(2);  // H * N
   if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return;
+  if (out_shape.y == 0 || ci_size == 0 || block_size == 0) return;
   int N = Z / out_shape.y;
   int H = Z % out_shape.y;
   int co_base = X * C4NUM;
@@ -43,6 +44,7 @@ __kernel void SpaceToDepthAlign(__read_only image2d_t src_data, __write_only ima
   int Y = get_global_id(1);  // W
   int Z = get_global_id(2);  // H * N
   if (X >= out_shape.w || Y >= out_shape.z || Z >= out_shape.x * out_shape.y) return;
+  if (out_shape.y == 0 || in_shape.w == 0 || block_size == 0) return;
   int N = Z / out_shape.y;
   int H = Z % out_shape.y;
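Both SpaceToDepth hunks insert the new guard immediately before the divisions that follow: `Z / out_shape.y` and `Z % out_shape.y` are undefined when the divisor is zero, in OpenCL C just as in C. A host-side sketch of the same guard pattern, with hypothetical names:

```cpp
#include <optional>
#include <utility>

// Hypothetical mirror of the kernel guard above: decompose a flat index
// into (N, H) only when the divisor is known to be non-zero, since integer
// division and modulo by zero are undefined behavior.
std::optional<std::pair<int, int>> SplitIndex(int z, int out_h) {
  if (out_h == 0) {
    return std::nullopt;  // plays the role of the kernel's early `return`
  }
  return std::make_pair(z / out_h, z % out_h);  // (N, H)
}
```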
@@ -171,7 +171,7 @@ __kernel void Winograd36To4x4(__read_only image2d_t input, __write_only image2d_
     acc += AtM_row[y] * At[idx];
   }
-  if (bias) {
+  if (bias != 0) {
     acc += bias[slice];
   }
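The Conv2D and Winograd hunks above all make the same mechanical change: the implicit pointer test `if (bias)` becomes the explicit `if (bias != 0)`. For a pointer the two forms are identical; a minimal sketch (hypothetical `AddBias` helper, not from this patch):

```cpp
// Hypothetical helper: for a pointer parameter the two guards compile to the
// same null test; `bias != 0` is just the explicit spelling preferred by the
// project's style checks.
void AddBias(float *out, const float *bias, int idx) {
  if (bias != 0) {  // same as `if (bias)` or `if (bias != nullptr)`
    *out += bias[idx];
  }
}
```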
@@ -42,6 +42,8 @@ using mindspore::schema::PrimitiveType_Square;
 namespace mindspore::kernel {

 void ArithmeticSelfOpenCLKernel::GetKernelName(std::string *kernel_name, ArithmeticSelfParameter *param) {
+  MS_ASSERT(kernel_name);
+  MS_ASSERT(param);
   switch (param->op_parameter_.type_) {
     case PrimitiveType_Abs:
       kernel_name[0] += "_ElementAbs";
@@ -51,6 +51,8 @@ int SpaceToBatchNDOpenCLKernel::CheckSpecs() {
     MS_LOG(ERROR) << "block_sizes_ must > 1, actual " << param->block_sizes_[0] << ", " << param->block_sizes_[1];
     return RET_ERROR;
   }
+  MS_ASSERT(param->block_sizes_[0]);
+  MS_ASSERT(param->block_sizes_[1]);
   if (param->padded_in_shape_[kNHWC_H] % param->block_sizes_[0] ||
       param->padded_in_shape_[kNHWC_W] % param->block_sizes_[1]) {
     MS_LOG(ERROR) << "padded shape must be multiple of block!";
@@ -169,6 +169,7 @@ void StridedSliceOpenCLKernel::SetGlobalLocal() {
   const int max_divider = 8;
   auto max_work_group_size = ocl_runtime_->DeviceMaxWorkGroupSize();
   size_t local_c = GetMaxDivisorStrategy0(global[2], max_divider);
+  local_c = std::max<size_t>(local_c, 1);
   size_t local_hw = max_work_group_size / local_c;
   size_t local_h = std::min(UP_DIV(global[0], 2), local_hw);
   size_t local_w = std::min(local_hw / local_h, global[1]);
@@ -36,6 +36,8 @@ struct OpenCLToFormatParameter {

 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
+  MS_ASSERT(dst);
+  MS_ASSERT(src);
   auto *N = dst;
   auto *H = dst + 1;
   auto *W = dst + 2;
@@ -54,13 +56,15 @@ void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num) {
     *H = src[1];
     *W = src[2];
     *C = src[3];
-  } else if (src_num >= 5) {
+  } else if (src_num > 4) {
     MS_LOG(ERROR) << "GPU doesn't support ndim>=" << src_num;
   }
 }

 template <typename SrcT, typename DstT>
 void Broadcast2GpuShape(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
+  MS_ASSERT(dst);
+  MS_ASSERT(src);
   for (int i = 0; i < 4; ++i) {
     dst[i] = default_value;
   }
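For context, the four-argument `Broadcast2GpuShape` overload pre-fills all four GPU-shape slots with `default_value` before the broadcast proper. A condensed sketch of the pattern, assuming the overload then delegates to the three-argument version (which this hunk does not show):

```cpp
#include <cassert>

// Condensed sketch, not the file's verbatim code; names with "Sketch" are
// illustrative only.
template <typename SrcT, typename DstT>
void Broadcast2GpuShapeSketch(DstT *dst, const SrcT *src, int src_num, DstT default_value) {
  assert(dst != nullptr && src != nullptr);  // the intent of the new MS_ASSERTs
  for (int i = 0; i < 4; ++i) {
    dst[i] = default_value;  // e.g. 1, so absent dims act as size-1 axes
  }
  // ...the three-argument overload then fills the N/H/W/C slots it knows.
  (void)src_num;
}
```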
@@ -101,6 +105,7 @@ struct GpuTensorInfo {
   size_t RowPitch() const {
     auto runtime_wrapper = lite::opencl::OpenCLRuntimeWrapper();
     int alignment = runtime_wrapper.GetInstance()->GetImagePitchAlignment();
+    MS_ASSERT(alignment);
     size_t row_pitch = UP_ROUND(width, alignment) * FLT4_size;
     return row_pitch;
   }
@@ -143,31 +148,31 @@ class OpenCLKernel : public LiteKernel {
   int AlignGlobalLocal(const std::vector<size_t> &global, const std::vector<size_t> &local) {
     std::vector<size_t> internal_global_ws = global;
     for (size_t i = 0; i < local.size(); ++i) {
-      internal_global_ws[i] = UP_ROUND(global[i], local[i]);
+      internal_global_ws.at(i) = UP_ROUND(global.at(i), local.at(i));
     }
     MS_LOG(DEBUG) << "global size: " << global.size() << ", local size: " << local.size();
     for (size_t i = 0; i < global.size(); i++) {
-      MS_LOG(DEBUG) << "global[" << i << "] = " << global[i];
+      MS_LOG(DEBUG) << "global[" << i << "] = " << global.at(i);
     }
     for (size_t i = 0; i < local.size(); i++) {
-      MS_LOG(DEBUG) << "local[" << i << "] = " << local[i];
+      MS_LOG(DEBUG) << "local[" << i << "] = " << local.at(i);
     }
     if (global.size() == 1) {
-      global_range_ = cl::NDRange(internal_global_ws[0]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0]);
+        local_range_ = cl::NDRange(local.at(0));
       }
     } else if (global.size() == 2) {
-      global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0], local[1]);
+        local_range_ = cl::NDRange(local.at(0), local.at(1));
      }
     } else if (global.size() == 3) {
-      global_range_ = cl::NDRange(internal_global_ws[0], internal_global_ws[1], internal_global_ws[2]);
+      global_range_ = cl::NDRange(internal_global_ws.at(0), internal_global_ws.at(1), internal_global_ws.at(2));
       if (!local.empty()) {
-        local_range_ = cl::NDRange(local[0], local[1], local[2]);
+        local_range_ = cl::NDRange(local.at(0), local.at(1), local.at(2));
       }
     } else {
       MS_LOG(ERROR) << "Not supported NDRange!";
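The `[]` → `.at()` switch in `AlignGlobalLocal` trades unchecked indexing for bounds-checked access: `std::vector::at` throws `std::out_of_range` instead of invoking undefined behavior, e.g. if `local` were ever longer than `global`. A minimal illustration:

```cpp
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <vector>

int main() {
  std::vector<size_t> global = {64, 64};
  try {
    std::cout << global.at(2) << '\n';  // global[2] would be undefined behavior
  } catch (const std::out_of_range &e) {
    std::cout << "caught: " << e.what() << '\n';  // error is reported, not UB
  }
  return 0;
}
```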
@@ -191,6 +196,7 @@ class OpenCLKernel : public LiteKernel {
     return RET_ERROR;
   }
   int GetImageSize(size_t idx, std::vector<size_t> *img_size) {
+    MS_ASSERT(img_size);
     if (idx >= out_tensors_.size()) {
       return RET_ERROR;
     }
@@ -28,11 +28,12 @@ using mindspore::lite::opencl::MemType;
 SubGraphOpenCLKernel::~SubGraphOpenCLKernel() { UnInit(); }

-int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
+void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
   const std::vector<lite::Tensor *> &in_tensors, const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
   MemType mem_type) {
   for (size_t i = 0; i < in_tensors.size(); ++i) {
     for (auto &jv : in_kernels.at(i)) {
+      MS_ASSERT(jv);
       auto tensors = (mem_type == MemType::IMG) ? jv->in_tensors() : jv->out_tensors();
       auto ft = std::find_if(tensors.begin(), tensors.end(),
                              [&in_tensors, &i](lite::Tensor *kv) { return kv == in_tensors.at(i); });
@@ -43,6 +44,7 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
       std::replace_if(
         kernels.begin(), kernels.end(),
         [this, &in_tensors, &i](kernel::LiteKernel *kv) {
+          MS_ASSERT(kv);
           return std::find_if(kv->in_tensors().begin(), kv->in_tensors().end(),
                               [&in_tensors, &i](lite::Tensor *xv) { return xv == in_tensors.at(i); }) !=
                    kv->in_tensors().end() &&
@@ -58,14 +60,16 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToNull(
       }
     }
   }
-  return RET_OK;
 }

-int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
-                                                             const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                             lite::Tensor *new_tensor,
-                                                             kernel::LiteKernel *in_convert_op, MemType mem_type) {
+void SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
+                                                              const std::vector<kernel::LiteKernel *> &in_kernels,
+                                                              lite::Tensor *new_tensor,
+                                                              kernel::LiteKernel *in_convert_op, MemType mem_type) {
+  MS_ASSERT(in_convert_op);
   auto in_opencl_op = reinterpret_cast<OpenCLKernel *>(in_convert_op);
   for (auto &iv : in_kernels) {
+    MS_ASSERT(iv);
     auto kernels = (mem_type == MemType::IMG) ? iv->in_kernels() : iv->out_kernels();
     auto fk = std::find_if(kernels.begin(), kernels.end(), [&](kernel::LiteKernel *kv) { return kv == nullptr; });
     if (fk != kernels.end()) {
@@ -90,13 +94,16 @@ int SubGraphOpenCLKernel::ReplaceOutTensorAndKernelToConvert(const lite::Tensor
       in_convert_op->AddInKernel(iv);
     }
   }
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
                                         const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
                                         std::vector<lite::Tensor *> *out_tensors,
                                         std::vector<OpenCLToFormatParameter *> *out_parameters,
                                         std::vector<LiteKernel *> *out_convert_ops, MemType mem_type) {
+  MS_ASSERT(out_tensors);
+  MS_ASSERT(out_parameters);
+  MS_ASSERT(out_convert_ops);
   out_tensors->clear();
   out_parameters->clear();
   out_convert_ops->clear();
@@ -167,6 +174,7 @@ int SubGraphOpenCLKernel::GenToFormatOp(const std::vector<lite::Tensor *> &in_te
     // replace in_tensor of inner kernel which use out tensor
     if (mem_type == MemType::BUF) {
       for (auto &iv : loop_kernels[i]) {
+        MS_ASSERT(iv);
         auto tensors = iv->in_tensors();
         auto jv = std::find(tensors.begin(), tensors.end(), in_tensors.at(i));
         if (jv != tensors.end()) {
@@ -185,9 +193,11 @@ int SubGraphOpenCLKernel::Init() {
   allocator_ = ocl_runtime_->GetAllocator();
   MS_LOG(DEBUG) << "input num=" << in_tensors_.size() << ", output num=" << out_tensors_.size();
   for (const auto tensor : in_tensors_) {
+    MS_ASSERT(tensor);
     tensor->set_allocator(allocator_);
   }
   for (const auto tensor : out_tensors_) {
+    MS_ASSERT(tensor);
     tensor->set_allocator(allocator_);
   }
@@ -223,72 +233,83 @@ int SubGraphOpenCLKernel::Init() {
   return RET_OK;
 }

-int SubGraphOpenCLKernel::UpdateTensorDataType() {
+void SubGraphOpenCLKernel::UpdateTensorDataType() {
   bool is_fp16 = ocl_runtime_->GetFp16Enable();
+  MS_ASSERT(in_tensors_[0]);
   if (is_fp16 && (in_tensors_[0]->data_type() == kNumberTypeFloat32)) {
     std::set<lite::Tensor *> out_set;
     out_set.insert(in_tensors_.begin(), in_tensors_.end());
     out_set.insert(out_tensors_.begin(), out_tensors_.end());
     for (auto iv : nodes_) {
+      MS_ASSERT(iv);
       auto cur_outs = iv->out_tensors();
       for (auto jv : cur_outs) {
         if (out_set.count(jv) == 0) {
+          MS_ASSERT(jv);
           jv->set_data_type(kNumberTypeFloat16);
         }
       }
     }
   }
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::MallocTensorWithReuse() {
+  int ret;
   kernel::LiteKernelUtil::InitTensorRefCount(nodes_);
   for (auto *kernel : nodes_) {
-    MS_ASSERT(nullptr != kernel);
+    MS_ASSERT(kernel);
     auto *op_kernel = reinterpret_cast<kernel::OpenCLKernel *>(kernel);
     auto outputs = kernel->out_tensors();
     for (auto i = 0; i < outputs.size(); ++i) {
       auto *output = outputs.at(i);
-      MS_ASSERT(nullptr != output);
+      MS_ASSERT(output);
       if (op_kernel->GetMemType() == MemType::IMG) {
         std::vector<size_t> img_size;
-        op_kernel->GetImageSize(i, &img_size);
+        ret = op_kernel->GetImageSize(i, &img_size);
+        if (ret != RET_OK) {
+          MS_LOG(WARNING) << "GetImageSize failed";
+        }
         auto data_ptr = allocator_->Malloc(output->Size(), img_size);
         output->set_data(data_ptr);
       } else {
-        output->MallocData(allocator_);
+        ret = output->MallocData(allocator_);
+        if (ret != RET_OK) {
+          MS_LOG(WARNING) << "MallocData failed";
+        }
       }
       output->set_allocator(allocator_);
     }
     for (auto input_kernel : kernel->in_kernels()) {
-      MS_ASSERT(nullptr != input_kernel);
-      auto ret = input_kernel->DecOutTensorRefCount();
-      if (0 != ret) {
+      MS_ASSERT(input_kernel);
+      ret = input_kernel->DecOutTensorRefCount();
+      if (ret != RET_OK) {
        MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
      }
    }
   }
   for (auto kernel : out_kernels_) {
-    MS_ASSERT(nullptr != kernel);
-    auto ret = kernel->DecOutTensorRefCount();
-    if (0 != ret) {
+    MS_ASSERT(kernel);
+    ret = kernel->DecOutTensorRefCount();
+    if (ret != RET_OK) {
       MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
     }
   }
   return RET_OK;
 }

-int SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
-                                                const std::vector<kernel::LiteKernel *> &in_kernels,
-                                                std::vector<std::vector<kernel::LiteKernel *>> *out_kernels,
-                                                bool is_from) {
+void SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                                                 const std::vector<kernel::LiteKernel *> &in_kernels,
+                                                 std::vector<std::vector<kernel::LiteKernel *>> *out_kernels,
+                                                 bool is_from) {
   std::vector<std::set<lite::Tensor *>> ksets;
   for (auto jv : in_kernels) {
+    MS_ASSERT(jv);
     auto tens = is_from ? jv->in_tensors() : jv->out_tensors();
     std::set<lite::Tensor *> kset;
     kset.insert(tens.begin(), tens.end());
     ksets.emplace_back(kset);
   }
+  MS_ASSERT(out_kernels);
   for (auto in_tensor : in_tensors) {
     std::vector<kernel::LiteKernel *> kvec;
     for (size_t j = 0; j < in_kernels.size(); ++j) {
@@ -298,13 +319,13 @@ int SubGraphOpenCLKernel::GetKernelFromToTensor(const std::vector<lite::Tensor *
     }
     out_kernels->emplace_back(kvec);
   }
-  return RET_OK;
 }

-int SubGraphOpenCLKernel::GetInOutNodes() {
+void SubGraphOpenCLKernel::GetInOutNodes() {
   std::vector<std::set<lite::Tensor *>> ksets_in;
   std::vector<std::set<lite::Tensor *>> ksets_out;
   for (auto jv : nodes_) {
+    MS_ASSERT(jv);
     std::set<lite::Tensor *> kset;
     kset.insert(jv->in_tensors().begin(), jv->in_tensors().end());
     ksets_in.emplace_back(kset);
@@ -323,10 +344,15 @@ int SubGraphOpenCLKernel::GetInOutNodes() {
       out_nodes_.emplace_back(nodes_.at(j));
     }
   }
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::Prepare() {
+  executor_ = new (std::nothrow) lite::opencl::OpenCLExecutor();
+  if (executor_ == nullptr) {
+    MS_LOG(ERROR) << "Create OpenCLExecutor fail";
+    return RET_ERROR;
+  }
   auto ret = Init();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "OpenCL subgraph init fail";
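Moving the `OpenCLExecutor` allocation out of the constructor (see the header hunk below) and into `Prepare()` lets the failure be reported: `new (std::nothrow)` yields `nullptr` instead of throwing `std::bad_alloc`, so the check can return through the normal status path. A sketch of the pattern, with `Executor`, `RET_OK`, and `RET_ERROR` as stand-ins for the project's types and codes:

```cpp
#include <new>

struct Executor {};           // stand-in for lite::opencl::OpenCLExecutor
constexpr int RET_OK = 0;     // stand-ins for the project's status codes
constexpr int RET_ERROR = 1;

int PrepareSketch(Executor **executor) {
  *executor = new (std::nothrow) Executor();  // returns nullptr on failure
  if (*executor == nullptr) {
    return RET_ERROR;  // allocation failure becomes an ordinary status code
  }
  return RET_OK;
}
```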
@@ -335,7 +361,7 @@ int SubGraphOpenCLKernel::Prepare() {
   return RET_OK;
 }

-int SubGraphOpenCLKernel::UnInit() {
+void SubGraphOpenCLKernel::UnInit() {
   for (const auto &tensor : in_convert_tensors_) {
     delete tensor;
   }
@@ -351,7 +377,6 @@ int SubGraphOpenCLKernel::UnInit() {
   in_convert_ops_.clear();
   out_convert_ops_.clear();
   delete this->executor_;
-  return RET_OK;
 }

 int SubGraphOpenCLKernel::InferShape() { return RET_OK; }
@@ -363,21 +388,28 @@ int SubGraphOpenCLKernel::Run() {
     MS_LOG(ERROR) << "executor is nullptr";
     return RET_ERROR;
   }
+  int ret;
   for (auto &tensor : in_tensors_) {
+    MS_ASSERT(tensor);
     if (tensor->data_c() == nullptr) {
       MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
       return RET_ERROR;
     }
-    allocator_->UnmapBuffer(tensor->data_c());
+    ret = allocator_->UnmapBuffer(tensor->data_c());
+    if (ret != RET_OK) {
+      return ret;
+    }
   }

-  auto ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_);
-  if (RET_OK != ret) {
+  ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_);
+  if (ret != RET_OK) {
     MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
     return ret;
   }
-  ocl_runtime_->SyncCommandQueue();
+  if (!ocl_runtime_->SyncCommandQueue()) {
+    return RET_ERROR;
+  }
   return RET_OK;
 }

 }  // namespace mindspore::kernel
@@ -41,7 +41,6 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
     ocl_runtime_ = ocl_runtime_wrap_.GetInstance();
     subgraph_type_ = kGpuSubGraph;
     this->name_ = "GpuSubGraph";
-    this->executor_ = new lite::opencl::OpenCLExecutor();
     nodes_set_.insert(nodes.begin(), nodes.end());
   }
   ~SubGraphOpenCLKernel() override;
@@ -56,23 +55,23 @@ class SubGraphOpenCLKernel : public SubGraphKernel {
   int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };

  private:
-  int UnInit();
-  int UpdateTensorDataType();
+  void UnInit();
+  void UpdateTensorDataType();
   int MallocTensorWithReuse();
-  int ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
-                                      const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
-                                      lite::opencl::MemType mem_type);
-  int ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
-                                         const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor,
-                                         kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type);
-  int GetInOutNodes();
+  void ReplaceOutTensorAndKernelToNull(const std::vector<lite::Tensor *> &in_tensors,
+                                       const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
+                                       lite::opencl::MemType mem_type);
+  void ReplaceOutTensorAndKernelToConvert(const lite::Tensor *in_tensor,
+                                          const std::vector<kernel::LiteKernel *> &in_kernels, lite::Tensor *new_tensor,
+                                          kernel::LiteKernel *in_convert_op, lite::opencl::MemType mem_type);
+  void GetInOutNodes();
   int GenToFormatOp(const std::vector<lite::Tensor *> &in_tensors,
                     const std::vector<std::vector<kernel::LiteKernel *>> &in_kernels,
                     std::vector<lite::Tensor *> *out_tensors, std::vector<OpenCLToFormatParameter *> *out_parameters,
                     std::vector<LiteKernel *> *out_convert_ops, lite::opencl::MemType mem_type);
-  int GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
-                            const std::vector<kernel::LiteKernel *> &in_kernels,
-                            std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
+  void GetKernelFromToTensor(const std::vector<lite::Tensor *> &in_tensors,
+                             const std::vector<kernel::LiteKernel *> &in_kernels,
+                             std::vector<std::vector<kernel::LiteKernel *>> *out_kernels, bool is_from);
   lite::opencl::OpenCLAllocator *allocator_{nullptr};
   std::vector<lite::Tensor *> in_convert_tensors_;
   std::vector<lite::Tensor *> out_convert_tensors_;
@@ -87,6 +87,7 @@ int GetMaxDivisorStrategy1(int x, int divisor) {
 }

 std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const std::vector<size_t> &global) {
+  MS_ASSERT(local.size() == global.size() && local.size() == 3);
   std::vector<size_t> result(3);
   for (int i = 0; i < 3; ++i) {
     result[i] = UP_ROUND(global[i], local[i]);
@@ -95,6 +96,7 @@ std::vector<size_t> GetCommonGlobalSize(const std::vector<size_t> &local, const
 }

 std::vector<size_t> GetCommonLocalSize(const std::vector<size_t> &global, int max_size) {
+  MS_ASSERT(global.size() == 3);
   size_t local_z = GetMaxDivisorStrategy0(global[2], 8);
   if (local_z == 0) {
     MS_LOG(ERROR) << "Divide by zero";
@@ -239,6 +241,7 @@ std::string CLErrorCode(cl_int error_code) {
 }

 int WriteToBin(const std::string &file_path, void *data, size_t size) {
+  MS_ASSERT(data);
   std::ofstream out_file;
   out_file.open(file_path.c_str(), std::ios::binary);
@@ -256,7 +259,7 @@ int WriteToBin(const std::string &file_path, void *data, size_t size) {
 }

 void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std::string &out_file) {
-  if (tensor->data_c() == nullptr) {
+  if (tensor == nullptr || tensor->data_c() == nullptr) {
     return;
   }
@@ -305,6 +308,9 @@ void PrintTensor(const lite::Tensor *tensor, MemType mem_type, int n, const std:
 }

 void PrintKernelOutput(OpenCLKernel *kernel, int n, const std::string &out_file) {
+  if (kernel == nullptr) {
+    return;
+  }
   printf("%-30s", kernel->name().c_str());
   if (!kernel->out_tensors().empty()) {
     PrintTensor(kernel->out_tensors()[0], kernel->GetMemType(), n, out_file);
@@ -61,6 +61,8 @@ std::vector<size_t> GetImage2dShapeFromNHWC(const std::vector<int> &tensor_shape
 template <class T1, class T2>
 void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
     int src_offset = b * plane * channel;
@@ -81,6 +83,8 @@ void PackNCHWToNC4HW4(void *src, void *dst, int batch, int plane, int channel, c
 template <class T1, class T2>
 void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   int nhwc4_batch_unit_offset = c4 * C4NUM * plane;
   int ic_remainder_ = channel % C4NUM;
@@ -106,6 +110,8 @@ void PackNHWCToNHWC4(void *src, void *dst, int batch, int plane, int channel, co
 template <class T1, class T2>
 void PackNHWCToNC4HW4(void *src, void *dst, int batch, int plane, int channel, const std::function<T2(T1)> &to_dtype) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
   int c4 = UP_DIV(channel, C4NUM);
   for (int b = 0; b < batch; b++) {
     int src_oc_offset = b * plane * channel;
@@ -142,6 +148,11 @@ std::vector<T> MatrixMultiply(const T A[], const T B[], int M, int N, int K) {
 template <typename SRC_T, typename DST_T>
 void ConvertConvWeight4DTo7D(void *src, void *dst, size_t CO, size_t KH, size_t KW, size_t CI, size_t OGroup = 1,
                              const size_t CI_TILE = 4, const size_t CO_TILE = 4) {
+  MS_ASSERT(src);
+  MS_ASSERT(dst);
+  MS_ASSERT(CI_TILE);
+  MS_ASSERT(CO_TILE);
+  MS_ASSERT(OGroup);
   if (CO_TILE == 0 || CI_TILE == 0) return;
   auto origin_weight = reinterpret_cast<SRC_T *>(src);
   auto packed_weight = reinterpret_cast<DST_T *>(dst);
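The new `MS_ASSERT`s on `CI_TILE`, `CO_TILE`, and `OGroup` document the contract in debug builds, while the existing `if (CO_TILE == 0 || CI_TILE == 0) return;` keeps release builds safe, since asserts compile out under `NDEBUG`. A sketch of the two-layer guard with a hypothetical helper:

```cpp
#include <cassert>
#include <cstddef>

// Debug contract plus release guard, mirroring the hunk above: assert()
// (like MS_ASSERT) vanishes in NDEBUG builds, so the runtime check on the
// tile size is still required before it is used as a divisor.
size_t UpRoundSketch(size_t co, size_t co_tile) {
  assert(co_tile != 0);
  if (co_tile == 0) return 0;  // release-build safety net
  return (co + co_tile - 1) / co_tile * co_tile;  // UP_ROUND(co, co_tile)
}
```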
@@ -66,6 +66,7 @@ void *OpenCLAllocator::MinimumFit(size_t size, const std::vector<size_t> &img_si
 void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::Buffer **buffer) {
   cl_int ret = CL_SUCCESS;
+  MS_ASSERT(buffer);
   *buffer = new (std::nothrow) cl::Buffer(*ocl_runtime_->Context(), flags, size, data, &ret);
   if (*buffer == nullptr) {
     MS_LOG(ERROR) << "Create OpenCL buffer failed! (ERROR CODE: " << ret << ")";
@@ -78,7 +79,11 @@ void *OpenCLAllocator::CreateBuffer(size_t size, void *data, size_t flags, cl::B
     return nullptr;
   }
   cl::Memory *mem = *buffer;
-  ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+  MS_ASSERT(mem);
+  ret = ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+  if (ret != RET_OK) {
+    MS_LOG(WARNING) << "UnmapBuffer failed.";
+  }
   return host_ptr;
 }
@@ -110,7 +115,10 @@ void *OpenCLAllocator::CreateImage2D(size_t size, const std::vector<size_t> &img
       return nullptr;
     }
     cl::Memory *mem = *image;
-    ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+    ret = ocl_runtime_->UnmapBuffer(*mem, host_ptr);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "UnmapBuffer failed.";
+    }
   }
   return host_ptr;
 }
@@ -131,7 +139,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector<size_t> &img_size,
   }
   Lock();
   void *host_ptr = MinimumFit(size, img_size);
-  if ((host_ptr != nullptr) && (data == nullptr)) {
+  if (host_ptr != nullptr && data == nullptr) {
     UnLock();
     return host_ptr;
   }
@@ -188,7 +196,10 @@ void OpenCLAllocator::Free(void *buf) {
   auto iter = allocated_list_.find(buf);
   if (iter != allocated_list_.end()) {
     if (iter->second->map_flags) {
-      UnmapBuffer(buf);
+      int ret = UnmapBuffer(buf);
+      if (ret != RET_OK) {
+        MS_LOG(WARNING) << "UnmapBuffer failed.";
+      }
       iter->second->map_flags = false;
     }
     auto mem_buf = iter->second;
@@ -240,7 +251,10 @@ void OpenCLAllocator::Clear() {
   auto svm_capabilities = ocl_runtime_->GetSVMCapabilities();
   for (auto it = allocated_list_.begin(); it != allocated_list_.end(); it++) {
     if (it->second->map_flags) {
-      UnmapBuffer(it->second->host_ptr_);
+      int ret = UnmapBuffer(it->second->host_ptr_);
+      if (ret != RET_OK) {
+        MS_LOG(WARNING) << "UnmapBuffer failed.";
+      }
     }
     if (svm_capabilities) {
       clSVMFree((*ocl_runtime_->Context())(), it->second->host_ptr_);
@@ -295,7 +309,11 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
       MS_LOG(ERROR) << "Map buffer failed, can not found buffer :" << host_ptr;
       return nullptr;
     }
-    ocl_runtime_->MapBuffer(host_ptr, flags, it->second->size_, static_cast<cl::CommandQueue *>(command_queue), sync);
+    int ret = ocl_runtime_->MapBuffer(host_ptr, flags, it->second->size_,
+                                      static_cast<cl::CommandQueue *>(command_queue), sync);
+    if (ret != RET_OK) {
+      MS_LOG(WARNING) << "MapBuffer failed.";
+    }
   }
   return host_ptr;
 }
@@ -313,14 +331,16 @@ void *OpenCLAllocator::MapBuffer(void *host_ptr, int flags, void *command_queue,
     return host_ptr;
   }
   MemBuf *mem_buf = it->second;
-  void *new_host_ptr{nullptr};
+  MS_ASSERT(mem_buf);
+  void *new_host_ptr;
   if (mem_buf->img_size.empty()) {
     cl::Buffer *buffer = static_cast<cl::Buffer *>(mem_buf->device_ptr_);
+    MS_ASSERT(buffer);
     new_host_ptr = ocl_runtime_->MapBuffer(*buffer, flags, mem_buf->size_, nullptr, sync);
   } else {
-    cl::ImageFormat image_format(CL_RGBA, mem_buf->img_size[2]);
     std::vector<size_t> region{mem_buf->img_size[0], mem_buf->img_size[1], 1};
     cl::Image2D *image = static_cast<cl::Image2D *>(mem_buf->image_ptr_);
+    MS_ASSERT(image);
     new_host_ptr = ocl_runtime_->MapBuffer(*image, 0, CL_MAP_READ | CL_MAP_WRITE, region);
   }
   if (new_host_ptr == nullptr) {
@@ -373,6 +393,7 @@ MemType OpenCLAllocator::GetMemType(void *host_ptr) {
     return mem_type;
   }
   MemBuf *mem_buf = it->second;
+  MS_ASSERT(mem_buf);
   if (mem_buf->img_size.empty()) {
     mem_type = MemType::BUF;
   } else {
@@ -383,6 +404,7 @@ MemType OpenCLAllocator::GetMemType(void *host_ptr) {
 }

 int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t> *img_size) {
+  MS_ASSERT(img_size);
   Lock();
   auto it = allocated_list_.find(host_ptr);
   if (it == allocated_list_.end()) {
@@ -391,6 +413,7 @@ int OpenCLAllocator::GetImageSize(void *host_ptr, std::vector<size_t> *img_size)
     return RET_OK;
   }
   MemBuf *mem_buf = it->second;
+  MS_ASSERT(mem_buf);
   if (!mem_buf->img_size.empty()) {
     *img_size = mem_buf->img_size;
   }
@@ -14,8 +14,8 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
-#define MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_

 #include <memory>
 #include <string>
@@ -29,17 +29,6 @@
 namespace mindspore::lite::opencl {

-#define MS_HOST_BUFFER 0
-#define MS_CL_BUFFER (1 << 1)
-#define MS_CL_IMAGE2D (1 << 2)
-typedef int32_t OpenCLMemoryType;
-
-struct OpenclMemory {
-  void *host_ptr{nullptr};
-  void *device_ptr{nullptr};
-  OpenCLMemoryType mem_type{MS_HOST_BUFFER | MS_CL_BUFFER};
-};
-
 class OpenCLRuntime;
 enum class MemType : char { BUF, IMG };
@@ -95,4 +84,4 @@ class OpenCLAllocator : public Allocator {
 }  // namespace mindspore::lite::opencl

-#endif  // MINDSPORE_LITE_SRC_OPENCL_ALLOCATOR_H_
+#endif  // MINDSPORE_LITE_SRC_RUNTIME_OPENCL_ALLOCATOR_H_
@@ -21,14 +21,13 @@
 namespace mindspore::lite::opencl {

-int OpenCLExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels) { return RET_OK; }
-
 int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs,
                         std::vector<kernel::LiteKernel *> &kernels, Allocator *allocator, const KernelCallBack &before,
                         const KernelCallBack &after) {
+  int ret;
   kernel::LiteKernelUtil::InitTensorRefCount(kernels);
   for (auto *kernel : kernels) {
-    MS_ASSERT(nullptr != kernel);
+    MS_ASSERT(kernel);
     CallBackParam callbackParam;
     callbackParam.node_name = kernel->name();
@@ -41,19 +40,27 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
     auto cur_outputs = kernel->out_tensors();
     for (auto i = 0; i < cur_outputs.size(); ++i) {
       auto *output = cur_outputs.at(i);
-      MS_ASSERT(nullptr != output);
+      MS_ASSERT(output);
       if (op_kernel->GetMemType() == lite::opencl::MemType::IMG) {
         std::vector<size_t> img_size;
-        op_kernel->GetImageSize(i, &img_size);
+        ret = op_kernel->GetImageSize(i, &img_size);
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "GetImageSize failed";
+          return ret;
+        }
         auto data_ptr = allocator_->Malloc(output->Size(), img_size);
         output->set_data(data_ptr);
       } else {
-        output->MallocData(allocator_);
+        ret = output->MallocData(allocator_);
+        if (ret != RET_OK) {
+          MS_LOG(ERROR) << "MallocData failed";
+          return ret;
+        }
       }
     }
-    auto ret = kernel->Run();
-    if (0 != ret) {
+    ret = kernel->Run();
+    if (ret != RET_OK) {
       MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
       return ret;
     }
@@ -64,9 +71,9 @@ int OpenCLExecutor::Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &ou
       }
     }
     for (auto input_kernel : kernel->in_kernels()) {
-      MS_ASSERT(nullptr != input_kernel);
+      MS_ASSERT(input_kernel);
       ret = input_kernel->DecOutTensorRefCount();
-      if (0 != ret) {
+      if (ret != RET_OK) {
         MS_LOG(WARNING) << "DecOutTensorRefCount for kernel" << kernel->name() << " failed";
       }
     }
@@ -14,8 +14,8 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_LITE_SRC_OPENCL_EXECUTOR_H_
-#define MINDSPORE_LITE_SRC_OPENCL_EXECUTOR_H_
+#ifndef MINDSPORE_LITE_SRC_RUNTIME_OPENCL_EXECUTOR_H_
+#define MINDSPORE_LITE_SRC_RUNTIME_OPENCL_EXECUTOR_H_

 #include <vector>
 #include "src/runtime/opencl/opencl_runtime.h"
@@ -29,7 +29,7 @@ class OpenCLExecutor : public Executor {
  public:
   OpenCLExecutor() : Executor() { allocator_ = ocl_runtime.GetInstance()->GetAllocator(); }

-  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override;
+  int Prepare(const std::vector<kernel::LiteKernel *> &kernels) override { return RET_OK; }

   int Run(std::vector<Tensor *> &inputs, std::vector<Tensor *> &outputs, std::vector<kernel::LiteKernel *> &kernels,
           Allocator *allocator = nullptr, const KernelCallBack &before = nullptr,
@@ -100,8 +100,14 @@ int OpenCLRuntime::Init() {
   std::vector<cl::Device> devices;
   for (auto it = platforms.begin(); it != platforms.end(); ++it) {
     std::string platform_name;
-    it->getInfo(CL_PLATFORM_NAME, &platform_name);
-    it->getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    ret = it->getInfo(CL_PLATFORM_NAME, &platform_name);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << CLErrorCode(ret);
+    }
+    ret = it->getDevices(CL_DEVICE_TYPE_GPU, &devices);
+    if (ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << CLErrorCode(ret);
+    }
     MS_LOG(INFO) << "Platform (" << platform_name << ") has " << devices.size() << " GPUs";

     if (devices.size() > 0) {
@@ -178,9 +184,18 @@ int OpenCLRuntime::Init() {
   }

   // get cache size, compute units and frequency.
-  device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
-  device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
-  device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
+  ret = device_->getInfo(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, &global_memery_cachesize_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
+  ret = device_->getInfo(CL_DEVICE_MAX_COMPUTE_UNITS, &compute_units_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
+  ret = device_->getInfo(CL_DEVICE_MAX_CLOCK_FREQUENCY, &max_freq_);
+  if (ret != CL_SUCCESS) {
+    MS_LOG(WARNING) << CLErrorCode(ret);
+  }
   cl_device_fp_config fp_config;
   auto success = device_->getInfo(CL_DEVICE_HALF_FP_CONFIG, &fp_config);
   support_fp16_ = CL_SUCCESS == success && fp_config > 0;
@@ -281,7 +296,9 @@ uint32_t OpenCLRuntime::DeviceMaxFreq() const { return max_freq_; }

 uint64_t OpenCLRuntime::GetMaxWorkGroupSize(const cl::Kernel &kernel) {
   uint64_t max_workgroup_size = 0;
   int ret = kernel.getWorkGroupInfo(*device_, CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
-  if (ret != 0) max_workgroup_size = 0;
+  if (ret != CL_SUCCESS) {
+    max_workgroup_size = 0;
+  }
   return max_workgroup_size;
 }
@@ -421,7 +438,10 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const std::vector<size_t>
   static int cnt = 0;
   const int flush_period = 10;
   if (cnt % flush_period == 0) {
-    command_queue->flush();
+    auto flush_ret = command_queue->flush();
+    if (flush_ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(flush_ret);
+    }
   }
   cnt++;
   MS_LOG(DEBUG) << "RunKernel success!";
@@ -454,7 +474,10 @@ int OpenCLRuntime::RunKernel(const cl::Kernel &kernel, const cl::NDRange &global
   static int cnt = 0;
   const int flush_period = 10;
   if (cnt % flush_period == 0) {
-    command_queue->flush();
+    auto flush_ret = command_queue->flush();
+    if (flush_ret != CL_SUCCESS) {
+      MS_LOG(WARNING) << "CL Flush failed:" << CLErrorCode(flush_ret);
+    }
   }
   cnt++;
   MS_LOG(DEBUG) << "RunKernel success!";
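Both `RunKernel` overloads share this periodic-flush pattern: calling `flush()` every `flush_period` enqueues keeps the driver fed without the host-device stall that a `finish()` would impose, which is why a failed flush is only worth a warning. A condensed sketch, with `Flush` standing in for `cl::CommandQueue::flush()` and `0` for `CL_SUCCESS`:

```cpp
// Illustrative only: the function-pointer parameter stands in for the real
// command queue so the sketch stays self-contained.
void EnqueueWithPeriodicFlush(int (*Flush)()) {
  static int cnt = 0;
  const int flush_period = 10;
  if (cnt % flush_period == 0) {
    if (Flush() != 0) {
      // log a warning and continue; a flush is a hint, not a sync point
    }
  }
  cnt++;
}
```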