diff --git a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
index e6f3c9da2e..943b84c04d 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
+++ b/mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl
@@ -310,3 +310,13 @@ __kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data,
   }
   dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
 }
+__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
+                                                int4 shape) {
+  int X = get_global_id(0);
+  int Y = get_global_id(1);
+  int Z = get_global_id(2);
+  if (X >= size.x || Y >= size.y || Z >= size.z) {
+    return;
+  }
+  dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
+}
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
index a7f0ea66cd..725c8817ab 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
@@ -21,24 +21,31 @@
 #include "src/runtime/kernel/opencl/kernel/convolution.h"
 #include "src/kernel_registry.h"
 #include "include/errorcode.h"
-#include "nnacl/fp32/common_func.h"
 using mindspore::kernel::KERNEL_ARCH::kGPU;
 using mindspore::lite::KernelRegistrar;
 using mindspore::lite::RET_OK;
 using mindspore::schema::PrimitiveType_Conv2D;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;
 namespace mindspore::kernel {
+constexpr size_t CI_TILE = C4NUM;
+constexpr size_t CO_TILE = C4NUM;
+
 int ConvolutionOpenCLKernel::Init() {
   static int init_count = 0;
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   auto allocator = ocl_runtime->GetAllocator();
+  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
   std::set<std::string> build_options;
   init_count++;
   use_fp16_ = ocl_runtime->GetFp16Enable();
-  if (op_format_ != schema::Format::Format_NHWC4 && op_format_ != schema::Format::Format_NC4HW4) {
+  if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
     MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
   }
   in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -46,19 +53,21 @@ int ConvolutionOpenCLKernel::Init() {
   in_tensors_[0]->SetFormat(op_format_);
   out_tensors_[0]->SetFormat(op_format_);
-  CI = in_tensors_[0]->Channel();
-  IH = in_tensors_[0]->Height();
-  IW = in_tensors_[0]->Width();
-  CO = out_tensors_[0]->Channel();
-  OH = out_tensors_[0]->Height();
-  OW = out_tensors_[0]->Width();
-  CI_SLICES = UP_DIV(CI, C4NUM);
-  CO_SLICES = UP_DIV(CO, C4NUM);
+  CI_ = in_tensors_[0]->Channel();
+  IH_ = in_tensors_[0]->Height();
+  IW_ = in_tensors_[0]->Width();
+  CO_ = out_tensors_[0]->Channel();
+  OH_ = out_tensors_[0]->Height();
+  OW_ = out_tensors_[0]->Width();
+  CI_SLICES_ = UP_DIV(CI_, C4NUM);
+  CO_SLICES_ = UP_DIV(CO_, C4NUM);
+  KH_ = param->kernel_h_;
+  KW_ = param->kernel_w_;
   // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
-  TILES_X = UP_DIV(OW, 4);
-  TILES_Y = UP_DIV(OH, 4);
-  TILES_XY = TILES_X * TILES_Y;
+  TILES_X_ = UP_DIV(OW_, 4);
+  TILES_Y_ = UP_DIV(OH_, 4);
+  TILES_XY_ = TILES_X_ * TILES_Y_;
   use_winograd_ = UseWinograd4x4To6x6();
   // build kernel
@@ -67,36 +76,34 @@ int ConvolutionOpenCLKernel::Init() {
     std::string program_name;
     program_name = "Winograd4x4To36" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
-    ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options);
+    ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);
     program_name = "WinogradConvolution" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);
     program_name = "Winograd36To4x4" + std::to_string(init_count);
     ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
-    ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
+    ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
   } else {
     std::string program_name = "convolution" + std::to_string(init_count);
-    std::string source =
-      op_format_ == schema::Format::Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
+    std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
     ocl_runtime->LoadSource(program_name, source);
-    ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
+    ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
   }
   // allocate winograd memory
   if (use_winograd_) {
     size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
-    size_t sizeof_FLT = use_fp16_ ? 2 : 4;
-    size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT;
-    size_t width = TILES_XY;
-    size_t height = CI_SLICES * 36;
+    size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT();
+    size_t width = TILES_XY_;
+    size_t height = CI_SLICES_ * 36;
     winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});
-    size = TILES_XY * CO_SLICES * 36 * sizeof_FLT;
-    width = TILES_XY;
-    height = CO_SLICES * 36;
+    size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT();
+    width = TILES_XY_;
+    height = CO_SLICES_ * 36;
     winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
   }
@@ -106,142 +113,177 @@ int ConvolutionOpenCLKernel::Init() {
   return RET_OK;
 }
-int ConvolutionOpenCLKernel::InitBuffer() {
-  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
-  auto allocator = ocl_runtime->GetAllocator();
-  size_t sizeof_FLT = use_fp16_ ? 2 : 4;
+int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
+  constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
+                          0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
+                          0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
+  constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
+                         1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
+                         1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};
-  auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  size_t KH = param->kernel_h_;
-  size_t KW = param->kernel_w_;
-  constexpr size_t CI_TILE = C4NUM;
-  constexpr size_t CO_TILE = C4NUM;
-  size_t packed_weight_size;
-  if (use_winograd_) {
-    packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT;
+  auto weight_tensor = in_tensors_[1];
+  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
+  auto origin_weight_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
+  std::function<float(int)> access_func;
+  if (weight_tensor->data_type() == kNumberTypeFloat32) {
+    access_func = [=](int idx) { return origin_weight_fp32[idx]; };
   } else {
-    packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT;
+    access_func = [=](int idx) { return static_cast<float>(origin_weight_fp16[idx]); };
   }
-  packed_weight_ = allocator->Malloc(packed_weight_size);
-  auto packed_weight_fp32 = reinterpret_cast<float *>(packed_weight_);
-  auto packed_weight_fp16 = reinterpret_cast<uint16_t *>(packed_weight_);
-  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
-  memset(packed_weight_, 0x00, packed_weight_size);
-  auto weight_tensor = in_tensors_[1];
-  auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->MutableData());
-  auto origin_weight_fp16 = reinterpret_cast<uint16_t *>(weight_tensor->MutableData());
-
-  if (use_winograd_) {
-    // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
-    std::vector<float> encoded_weight(CO * 6 * 6 * CI);
-    std::vector<float> Gt = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
-                             0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
-                             0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
-
-    std::vector<float> G(Gt.size());
-    for (int y = 0; y < 3; ++y) {
-      for (int x = 0; x < 6; ++x) {
-        G[x * 3 + y] = Gt[y * 6 + x];
-      }
-    }
-    for (int co = 0; co < CO; ++co) {
-      for (int ci = 0; ci < CI; ++ci) {
-        std::vector<float> in_vals(9);
-        for (int kh = 0; kh < 3; ++kh) {
-          for (int kw = 0; kw < 3; ++kw) {
-            const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
-            if (use_fp16_) {
-              in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]);
-            } else {
-              in_vals[kh * 3 + kw] = origin_weight_fp32[f_index];
-            }
-          }
-        }
-
-        auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
-        auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6);
-        for (int kh = 0; kh < 6; ++kh) {
-          for (int kw = 0; kw < 6; ++kw) {
-            const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
-            encoded_weight[f_index] = out_vals[kh * 6 + kw];
-          }
+  // OHWI -> O66I
+  std::vector<float> encoded_weight(CO_ * 6 * 6 * CI_);
+  for (int co = 0; co < CO_; ++co) {
+    for (int ci = 0; ci < CI_; ++ci) {
+      float in_vals[9];
+      for (int kh = 0; kh < 3; ++kh) {
+        for (int kw = 0; kw < 3; ++kw) {
+          const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci;
+          in_vals[kh * 3 + kw] = access_func(f_index);
        }
      }
-    }
-    for (int co = 0, src_idx = 0; co < CO; ++co) {
+      auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
+      auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
      for (int kh = 0; kh < 6; ++kh) {
        for (int kw = 0; kw < 6; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            int co_outer = co / 8;
-            int co_inner_group = co % 8 / 4;
-            int co_inner = co % 8 % 4;
-            int ci_outer = ci / 4;
-            int ci_inner = ci % 4;
-            size_t dst_idx =
-              (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) *
-                CO_TILE +
-              co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]);
-            } else {
-              packed_weight_fp32[dst_idx] = encoded_weight[src_idx++];
-            }
-          }
+          const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci;
+          encoded_weight[f_index] = out_vals[kh * 6 + kw];
        }
      }
    }
+  }
+
+  if (use_fp16_) {
+    OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
   } else {
-    // weight: OHWI -> O/4 H W I/4 I4 O4
-    for (int co = 0, src_idx = 0; co < CO; ++co) {
-      for (int kh = 0; kh < KH; ++kh) {
-        for (int kw = 0; kw < KW; ++kw) {
-          for (int ci = 0; ci < CI; ++ci) {
-            auto co_outer = co / CO_TILE;
-            auto co_inner = co % CO_TILE;
-            auto ci_outer = ci / CI_TILE;
-            auto ci_inner = ci % CI_TILE;
-            size_t dst_idx =
-              ((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner;
-            if (use_fp16_) {
-              packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++];
-            } else {
-              packed_weight_fp32[dst_idx] = origin_weight_fp32[src_idx++];
-            }
-          }
+    OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
+  }
+
+  return RET_OK;
+}
+
+template <typename SRC_T, typename DST_T>
+int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
+  auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
+  auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);
+
+  // OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
+  for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
+    for (size_t kh = 0; kh < KH; ++kh) {
+      for (size_t kw = 0; kw < KW; ++kw) {
+        for (size_t ci = 0; ci < CI_; ++ci) {
+          size_t co_outer = co / (CO_TILE * OGroup);
+          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
+          size_t co_inner = co % CO_TILE;
+          size_t ci_outer = ci / CI_TILE;
+          size_t ci_inner = ci % CI_TILE;
+          size_t dst_idx =
+            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
+              CO_TILE +
+            co_inner;
+          packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
        }
      }
    }
  }
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::InitWeight() {
+  auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();
+
+  // allocate memory
+  size_t packed_weight_size;
+  if (use_winograd_) {
+    packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT();
+  } else {
+    packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT();
+  }
+  packed_weight_ = allocator->Malloc(packed_weight_size);
+  allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
+  memset(packed_weight_, 0x00, packed_weight_size);
+
+  // rearrange weight
+  if (use_winograd_) {
+    RearrangeWinogradWeight();
+  } else {
+    auto weight_tensor = in_tensors_[1];
+    if (weight_tensor->data_type() == kNumberTypeFloat16) {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    } else {
+      if (use_fp16_) {
+        OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
+      } else {
+        OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
+      }
+    }
+  }
+  allocator->UnmapBuffer(packed_weight_);
+  return RET_OK;
+}
+
+int ConvolutionOpenCLKernel::InitBias() {
+  auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
+  auto allocator = ocl_runtime->GetAllocator();
   // align bias from C to C4
   auto bias_tensor = in_tensors_[2];
-  size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT;
+  size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT();
   packed_bias_ = allocator->Malloc(packed_bias_size);
+  allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
   memset(packed_bias_, 0x00, packed_bias_size);
-  memcpy(packed_bias_, bias_tensor->MutableData(), CO * sizeof_FLT);
+  if (bias_tensor->data_type() == kNumberTypeFloat16) {
+    if (use_fp16_) {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    } else {
+      auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
+      auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
+      }
+    }
+  } else {
+    if (use_fp16_) {
+      auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
+      auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
+      for (int i = 0; i < CO_; ++i) {
+        packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
+      }
+    } else {
+      memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
+    }
+  }
   allocator->UnmapBuffer(packed_bias_);
+  return RET_OK;
+}
+int ConvolutionOpenCLKernel::InitBuffer() {
+  InitWeight();
+  InitBias();
   return RET_OK;
 }
 int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
   size_t im_dst_x, im_dst_y;
-  if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
-    if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
+  if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
+    if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
       {
-        im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
+        im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
         im_dst_y = out_tensors_[0]->Height();
       }
     } else {
-      im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
+      im_dst_x = out_tensors_[0]->Height() * CO_SLICES_;
       im_dst_y = out_tensors_[0]->Width();
     }
   } else {
-    im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
+    im_dst_y = out_tensors_[0]->Height() * CO_SLICES_;
     im_dst_x = out_tensors_[0]->Width();
   }
   size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
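Reviewer note, not part of the patch: the branch in GetImageSize above packs an NHWC4 tensor into a 2D image and falls back to a transposed layout when the row would exceed the assumed maximum image2d width (65536). A minimal standalone C++ sketch of that selection follows; the helper name Nhwc4ImageSize is made up purely for illustration.

#include <cstddef>
#include <utility>

// Hypothetical helper mirroring GetImageSize's NHWC4 branch: returns the {x, y}
// extents of the image. 65536 stands in for the assumed maximum image2d width.
std::pair<size_t, size_t> Nhwc4ImageSize(size_t H, size_t W, size_t co_slices) {
  if (W * co_slices < 65536) {
    return {W * co_slices, H};  // x = W * CO_SLICES, y = H
  }
  return {H * co_slices, W};  // otherwise transpose: x = H * CO_SLICES, y = W
}

The same 65536 threshold is what CodeGenConvolutionNHWC4 and SetGlobalLocalConv key off when choosing the WRITE_IMAGE coordinate order and the local work-group width.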
@@ -259,52 +301,52 @@ int ConvolutionOpenCLKernel::Run() {
   int arg_cn = 0;
   if (use_winograd_) {
     arg_cn = 0;
-    cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
-    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
-    ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
+    cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
+    cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
+    ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);
     arg_cn = 0;
-    cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
-    cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
+    cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
+    cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);
     arg_cn = 0;
-    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
-    cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
-    ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
+    cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
+    cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
+    ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
   } else {
     arg_cn = 0;
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
-    ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
-    if (op_format_ == schema::Format::Format_NC4HW4) {
-      cl_int4 input_shape = {1, IH, IW, CI_SLICES};
-      cl_int4 output_shape = {1, OH, OW, CO_SLICES};
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, input_shape);
-      ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, output_shape);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
+    ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
+    if (op_format_ == Format_NC4HW4) {
+      cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
+      cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
+      ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
     }
   }
   if (use_winograd_) {
-    ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {8, 6, 4}, nullptr);
-    ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr);
-    ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
+    ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
   } else {
     std::vector<size_t> global, local;
     SetGlobalLocalConv(&global, &local);
-    ocl_runtime->RunKernel(kernel_conv, global, local, nullptr);
+    ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
   }
   return RET_OK;
@@ -312,10 +354,8 @@ int ConvolutionOpenCLKernel::Run() {
 std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
-  const size_t CI_ALIGN = CI_SLICES * C4NUM;
-  const size_t CO_ALIGN = CO_SLICES * C4NUM;
-  const size_t KH = param->kernel_h_;
-  const size_t KW = param->kernel_w_;
+  const size_t CI_ALIGN = CI_SLICES_ * C4NUM;
+  const size_t CO_ALIGN = CO_SLICES_ * C4NUM;
   const size_t strideH = param->stride_h_;
   const size_t strideW = param->stride_w_;
   const size_t padTop = param->pad_u_;
@@ -327,21 +367,21 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
   code += "#define CI_TILE 4\n";
   code += "#define CO_TILE 4\n\n";
   code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
-  code += "#define IH " + std::to_string(IH) + "\n";
-  code += "#define IW " + std::to_string(IW) + "\n";
+  code += "#define IH " + std::to_string(IH_) + "\n";
+  code += "#define IW " + std::to_string(IW_) + "\n";
   code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
-  code += "#define OH " + std::to_string(OH) + "\n";
-  code += "#define OW " + std::to_string(OW) + "\n";
-  code += "#define KH " + std::to_string(KH) + "\n";
-  code += "#define KW " + std::to_string(KW) + "\n";
+  code += "#define OH " + std::to_string(OH_) + "\n";
+  code += "#define OW " + std::to_string(OW_) + "\n";
+  code += "#define KH " + std::to_string(KH_) + "\n";
+  code += "#define KW " + std::to_string(KW_) + "\n";
code += "#define strideH " + std::to_string(strideH) + "\n"; code += "#define strideW " + std::to_string(strideW) + "\n"; code += "#define padTop " + std::to_string(padTop) + "\n"; code += "#define padBottom " + std::to_string(padBottom) + "\n"; code += "#define padLeft " + std::to_string(padLeft) + "\n"; code += "#define padRight " + std::to_string(padRight) + "\n"; - code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n"; - code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n"; + code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n"; + code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n"; if (use_fp16_) { code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; @@ -401,7 +441,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; } - if (OW * CO_SLICES < 65536) { + if (OW_ * CO_SLICES_ < 65536) { code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; } else { code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; @@ -411,8 +451,6 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() { std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { auto param = reinterpret_cast(op_parameter_); - const size_t KH = param->kernel_h_; - const size_t KW = param->kernel_w_; const size_t strideH = param->stride_h_; const size_t strideW = param->stride_w_; const size_t padTop = param->pad_u_; @@ -442,12 +480,12 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { " int CI_SLICES = input_shape.w;\n" " int CO_SLICES = output_shape.w;\n\n"; - code += " #define IH " + std::to_string(IH) + "\n"; - code += " #define IW " + std::to_string(IW) + "\n"; - code += " #define OH " + std::to_string(OH) + "\n"; - code += " #define OW " + std::to_string(OW) + "\n"; - code += " #define KH " + std::to_string(KH) + "\n"; - code += " #define KW " + std::to_string(KW) + "\n"; + code += " #define IH " + std::to_string(IH_) + "\n"; + code += " #define IW " + std::to_string(IW_) + "\n"; + code += " #define OH " + std::to_string(OH_) + "\n"; + code += " #define OW " + std::to_string(OW_) + "\n"; + code += " #define KH " + std::to_string(KH_) + "\n"; + code += " #define KW " + std::to_string(KW_) + "\n"; code += " #define strideH " + std::to_string(strideH) + "\n"; code += " #define strideW " + std::to_string(strideW) + "\n"; code += " #define padTop " + std::to_string(padTop) + "\n"; @@ -457,7 +495,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() { " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" " return;\n\n"; - bool check_ow = (OW % 2) == 1; + bool check_ow = (OW_ % 2) == 1; if (check_ow) { code += " int last_is_double = 1;\n" @@ -607,12 +645,12 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { " {\n" " int y_idx = tile_y * 4 - PAD + y;\n"; - if (op_format_ == schema::Format::Format_NHWC4) { + if (op_format_ == Format_NHWC4) { code += " for (int x = 0; x < 6; x++)\n" " {\n" " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n"; - } else if (op_format_ == schema::Format::Format_NC4HW4) { + } else if (op_format_ == Format_NC4HW4) { code += " if(y_idx < 0 || y_idx >= IH)\n" " {\n" @@ -788,9 +826,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { " int tile_x = tile_xy % TILE_X * 4;\n" " int tile_y = tile_xy / TILE_X * 4;\n"; - if (op_format_ == schema::Format::Format_NHWC4) { + if (op_format_ 
     code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
-  } else if (op_format_ == schema::Format::Format_NC4HW4) {
+  } else if (op_format_ == Format_NC4HW4) {
     code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
   }
@@ -804,12 +842,12 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
   auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
   constexpr size_t work_group_size[] = {4, 4, 1};
   auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
-  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
+  size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
   const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);
-  size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0];
-  size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1];
-  size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];
+  size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
+  size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
+  size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];
   size_t local_c = GetBiggestDivider(global_c, max_z_size);
   if (local_c == 0) {
@@ -823,22 +861,22 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
     local_h = global_h / 2;
   }
-  if (op_format_ == schema::Format::Format_NHWC4) {
-    if (OW * CO_SLICES > 65536) {
+  if (op_format_ == Format_NHWC4) {
+    if (OW_ * CO_SLICES_ > 65536) {
       local_w = 4;
     }
   }
   global->clear();
-  global->push_back(UP_DIV(OW, local_w) * local_w);
-  global->push_back(UP_DIV(OH, local_h) * local_h);
-  global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
+  global->push_back(UP_DIV(OW_, local_w) * local_w);
+  global->push_back(UP_DIV(OH_, local_h) * local_h);
+  global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
   local->clear();
   local->push_back(local_w);
   local->push_back(local_h);
   local->push_back(local_c);
-  if (op_format_ == schema::Format::Format_NC4HW4) {
+  if (op_format_ == Format_NC4HW4) {
     // calculate 2 FLT4 along width per work-item
     global->at(0) = UP_DIV(global->at(0), 2);
     if (local->at(0) > global->at(0)) {
diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
index 458fc8f16b..785e144336 100644
--- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
+++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
@@ -42,27 +42,35 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
  private:
   bool use_fp16_ = false;
-  int CI;
-  int IH;
-  int IW;
-  int CO;
-  int OH;
-  int OW;
-  int CI_SLICES;
-  int CO_SLICES;
+  int CI_{};
+  int IH_{};
+  int IW_{};
+  int CO_{};
+  int OH_{};
+  int OW_{};
+  int CI_SLICES_{};
+  int CO_SLICES_{};
+  int KH_{};
+  int KW_{};
   void *packed_weight_ = nullptr;
   void *packed_bias_ = nullptr;
   bool use_winograd_ = false;
-  int TILES_X;
-  int TILES_Y;
-  int TILES_XY;
+  int TILES_X_{};
+  int TILES_Y_{};
+  int TILES_XY_{};
   void *winograd_mem0_ = nullptr;
   void *winograd_mem1_ = nullptr;
-  cl::Kernel kernel_4x4to36;
-  cl::Kernel kernel_conv;
-  cl::Kernel kernel_36to4x4;
+  cl::Kernel kernel_4x4to36_;
+  cl::Kernel kernel_conv_;
+  cl::Kernel kernel_36to4x4_;
+
+  int InitWeight();
+  int InitBias();
+  int RearrangeWinogradWeight();
+  template <typename SRC_T, typename DST_T>
+  int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);
   std::string CodeGenConvolutionNHWC4();
   std::string CodeGenConvolutionNC4HW4();
@@ -72,16 +80,18 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
   std::string CodeGenWinograd36To4x4();
   int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);
+  size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }
+
   bool UseWinograd4x4To6x6() {
     auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
     const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
                             param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
-    const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
-    const bool hw_good = TILES_X * TILES_Y >= 16;
+    const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
+    const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
     return attr_valid && channel_good && hw_good;
   }
-  std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
+  static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
     std::vector<float> C(M * K);
     for (int i = 0; i < M; ++i) {
       for (int j = 0; j < K; ++j) {
diff --git a/mindspore/lite/test/run_test.sh b/mindspore/lite/test/run_test.sh
index f048dbb364..7dd8e10497 100755
--- a/mindspore/lite/test/run_test.sh
+++ b/mindspore/lite/test/run_test.sh
@@ -28,3 +28,6 @@ cp -fr $TEST_DATA_DIR/testPK ./data
 ./lite-test --gtest_filter=TestDeconvInt8.*
 ./lite-test --gtest_filter="TestTfliteParser*"
+
+# for GPU OpenCL
+./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*"
diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
index d15798dd16..e6f56178c7 100644
--- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
+++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc
@@ -21,19 +21,18 @@
 #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
 #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
 #include "nnacl/pack.h"
-#include "nnacl/fp32/common_func.h"
 using mindspore::kernel::ConvolutionOpenCLKernel;
 using mindspore::kernel::LiteKernel;
 using mindspore::kernel::SubGraphOpenCLKernel;
 using mindspore::lite::Tensor;
 using mindspore::schema::Format;
-using mindspore::schema::Format_KHWC;
-using mindspore::schema::Format_NC4HW4;
-using mindspore::schema::Format_NCHW;
-using mindspore::schema::Format_NHWC;
-using mindspore::schema::Format_NHWC4;
 using mindspore::schema::NodeType_ValueNode;
+using mindspore::schema::Format::Format_KHWC;
+using mindspore::schema::Format::Format_NC4HW4;
+using mindspore::schema::Format::Format_NCHW;
+using mindspore::schema::Format::Format_NHWC;
+using mindspore::schema::Format::Format_NHWC4;
 namespace mindspore {
@@ -41,26 +40,25 @@ class TestConvolutionOpenCL : public mindspore::CommonTest {};
 void LoadData(Tensor *tensor, const float *src) {
   if (tensor->data_type() == kNumberTypeFloat16) {
-    auto num = tensor->Size() / 2;
-    auto tensor_data = reinterpret_cast<uint16_t *>(tensor->MutableData());
+    auto num = tensor->Size() / sizeof(float16_t);
+    auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
     for (int i = 0; i < num; ++i) {
-      tensor_data[i] = Float32ToShort(src[i]);
+      tensor_data[i] = static_cast<float16_t>(src[i]);
     }
   } else {
-    memcpy(tensor->MutableData(), src, tensor->Size());
+    memcpy(tensor->data_c(), src, tensor->Size());
   }
 }
 void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
-  auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
+  auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);
   std::vector<float> output_data(num);
   if (output->data_type() == kNumberTypeFloat16) {
-    auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->MutableData());
     for (int i = 0; i < output_data.size(); ++i) {
-      output_data[i] = ShortToFloat32((output_data_fp16[i]));
+      output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);
     }
   } else {
-    memcpy(output_data.data(), output->MutableData(), output->Size());
+    memcpy(output_data.data(), output->data_c(), output->Size());
   }
   printf("output:");
@@ -69,9 +67,9 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
   }
   printf("\n");
-  float max_err = 0.0f;
+  float max_err = -1.0f;
   std::array<int, 5> idx_5d{};
-  int idx = -1;
+  int max_err_idx = -1, first_err_idx = -1;
   auto SLICES = UP_DIV(output->Channel(), 4);
   int I = 1, J = 1, K = 1, L = 1, M = 1;
   switch (output->GetFormat()) {
@@ -98,10 +96,13 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
       for (int l = 0; l < L; ++l) {
         for (int m = 0; m < M; ++m) {
           auto err = std::fabs(output_data[cn] - expect_data[cn]);
+          if (first_err_idx == -1 && max_err > atol) {
+            first_err_idx = cn;
+          }
           if (err > max_err) {
             max_err = err;
             idx_5d = {i, j, k, l, m};
-            idx = cn;
+            max_err_idx = cn;
           }
           cn++;
         }
       }
     }
   }
  }
-  float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
-  if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
-    printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
-  } else {
-    printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
-  }
-  printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
-         max_err, relative_err * 100);
-
   if (max_err > atol) {
+    printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
+           output_data[first_err_idx]);
     FAIL();
   } else {
+    float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
+    if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
+      printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
+    } else {
+      printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
+    }
+    printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
+           output_data[max_err_idx], max_err, relative_err * 100);
     printf("COMPARE SUCCESS!\n\n");
   }
 }
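Reviewer note, not part of the patch: the heart of InitWeight is the OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4 repack performed by OHWI2OHWIOGroupI4O4 (OGroup is 2 on the Winograd path and 1 otherwise, with CI_TILE = CO_TILE = 4). Below is a self-contained CPU-only sketch of the same index arithmetic, with the types fixed to float and a made-up function name PackOHWI used only for illustration.

#include <cstddef>
#include <vector>

// Illustration only: repacks an OHWI weight buffer into the layout consumed by the
// OpenCL kernels above. Mirrors the dst_idx computation in OHWI2OHWIOGroupI4O4.
std::vector<float> PackOHWI(const std::vector<float> &src, size_t CO, size_t KH, size_t KW, size_t CI,
                            size_t OGroup) {
  const size_t CI_TILE = 4, CO_TILE = 4;
  const size_t CI_SLICES = (CI + CI_TILE - 1) / CI_TILE;
  const size_t CO_GROUPS = (CO + CO_TILE * OGroup - 1) / (CO_TILE * OGroup);
  std::vector<float> dst(CO_GROUPS * KH * KW * CI_SLICES * OGroup * CI_TILE * CO_TILE, 0.0f);
  size_t src_idx = 0;
  for (size_t co = 0; co < CO; ++co) {
    for (size_t kh = 0; kh < KH; ++kh) {
      for (size_t kw = 0; kw < KW; ++kw) {
        for (size_t ci = 0; ci < CI; ++ci) {
          size_t co_outer = co / (CO_TILE * OGroup);              // which O/OGroup/4 block
          size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;   // which group inside the block
          size_t co_inner = co % CO_TILE;                         // O4
          size_t ci_outer = ci / CI_TILE;                         // I/4
          size_t ci_inner = ci % CI_TILE;                         // I4
          size_t dst_idx =
            (((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
              CO_TILE +
            co_inner;
          dst[dst_idx] = src[src_idx++];
        }
      }
    }
  }
  return dst;
}

With OGroup = 2 and KH = KW = 6 the destination size equals the UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE allocation made in InitWeight for the Winograd-transformed weights; with OGroup = 1 it equals the plain CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE size.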