
!6088 optimize opencl convolution fp16 kernel

Merge pull request !6088 from 王东旭/opencl_convolution_support_fp16
tags/v1.0.0
mindspore-ci-bot committed 5 years ago · commit 37561b1b4c
5 changed files with 299 additions and 236 deletions:
  1. mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl (+10 -0)
  2. mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc (+231 -193)
  3. mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h (+27 -17)
  4. mindspore/lite/test/run_test.sh (+3 -0)
  5. mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc (+28 -26)

mindspore/lite/src/runtime/kernel/opencl/cl/to_format.cl (+10 -0)

@@ -310,3 +310,13 @@ __kernel void to_format_NHWC4_to_NHWC4_BUF_float(__read_only image2d_t src_data,
}
dst_data[(X * size.y + Y) * size.z + Z] = convert_float4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
__kernel void to_format_NHWC4_to_NHWC4_BUF_half(__read_only image2d_t src_data, __global half4 *dst_data, int4 size,
int4 shape) {
int X = get_global_id(0);
int Y = get_global_id(1);
int Z = get_global_id(2);
if (X >= size.x || Y >= size.y || Z >= size.z) {
return;
}
dst_data[(X * size.y + Y) * size.z + Z] = convert_half4(READ_IMAGE(src_data, smp_zero, (int2)(Y * size.z + Z, X)));
}
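
The added half kernel is a type-specialized twin of the float kernel directly above it: same bounds check, same coordinates, with only half4 and convert_half4 swapped in. For orientation, here is a host-side sketch of the index mapping both kernels implement; the helper name and struct are illustrative, assuming size.y is the width and size.z the number of channel slices:

#include <cstddef>

// Hypothetical host-side mirror of the kernel's NHWC4 index math; MapNhwc4 and
// Nhwc4Index are illustrative names, not part of the source.
struct Nhwc4Index {
  size_t buf_idx;    // linear index of the FLT4 in the destination buffer
  int img_x, img_y;  // coordinate of the same FLT4 in the source image
};

inline Nhwc4Index MapNhwc4(int x, int y, int z, int size_y, int size_z) {
  // size_y = W and size_z = UP_DIV(C, 4); each work-item moves one half4/float4.
  Nhwc4Index idx;
  idx.buf_idx = (static_cast<size_t>(x) * size_y + y) * size_z + z;
  idx.img_x = y * size_z + z;  // image width packs W together with the channel slices
  idx.img_y = x;               // image height carries N*H (batch is 1 here)
  return idx;
}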

mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc (+231 -193)

@@ -21,24 +21,31 @@
#include "src/runtime/kernel/opencl/kernel/convolution.h"
#include "src/kernel_registry.h"
#include "include/errorcode.h"
#include "nnacl/fp32/common_func.h"

using mindspore::kernel::KERNEL_ARCH::kGPU;
using mindspore::lite::KernelRegistrar;
using mindspore::lite::RET_OK;
using mindspore::schema::PrimitiveType_Conv2D;
using mindspore::schema::Format::Format_NC4HW4;
using mindspore::schema::Format::Format_NCHW;
using mindspore::schema::Format::Format_NHWC;
using mindspore::schema::Format::Format_NHWC4;

namespace mindspore::kernel {

constexpr size_t CI_TILE = C4NUM;
constexpr size_t CO_TILE = C4NUM;

int ConvolutionOpenCLKernel::Init() {
static int init_count = 0;
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto allocator = ocl_runtime->GetAllocator();
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
std::set<std::string> build_options;
init_count++;
use_fp16_ = ocl_runtime->GetFp16Enable();

if (op_format_ != schema::Format::Format_NHWC4 && op_format_ != schema::Format::Format_NC4HW4) {
if (op_format_ != Format_NHWC4 && op_format_ != Format_NC4HW4) {
MS_LOG(ERROR) << "op_format_ " << op_format_ << " not support!";
}
in_ori_format_ = in_tensors_[0]->GetFormat();
@@ -46,19 +53,21 @@ int ConvolutionOpenCLKernel::Init() {
in_tensors_[0]->SetFormat(op_format_);
out_tensors_[0]->SetFormat(op_format_);

CI = in_tensors_[0]->Channel();
IH = in_tensors_[0]->Height();
IW = in_tensors_[0]->Width();
CO = out_tensors_[0]->Channel();
OH = out_tensors_[0]->Height();
OW = out_tensors_[0]->Width();
CI_SLICES = UP_DIV(CI, C4NUM);
CO_SLICES = UP_DIV(CO, C4NUM);
CI_ = in_tensors_[0]->Channel();
IH_ = in_tensors_[0]->Height();
IW_ = in_tensors_[0]->Width();
CO_ = out_tensors_[0]->Channel();
OH_ = out_tensors_[0]->Height();
OW_ = out_tensors_[0]->Width();
CI_SLICES_ = UP_DIV(CI_, C4NUM);
CO_SLICES_ = UP_DIV(CO_, C4NUM);
KH_ = param->kernel_h_;
KW_ = param->kernel_w_;

// note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
TILES_X = UP_DIV(OW, 4);
TILES_Y = UP_DIV(OH, 4);
TILES_XY = TILES_X * TILES_Y;
TILES_X_ = UP_DIV(OW_, 4);
TILES_Y_ = UP_DIV(OH_, 4);
TILES_XY_ = TILES_X_ * TILES_Y_;
use_winograd_ = UseWinograd4x4To6x6();

// build kernel
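
The renames (CI to CI_, IH to IH_, and so on) bring the fields in line with the trailing-underscore convention for members, and the kernel extents are now cached once as KH_/KW_. To make the slice and tile arithmetic concrete, a small sketch assuming UP_DIV is the usual round-up division from nnacl, applied to a hypothetical 96-channel, 16x16 shape:

// Round-up division, as UP_DIV is assumed to be defined in nnacl.
constexpr int UpDiv(int a, int b) { return (a + b - 1) / b; }

// Hypothetical shape CI_ = CO_ = 96, OH_ = OW_ = 16:
static_assert(UpDiv(96, 4) == 24, "96 channels -> 24 FLT4 slices (CI_SLICES_/CO_SLICES_)");
static_assert(UpDiv(16, 4) == 4, "16 output columns -> 4 Winograd tiles (TILES_X_)");
static_assert(UpDiv(16, 4) * UpDiv(16, 4) == 16, "TILES_XY_ enumerates the tiles linearly");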
@@ -67,36 +76,34 @@ int ConvolutionOpenCLKernel::Init() {
std::string program_name;
program_name = "Winograd4x4To36" + std::to_string(init_count);
ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36());
ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options);
ocl_runtime->BuildKernel(kernel_4x4to36_, program_name, "Winograd4x4To36", build_options);

program_name = "WinogradConvolution" + std::to_string(init_count);
ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution());
ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options);
ocl_runtime->BuildKernel(kernel_conv_, program_name, "WinogradConvolution", build_options);

program_name = "Winograd36To4x4" + std::to_string(init_count);
ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4());
ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options);
ocl_runtime->BuildKernel(kernel_36to4x4_, program_name, "Winograd36To4x4", build_options);
} else {
std::string program_name = "convolution" + std::to_string(init_count);
std::string source =
op_format_ == schema::Format::Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
std::string source = op_format_ == Format_NHWC4 ? CodeGenConvolutionNHWC4() : CodeGenConvolutionNC4HW4();
ocl_runtime->LoadSource(program_name, source);
ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options);
ocl_runtime->BuildKernel(kernel_conv_, program_name, "Convolution", build_options);
}

// allocate winograd memory
if (use_winograd_) {
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
size_t sizeof_FLT = use_fp16_ ? 2 : 4;

size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT;
size_t width = TILES_XY;
size_t height = CI_SLICES * 36;
size_t size = TILES_XY_ * CI_SLICES_ * 36 * sizeof_FLT();
size_t width = TILES_XY_;
size_t height = CI_SLICES_ * 36;
winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype});

size = TILES_XY * CO_SLICES * 36 * sizeof_FLT;
width = TILES_XY;
height = CO_SLICES * 36;
size = TILES_XY_ * CO_SLICES_ * 36 * sizeof_FLT();
width = TILES_XY_;
height = CO_SLICES_ * 36;
winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype});
}
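
The two scratch images hold the 36 transformed values of every tile, for all input slices and all output slices respectively. Plugging a hypothetical fp16 shape into the sizing above:

// Hypothetical shape: IH_ = IW_ = 16, CI_ = CO_ = 96, use_fp16_ = true.
//   TILES_XY_ = 4 * 4 = 16, CI_SLICES_ = CO_SLICES_ = 24, sizeof_FLT() = 2
// winograd_mem0_: width = TILES_XY_ = 16, height = CI_SLICES_ * 36 = 864
// winograd_mem1_: width = TILES_XY_ = 16, height = CO_SLICES_ * 36 = 864
// size argument   = 16 * 24 * 36 * 2 = 27648 bytes in both cases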

@@ -106,142 +113,177 @@ int ConvolutionOpenCLKernel::Init() {
return RET_OK;
}

int ConvolutionOpenCLKernel::InitBuffer() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto allocator = ocl_runtime->GetAllocator();
size_t sizeof_FLT = use_fp16_ ? 2 : 4;
int ConvolutionOpenCLKernel::RearrangeWinogradWeight() {
constexpr float Gt[] = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};
constexpr float G[] = {1.0000000000, 0.0000000000, 0.0000000000, 1.0000000000, 0.7071067691, 0.4999999702,
1.0000000000, -0.7071067691, 0.4999999702, 1.0000000000, 1.4142135382, 1.9999998808,
1.0000000000, -1.4142135382, 1.9999998808, 0.0000000000, 0.0000000000, 1.0000000000};

auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
size_t KH = param->kernel_h_;
size_t KW = param->kernel_w_;
constexpr size_t CI_TILE = C4NUM;
constexpr size_t CO_TILE = C4NUM;
size_t packed_weight_size;
if (use_winograd_) {
packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT;
auto weight_tensor = in_tensors_[1];
auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->data_c());
auto origin_weight_fp16 = reinterpret_cast<float16_t *>(weight_tensor->data_c());
std::function<float(int)> access_func;
if (weight_tensor->data_type() == kNumberTypeFloat32) {
access_func = [=](int idx) { return origin_weight_fp32[idx]; };
} else {
packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT;
access_func = [=](int idx) { return static_cast<float>(origin_weight_fp16[idx]); };
}
packed_weight_ = allocator->Malloc(packed_weight_size);
auto packed_weight_fp32 = reinterpret_cast<float *>(packed_weight_);
auto packed_weight_fp16 = reinterpret_cast<uint16_t *>(packed_weight_);
allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
memset(packed_weight_, 0x00, packed_weight_size);
auto weight_tensor = in_tensors_[1];
auto origin_weight_fp32 = reinterpret_cast<float *>(weight_tensor->MutableData());
auto origin_weight_fp16 = reinterpret_cast<uint16_t *>(weight_tensor->MutableData());

if (use_winograd_) {
// weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4
std::vector<float> encoded_weight(CO * 6 * 6 * CI);
std::vector<float> Gt = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000,
0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000,
0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000};

std::vector<float> G(Gt.size());
for (int y = 0; y < 3; ++y) {
for (int x = 0; x < 6; ++x) {
G[x * 3 + y] = Gt[y * 6 + x];
}
}

for (int co = 0; co < CO; ++co) {
for (int ci = 0; ci < CI; ++ci) {
std::vector<float> in_vals(9);
for (int kh = 0; kh < 3; ++kh) {
for (int kw = 0; kw < 3; ++kw) {
const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci;
if (use_fp16_) {
in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]);
} else {
in_vals[kh * 3 + kw] = origin_weight_fp32[f_index];
}
}
}

auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6);
for (int kh = 0; kh < 6; ++kh) {
for (int kw = 0; kw < 6; ++kw) {
const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci;
encoded_weight[f_index] = out_vals[kh * 6 + kw];
}
// OHWI -> O66I
std::vector<float> encoded_weight(CO_ * 6 * 6 * CI_);
for (int co = 0; co < CO_; ++co) {
for (int ci = 0; ci < CI_; ++ci) {
float in_vals[9];
for (int kh = 0; kh < 3; ++kh) {
for (int kw = 0; kw < 3; ++kw) {
const int f_index = ((co * 3 + kh) * 3 + kw) * CI_ + ci;
in_vals[kh * 3 + kw] = access_func(f_index);
}
}
}

for (int co = 0, src_idx = 0; co < CO; ++co) {
auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3);
auto out_vals = MatrixMultiply(temp_vals.data(), Gt, 6, 3, 6);
for (int kh = 0; kh < 6; ++kh) {
for (int kw = 0; kw < 6; ++kw) {
for (int ci = 0; ci < CI; ++ci) {
int co_outer = co / 8;
int co_inner_group = co % 8 / 4;
int co_inner = co % 8 % 4;
int ci_outer = ci / 4;
int ci_inner = ci % 4;
size_t dst_idx =
(((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
if (use_fp16_) {
packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]);
} else {
packed_weight_fp32[dst_idx] = encoded_weight[src_idx++];
}
}
const int f_index = ((co * 6 + kh) * 6 + kw) * CI_ + ci;
encoded_weight[f_index] = out_vals[kh * 6 + kw];
}
}
}
}

if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(encoded_weight.data(), 6, 6, 2);
} else {
// weight: OHWI -> O/4 H W I/4 I4 O4
for (int co = 0, src_idx = 0; co < CO; ++co) {
for (int kh = 0; kh < KH; ++kh) {
for (int kw = 0; kw < KW; ++kw) {
for (int ci = 0; ci < CI; ++ci) {
auto co_outer = co / CO_TILE;
auto co_inner = co % CO_TILE;
auto ci_outer = ci / CI_TILE;
auto ci_inner = ci % CI_TILE;
size_t dst_idx =
((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner;
if (use_fp16_) {
packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++];
} else {
packed_weight_fp32[dst_idx] = origin_weight_fp32[src_idx++];
}
}
OHWI2OHWIOGroupI4O4<float, float>(encoded_weight.data(), 6, 6, 2);
}

return RET_OK;
}
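
RearrangeWinogradWeight collapses the old duplicated fp16/fp32 reads into one access_func and applies the standard F(4x4, 3x3) weight transform, g_hat = G * g * Gt, to each (co, ci) pair before packing. A self-contained sketch of that transform, assuming MatrixMultiply performs the row-major (MxN)*(NxK) product its arguments suggest:

#include <vector>

// Row-major (M x N) * (N x K) -> (M x K); assumed to match MatrixMultiply above.
static std::vector<float> MatMul(const float *A, const float *B, int M, int N, int K) {
  std::vector<float> C(M * K, 0.0f);
  for (int i = 0; i < M; ++i) {
    for (int n = 0; n < N; ++n) {
      for (int j = 0; j < K; ++j) C[i * K + j] += A[i * N + n] * B[n * K + j];
    }
  }
  return C;
}

// Expand one 3x3 kernel g into its 6x6 Winograd-domain tile: G (6x3) * g (3x3) * Gt (3x6).
std::vector<float> WinogradWeightTile(const float g[9], const float G[18], const float Gt[18]) {
  auto Gg = MatMul(G, g, 6, 3, 3);
  return MatMul(Gg.data(), Gt, 6, 3, 6);
}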

template <typename SRC_T, typename DST_T>
int ConvolutionOpenCLKernel::OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup) {
auto origin_weight = reinterpret_cast<SRC_T *>(weight_OHWI);
auto packed_weight = reinterpret_cast<DST_T *>(packed_weight_);

// OHWI -> O/OGroup/4 KH KW I/4 OGroup I4 O4
for (size_t co = 0, src_idx = 0; co < CO_; ++co) {
for (size_t kh = 0; kh < KH; ++kh) {
for (size_t kw = 0; kw < KW; ++kw) {
for (size_t ci = 0; ci < CI_; ++ci) {
size_t co_outer = co / (CO_TILE * OGroup);
size_t group_idx = co % (CO_TILE * OGroup) / CO_TILE;
size_t co_inner = co % CO_TILE;
size_t ci_outer = ci / CI_TILE;
size_t ci_inner = ci % CI_TILE;
size_t dst_idx =
(((((co_outer * KH + kh) * KW + kw) * CI_SLICES_ + ci_outer) * OGroup + group_idx) * CI_TILE + ci_inner) *
CO_TILE +
co_inner;
packed_weight[dst_idx] = static_cast<DST_T>(origin_weight[src_idx++]);
}
}
}
}
return RET_OK;
}
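
OHWI2OHWIOGroupI4O4 replaces the four near-identical packing loops of the old InitBuffer: OGroup = 1 produces the plain O/4 H W I/4 I4 O4 layout, OGroup = 2 the Winograd O/8 6 6 I/4 O2 I4 O4 layout. One worked index to make the nesting concrete:

// OGroup = 2 (Winograd), KH = KW = 6, CI_SLICES_ = 2, CI_TILE = CO_TILE = 4.
// For co = 13, ci = 6, kh = kw = 0:
//   co_outer  = 13 / (4 * 2) = 1   // which O8 block
//   group_idx = 13 % 8 / 4   = 1   // which O4 half within the O8 block
//   co_inner  = 13 % 4       = 1
//   ci_outer  = 6 / 4 = 1, ci_inner = 6 % 4 = 2
// dst_idx = (((((1 * 6 + 0) * 6 + 0) * 2 + 1) * 2 + 1) * 4 + 2) * 4 + 1 = 2361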

int ConvolutionOpenCLKernel::InitWeight() {
auto allocator = lite::opencl::OpenCLRuntime::GetInstance()->GetAllocator();

// allocate memory
size_t packed_weight_size;
if (use_winograd_) {
packed_weight_size = UP_DIV(CO_, 8) * 6 * 6 * CI_SLICES_ * 2 * CI_TILE * CO_TILE * sizeof_FLT();
} else {
packed_weight_size = CO_SLICES_ * KH_ * KW_ * CI_SLICES_ * CI_TILE * CO_TILE * sizeof_FLT();
}
packed_weight_ = allocator->Malloc(packed_weight_size);
allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
memset(packed_weight_, 0x00, packed_weight_size);

// rearrange weight
if (use_winograd_) {
RearrangeWinogradWeight();
} else {
auto weight_tensor = in_tensors_[1];
if (weight_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float16_t, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
} else {
OHWI2OHWIOGroupI4O4<float16_t, float>(weight_tensor->data_c(), KH_, KW_, 1);
}
} else {
if (use_fp16_) {
OHWI2OHWIOGroupI4O4<float, float16_t>(weight_tensor->data_c(), KH_, KW_, 1);
} else {
OHWI2OHWIOGroupI4O4<float, float>(weight_tensor->data_c(), KH_, KW_, 1);
}
}
}

allocator->UnmapBuffer(packed_weight_);
return RET_OK;
}

int ConvolutionOpenCLKernel::InitBias() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto allocator = ocl_runtime->GetAllocator();

// align bias from C to C4
auto bias_tensor = in_tensors_[2];
size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT;
size_t packed_bias_size = CO_SLICES_ * CO_TILE * sizeof_FLT();
packed_bias_ = allocator->Malloc(packed_bias_size);

allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true);
memset(packed_bias_, 0x00, packed_bias_size);
memcpy(packed_bias_, bias_tensor->MutableData(), CO * sizeof_FLT);
if (bias_tensor->data_type() == kNumberTypeFloat16) {
if (use_fp16_) {
memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
} else {
auto packed_bias_fp32 = reinterpret_cast<float *>(packed_bias_);
auto origin_bias_fp16 = reinterpret_cast<float16_t *>(bias_tensor->data_c());
for (int i = 0; i < CO_; ++i) {
packed_bias_fp32[i] = static_cast<float>(origin_bias_fp16[i]);
}
}
} else {
if (use_fp16_) {
auto packed_bias_fp16 = reinterpret_cast<float16_t *>(packed_bias_);
auto origin_bias_fp32 = reinterpret_cast<float *>(bias_tensor->data_c());
for (int i = 0; i < CO_; ++i) {
packed_bias_fp16[i] = static_cast<float16_t>(origin_bias_fp32[i]);
}
} else {
memcpy(packed_bias_, bias_tensor->data_c(), CO_ * sizeof_FLT());
}
}
allocator->UnmapBuffer(packed_bias_);
return RET_OK;
}

int ConvolutionOpenCLKernel::InitBuffer() {
InitWeight();
InitBias();
return RET_OK;
}

int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
size_t im_dst_x, im_dst_y;
if (in_tensors_[0]->GetFormat() == schema::Format::Format_NHWC4) {
if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
if (in_tensors_[0]->GetFormat() == Format_NHWC4) {
if (out_tensors_[0]->Width() * CO_SLICES_ < 65536) {
{
im_dst_x = out_tensors_[0]->Width() * CO_SLICES;
im_dst_x = out_tensors_[0]->Width() * CO_SLICES_;
im_dst_y = out_tensors_[0]->Height();
}
} else {
im_dst_x = out_tensors_[0]->Height() * CO_SLICES;
im_dst_x = out_tensors_[0]->Height() * CO_SLICES_;
im_dst_y = out_tensors_[0]->Width();
}
} else {
im_dst_y = out_tensors_[0]->Height() * CO_SLICES;
im_dst_y = out_tensors_[0]->Height() * CO_SLICES_;
im_dst_x = out_tensors_[0]->Width();
}
size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT;
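
GetImageSize decides which image axis absorbs the channel slices so the 2D image stays within the device limit; the 65536 bound presumably reflects the maximum image width on the target GPUs (an assumption, the limit is not queried here). Reduced to a sketch:

struct ImgSize {
  size_t x, y;
};

// NHWC4 case only; hypothetical helper, not part of the kernel class.
ImgSize ChooseNhwc4ImageSize(size_t W, size_t H, size_t co_slices) {
  if (W * co_slices < 65536) {
    return {W * co_slices, H};  // usual layout: width packs W with the slices
  }
  return {H * co_slices, W};  // would exceed the width cap: swap the axes
}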
@@ -259,52 +301,52 @@ int ConvolutionOpenCLKernel::Run() {
int arg_cn = 0;
if (use_winograd_) {
arg_cn = 0;
cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape);
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape);
cl_int4 _4x4to36_in_shape = {1, IH_, IW_, CI_SLICES_};
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY_, CI_SLICES_};
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_in_shape);
ocl_runtime->SetKernelArg(kernel_4x4to36_, arg_cn++, _4x4to36_out_shape);

arg_cn = 0;
cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES};
cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES};
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape);
cl_int4 conv_in_shape = {1, 36, TILES_XY_, CI_SLICES_};
cl_int4 conv_out_shape = {1, 36, TILES_XY_, CO_SLICES_};
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem0_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_in_shape);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, conv_out_shape);

arg_cn = 0;
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY_, CO_SLICES_};
cl_int4 _36to4x4_out_shape = {1, OH_, OW_, CO_SLICES_};
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, winograd_mem1_, lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_in_shape);
ocl_runtime->SetKernelArg(kernel_36to4x4_, arg_cn++, _36to4x4_out_shape);
} else {
arg_cn = 0;
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->MutableData(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
if (op_format_ == schema::Format::Format_NC4HW4) {
cl_int4 input_shape = {1, IH, IW, CI_SLICES};
cl_int4 output_shape = {1, OH, OW, CO_SLICES};
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, input_shape);
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, output_shape);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, in_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, out_tensors_[0]->data_c(), lite::opencl::MemType::IMG);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_weight_, lite::opencl::MemType::BUF);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, packed_bias_, lite::opencl::MemType::BUF);
if (op_format_ == Format_NC4HW4) {
cl_int4 input_shape = {1, IH_, IW_, CI_SLICES_};
cl_int4 output_shape = {1, OH_, OW_, CO_SLICES_};
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, input_shape);
ocl_runtime->SetKernelArg(kernel_conv_, arg_cn++, output_shape);
}
}

if (use_winograd_) {
ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {8, 6, 4}, nullptr);
ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr);
ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr);
ocl_runtime->RunKernel(kernel_4x4to36_, {size_t(TILES_XY_), 6, size_t(CI_SLICES_)}, {8, 6, 4}, nullptr);
ocl_runtime->RunKernel(kernel_conv_, {size_t(TILES_XY_ / 2), 36, size_t(CO_SLICES_ / 2)}, {8, 6, 2}, nullptr);
ocl_runtime->RunKernel(kernel_36to4x4_, {size_t(TILES_XY_), 4, size_t(CO_SLICES_)}, {32, 4, 2}, nullptr);
} else {
std::vector<size_t> global, local;
SetGlobalLocalConv(&global, &local);
ocl_runtime->RunKernel(kernel_conv, global, local, nullptr);
ocl_runtime->RunKernel(kernel_conv_, global, local, nullptr);
}

return RET_OK;
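
Read together, the Winograd branch of Run() wires a three-stage pipeline through the two scratch images:

// The Winograd path dispatched above, stage by stage:
//   in_tensors_[0] --kernel_4x4to36_--> winograd_mem0_   // 4x4 input tiles -> 36 values
//   winograd_mem0_ --kernel_conv_-----> winograd_mem1_   // per-slice products in the 36 domain
//   winograd_mem1_ --kernel_36to4x4_--> out_tensors_[0]  // back to 4x4 tiles, bias added here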
@@ -312,10 +354,8 @@ int ConvolutionOpenCLKernel::Run() {

std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
const size_t CI_ALIGN = CI_SLICES * C4NUM;
const size_t CO_ALIGN = CO_SLICES * C4NUM;
const size_t KH = param->kernel_h_;
const size_t KW = param->kernel_w_;
const size_t CI_ALIGN = CI_SLICES_ * C4NUM;
const size_t CO_ALIGN = CO_SLICES_ * C4NUM;
const size_t strideH = param->stride_h_;
const size_t strideW = param->stride_w_;
const size_t padTop = param->pad_u_;
@@ -327,21 +367,21 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
code += "#define CI_TILE 4\n";
code += "#define CO_TILE 4\n\n";
code += "#define CI " + std::to_string(CI_ALIGN) + "\n";
code += "#define IH " + std::to_string(IH) + "\n";
code += "#define IW " + std::to_string(IW) + "\n";
code += "#define IH " + std::to_string(IH_) + "\n";
code += "#define IW " + std::to_string(IW_) + "\n";
code += "#define CO " + std::to_string(CO_ALIGN) + "\n";
code += "#define OH " + std::to_string(OH) + "\n";
code += "#define OW " + std::to_string(OW) + "\n";
code += "#define KH " + std::to_string(KH) + "\n";
code += "#define KW " + std::to_string(KW) + "\n";
code += "#define OH " + std::to_string(OH_) + "\n";
code += "#define OW " + std::to_string(OW_) + "\n";
code += "#define KH " + std::to_string(KH_) + "\n";
code += "#define KW " + std::to_string(KW_) + "\n";
code += "#define strideH " + std::to_string(strideH) + "\n";
code += "#define strideW " + std::to_string(strideW) + "\n";
code += "#define padTop " + std::to_string(padTop) + "\n";
code += "#define padBottom " + std::to_string(padBottom) + "\n";
code += "#define padLeft " + std::to_string(padLeft) + "\n";
code += "#define padRight " + std::to_string(padRight) + "\n";
code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n";
code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n";
code += "#define CI_SLICES " + std::to_string(CI_SLICES_) + "\n";
code += "#define CO_SLICES " + std::to_string(CO_SLICES_) + "\n\n";

if (use_fp16_) {
code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n";
@@ -401,7 +441,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {
code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
}

if (OW * CO_SLICES < 65536) {
if (OW_ * CO_SLICES_ < 65536) {
code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
} else {
code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
@@ -411,8 +451,6 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNHWC4() {

std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
const size_t KH = param->kernel_h_;
const size_t KW = param->kernel_w_;
const size_t strideH = param->stride_h_;
const size_t strideW = param->stride_w_;
const size_t padTop = param->pad_u_;
@@ -442,12 +480,12 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
" int CI_SLICES = input_shape.w;\n"
" int CO_SLICES = output_shape.w;\n\n";

code += " #define IH " + std::to_string(IH) + "\n";
code += " #define IW " + std::to_string(IW) + "\n";
code += " #define OH " + std::to_string(OH) + "\n";
code += " #define OW " + std::to_string(OW) + "\n";
code += " #define KH " + std::to_string(KH) + "\n";
code += " #define KW " + std::to_string(KW) + "\n";
code += " #define IH " + std::to_string(IH_) + "\n";
code += " #define IW " + std::to_string(IW_) + "\n";
code += " #define OH " + std::to_string(OH_) + "\n";
code += " #define OW " + std::to_string(OW_) + "\n";
code += " #define KH " + std::to_string(KH_) + "\n";
code += " #define KW " + std::to_string(KW_) + "\n";
code += " #define strideH " + std::to_string(strideH) + "\n";
code += " #define strideW " + std::to_string(strideW) + "\n";
code += " #define padTop " + std::to_string(padTop) + "\n";
@@ -457,7 +495,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolutionNC4HW4() {
" if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n"
" return;\n\n";

bool check_ow = (OW % 2) == 1;
bool check_ow = (OW_ % 2) == 1;
if (check_ow) {
code +=
" int last_is_double = 1;\n"
@@ -607,12 +645,12 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() {
" {\n"
" int y_idx = tile_y * 4 - PAD + y;\n";

if (op_format_ == schema::Format::Format_NHWC4) {
if (op_format_ == Format_NHWC4) {
code +=
" for (int x = 0; x < 6; x++)\n"
" {\n"
" int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n";
} else if (op_format_ == schema::Format::Format_NC4HW4) {
} else if (op_format_ == Format_NC4HW4) {
code +=
" if(y_idx < 0 || y_idx >= IH)\n"
" {\n"
@@ -788,9 +826,9 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
" int tile_x = tile_xy % TILE_X * 4;\n"
" int tile_y = tile_xy / TILE_X * 4;\n";

if (op_format_ == schema::Format::Format_NHWC4) {
if (op_format_ == Format_NHWC4) {
code += " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc);\n";
} else if (op_format_ == schema::Format::Format_NC4HW4) {
} else if (op_format_ == Format_NC4HW4) {
code += " WRITE_IMAGE(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc);\n";
}

@@ -804,12 +842,12 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
constexpr size_t work_group_size[] = {4, 4, 1};
auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv_(), (*ocl_runtime->Device())());
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);

size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];
size_t global_h = UP_DIV(OH_, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(OW_, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES_, work_group_size[2]) * work_group_size[2];

size_t local_c = GetBiggestDivider(global_c, max_z_size);
if (local_c == 0) {
@@ -823,22 +861,22 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
local_h = global_h / 2;
}

if (op_format_ == schema::Format::Format_NHWC4) {
if (OW * CO_SLICES > 65536) {
if (op_format_ == Format_NHWC4) {
if (OW_ * CO_SLICES_ > 65536) {
local_w = 4;
}
}

global->clear();
global->push_back(UP_DIV(OW, local_w) * local_w);
global->push_back(UP_DIV(OH, local_h) * local_h);
global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
global->push_back(UP_DIV(OW_, local_w) * local_w);
global->push_back(UP_DIV(OH_, local_h) * local_h);
global->push_back(UP_DIV(CO_SLICES_, local_c) * local_c);
local->clear();
local->push_back(local_w);
local->push_back(local_h);
local->push_back(local_c);

if (op_format_ == schema::Format::Format_NC4HW4) {
if (op_format_ == Format_NC4HW4) {
// calculate 2 FLT4 along width per work-item
global->at(0) = UP_DIV(global->at(0), 2);
if (local->at(0) > global->at(0)) {
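
SetGlobalLocalConv rounds each global dimension up to a multiple of its local counterpart, as an OpenCL NDRange requires before version 2.0. GetBiggestDivider itself is not shown in this diff; presumably it returns the largest divisor under a cap, along these lines:

// Assumed behavior of GetBiggestDivider (not shown in this diff).
size_t GetBiggestDividerSketch(size_t x, size_t limit) {
  for (size_t d = limit; d > 1; --d) {
    if (x % d == 0) return d;  // largest divisor of x that is <= limit
  }
  return 1;
}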


mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h (+27 -17)

@@ -42,27 +42,35 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
private:
bool use_fp16_ = false;

int CI;
int IH;
int IW;
int CO;
int OH;
int OW;
int CI_SLICES;
int CO_SLICES;
int CI_{};
int IH_{};
int IW_{};
int CO_{};
int OH_{};
int OW_{};
int CI_SLICES_{};
int CO_SLICES_{};
int KH_{};
int KW_{};
void *packed_weight_ = nullptr;
void *packed_bias_ = nullptr;

bool use_winograd_ = false;
int TILES_X;
int TILES_Y;
int TILES_XY;
int TILES_X_{};
int TILES_Y_{};
int TILES_XY_{};
void *winograd_mem0_ = nullptr;
void *winograd_mem1_ = nullptr;

cl::Kernel kernel_4x4to36;
cl::Kernel kernel_conv;
cl::Kernel kernel_36to4x4;
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_conv_;
cl::Kernel kernel_36to4x4_;

int InitWeight();
int InitBias();
int RearrangeWinogradWeight();
template <typename SRC_T, typename DST_T>
int OHWI2OHWIOGroupI4O4(void *weight_OHWI, size_t KH, size_t KW, size_t OGroup);

std::string CodeGenConvolutionNHWC4();
std::string CodeGenConvolutionNC4HW4();
@@ -72,16 +80,18 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
std::string CodeGenWinograd36To4x4();
int SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local);

size_t sizeof_FLT() const { return use_fp16_ ? sizeof(float16_t) : sizeof(float); }

bool UseWinograd4x4To6x6() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 &&
param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1;
const bool channel_good = CI_SLICES >= 12 && CO_SLICES >= 12;
const bool hw_good = TILES_X * TILES_Y >= 16;
const bool channel_good = CI_SLICES_ >= 12 && CO_SLICES_ >= 12;
const bool hw_good = TILES_X_ * TILES_Y_ >= 16;
return attr_valid && channel_good && hw_good;
}

std::vector<float> MatrixMultiply(const std::vector<float> &A, const std::vector<float> &B, int M, int N, int K) {
static std::vector<float> MatrixMultiply(const float A[], const float B[], int M, int N, int K) {
std::vector<float> C(M * K);
for (int i = 0; i < M; ++i) {
for (int j = 0; j < K; ++j) {
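
Worked through, the gate in UseWinograd4x4To6x6 above admits a 3x3, stride-1, dilation-1 convolution to the Winograd path only when both channel counts reach 48 (12 slices) and the output spans at least 16 tiles, for example:

// CI_ = CO_ = 64, OH_ = OW_ = 16:
//   channel_good: UP_DIV(64, 4) = 16 >= 12 for both input and output slices
//   hw_good:      UP_DIV(16, 4) * UP_DIV(16, 4) = 16 >= 16
// => use_winograd_ = true; 40 channels would fail (UP_DIV(40, 4) = 10 < 12)
// and fall back to the direct Convolution kernel.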


mindspore/lite/test/run_test.sh (+3 -0)

@@ -28,3 +28,6 @@ cp -fr $TEST_DATA_DIR/testPK ./data
./lite-test --gtest_filter=TestDeconvInt8.*

./lite-test --gtest_filter="TestTfliteParser*"

# for GPU OpenCL
./lite-test --gtest_filter="TestConvolutionOpenCL.simple_test*"

mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc (+28 -26)

@@ -21,19 +21,18 @@
#include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h"
#include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h"
#include "nnacl/pack.h"
#include "nnacl/fp32/common_func.h"

using mindspore::kernel::ConvolutionOpenCLKernel;
using mindspore::kernel::LiteKernel;
using mindspore::kernel::SubGraphOpenCLKernel;
using mindspore::lite::Tensor;
using mindspore::schema::Format;
using mindspore::schema::Format_KHWC;
using mindspore::schema::Format_NC4HW4;
using mindspore::schema::Format_NCHW;
using mindspore::schema::Format_NHWC;
using mindspore::schema::Format_NHWC4;
using mindspore::schema::NodeType_ValueNode;
using mindspore::schema::Format::Format_KHWC;
using mindspore::schema::Format::Format_NC4HW4;
using mindspore::schema::Format::Format_NCHW;
using mindspore::schema::Format::Format_NHWC;
using mindspore::schema::Format::Format_NHWC4;

namespace mindspore {

@@ -41,26 +40,25 @@ class TestConvolutionOpenCL : public mindspore::CommonTest {};

void LoadData(Tensor *tensor, const float *src) {
if (tensor->data_type() == kNumberTypeFloat16) {
auto num = tensor->Size() / 2;
auto tensor_data = reinterpret_cast<uint16_t *>(tensor->MutableData());
auto num = tensor->Size() / sizeof(float16_t);
auto tensor_data = reinterpret_cast<float16_t *>(tensor->data_c());
for (int i = 0; i < num; ++i) {
tensor_data[i] = Float32ToShort(src[i]);
tensor_data[i] = static_cast<float16_t>(src[i]);
}
} else {
memcpy(tensor->MutableData(), src, tensor->Size());
memcpy(tensor->data_c(), src, tensor->Size());
}
}

void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
auto num = (output->data_type() == kNumberTypeFloat16) ? output->Size() / 2 : output->Size() / 4;
auto num = output->Size() / (output->data_type() == kNumberTypeFloat16 ? 2 : 4);
std::vector<float> output_data(num);
if (output->data_type() == kNumberTypeFloat16) {
auto output_data_fp16 = reinterpret_cast<uint16_t *>(output->MutableData());
for (int i = 0; i < output_data.size(); ++i) {
output_data[i] = ShortToFloat32((output_data_fp16[i]));
output_data[i] = static_cast<float>(reinterpret_cast<float16_t *>(output->data_c())[i]);
}
} else {
memcpy(output_data.data(), output->MutableData(), output->Size());
memcpy(output_data.data(), output->data_c(), output->Size());
}

printf("output:");
@@ -69,9 +67,9 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
}
printf("\n");

float max_err = 0.0f;
float max_err = -1.0f;
std::array<int, 5> idx_5d{};
int idx = -1;
int max_err_idx = -1, first_err_idx = -1;
auto SLICES = UP_DIV(output->Channel(), 4);
int I = 1, J = 1, K = 1, L = 1, M = 1;
switch (output->GetFormat()) {
@@ -98,10 +96,13 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
for (int l = 0; l < L; ++l) {
for (int m = 0; m < M; ++m) {
auto err = std::fabs(output_data[cn] - expect_data[cn]);
if (first_err_idx == -1 && max_err > atol) {
first_err_idx = cn;
}
if (err > max_err) {
max_err = err;
idx_5d = {i, j, k, l, m};
idx = cn;
max_err_idx = cn;
}
cn++;
}
@@ -110,18 +111,19 @@ void CompareOutput(Tensor *output, const float *expect_data, const float atol) {
}
}

float relative_err = max_err / std::fabs(std::max(expect_data[idx], output_data[idx]));
if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
} else {
printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
}
printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[idx], output_data[idx],
max_err, relative_err * 100);

if (max_err > atol) {
printf("first error at %d expect=%.3f output=%.3f\n", first_err_idx, expect_data[first_err_idx],
output_data[first_err_idx]);
FAIL();
} else {
float relative_err = max_err / std::fabs(std::max(expect_data[max_err_idx], output_data[max_err_idx]));
if (output->GetFormat() == Format_NHWC || output->GetFormat() == Format_NCHW) {
printf("max relative error at [%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3]);
} else {
printf("max relative error at [%d,%d,%d,%d,%d]", idx_5d[0], idx_5d[1], idx_5d[2], idx_5d[3], idx_5d[4]);
}
printf(" expect=%.3f output=%.3f absolute_err=%.2e relative_err=%.2f%%\n", expect_data[max_err_idx],
output_data[max_err_idx], max_err, relative_err * 100);
printf("COMPARE SUCCESS!\n\n");
}
}

