Browse Source

fix opencl winograd param invalid bug

tags/v0.7.0-beta
wangdongxu 5 years ago
parent
commit
2d699c93a9
3 changed files with 167 additions and 176 deletions
  1. +97
    -113
      mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc
  2. +6
    -0
      mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h
  3. +64
    -63
      mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc

+ 97
- 113
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc View File

@@ -32,16 +32,21 @@ int ConvolutionOpenCLKernel::Init() {
static int init_count = 0; static int init_count = 0;
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto allocator = ocl_runtime->GetAllocator(); auto allocator = ocl_runtime->GetAllocator();
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
std::set<std::string> build_options; std::set<std::string> build_options;
init_count++; init_count++;


CI_SLICES = UP_DIV(param->input_channel_, C4NUM);
CO_SLICES = UP_DIV(param->output_channel_, C4NUM);
CI = in_tensors_[0]->Channel();
IH = in_tensors_[0]->Height();
IW = in_tensors_[0]->Width();
CO = out_tensors_[0]->Channel();
OH = out_tensors_[0]->Height();
OW = out_tensors_[0]->Width();
CI_SLICES = UP_DIV(CI, C4NUM);
CO_SLICES = UP_DIV(CO, C4NUM);


// note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true
TILES_X = UP_DIV(param->output_w_, 4);
TILES_Y = UP_DIV(param->output_h_, 4);
TILES_X = UP_DIV(OW, 4);
TILES_Y = UP_DIV(OH, 4);
TILES_XY = TILES_X * TILES_Y; TILES_XY = TILES_X * TILES_Y;
use_winograd_ = UseWinograd4x4To6x6(); use_winograd_ = UseWinograd4x4To6x6();


@@ -96,14 +101,9 @@ int ConvolutionOpenCLKernel::InitBuffer() {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto allocator = ocl_runtime->GetAllocator(); auto allocator = ocl_runtime->GetAllocator();


auto weight_tensor = in_tensors_[1];
auto weight_shape = weight_tensor->shape();
size_t CO = weight_shape[0];
size_t KH = weight_shape[1];
size_t KW = weight_shape[2];
size_t CI = weight_shape[3];
// size_t CI_SLICES = UP_DIV(CI, C4NUM);
// size_t CO_SLICES = UP_DIV(CO, C4NUM);
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
size_t KH = param->kernel_h_;
size_t KW = param->kernel_w_;
constexpr size_t CI_TILE = C4NUM; constexpr size_t CI_TILE = C4NUM;
constexpr size_t CO_TILE = C4NUM; constexpr size_t CO_TILE = C4NUM;
size_t packed_weight_size; size_t packed_weight_size;
@@ -115,6 +115,7 @@ int ConvolutionOpenCLKernel::InitBuffer() {
packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size)); packed_weight_ = reinterpret_cast<float *>(allocator->Malloc(packed_weight_size));
allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true);
memset(packed_weight_, 0x00, packed_weight_size); memset(packed_weight_, 0x00, packed_weight_size);
auto weight_tensor = in_tensors_[1];
auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data()); auto origin_weight = reinterpret_cast<float *>(weight_tensor->Data());


if (use_winograd_) { if (use_winograd_) {
@@ -205,7 +206,6 @@ int ConvolutionOpenCLKernel::InitBuffer() {
} }


int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) {
// size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM);
size_t im_dst_x, im_dst_y; size_t im_dst_x, im_dst_y;
if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) {
if (out_tensors_[0]->Width() * CO_SLICES < 65536) { if (out_tensors_[0]->Width() * CO_SLICES < 65536) {
@@ -236,12 +236,11 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_s
int ConvolutionOpenCLKernel::Run() { int ConvolutionOpenCLKernel::Run() {
MS_LOG(DEBUG) << this->name() << " Running!"; MS_LOG(DEBUG) << this->name() << " Running!";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);


int arg_cn = 0; int arg_cn = 0;
if (use_winograd_) { if (use_winograd_) {
arg_cn = 0; arg_cn = 0;
cl_int4 _4x4to36_in_shape = {1, param->input_h_, param->input_w_, CI_SLICES};
cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES};
cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES}; cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES};
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_); ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_);
@@ -259,13 +258,14 @@ int ConvolutionOpenCLKernel::Run() {


arg_cn = 0; arg_cn = 0;
cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES}; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES};
cl_int4 _36to4x4_out_shape = {1, param->output_h_, param->output_w_, CO_SLICES};
cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES};
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape);
ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape);
} else { } else {
arg_cn = 0;
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data());
ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_);
@@ -287,19 +287,8 @@ int ConvolutionOpenCLKernel::Run() {


std::string ConvolutionOpenCLKernel::CodeGenConvolution() { std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
auto param = reinterpret_cast<ConvParameter *>(op_parameter_); auto param = reinterpret_cast<ConvParameter *>(op_parameter_);

auto input_tensor = in_tensors_[0];
auto output_tensor = out_tensors_[0];
const size_t CI = input_tensor->Channel();
// const size_t CI_SLICES = UP_DIV(CI, C4NUM);
const size_t CI_ALIGN = CI_SLICES * C4NUM; const size_t CI_ALIGN = CI_SLICES * C4NUM;
const size_t IH = input_tensor->Height();
const size_t IW = input_tensor->Width();
const size_t CO = output_tensor->Channel();
// const size_t CO_SLICES = UP_DIV(CO, C4NUM);
const size_t CO_ALIGN = CO_SLICES * C4NUM; const size_t CO_ALIGN = CO_SLICES * C4NUM;
const size_t OH = output_tensor->Height();
const size_t OW = output_tensor->Width();
const size_t KH = param->kernel_h_; const size_t KH = param->kernel_h_;
const size_t KW = param->kernel_w_; const size_t KW = param->kernel_w_;
const size_t strideH = param->stride_h_; const size_t strideH = param->stride_h_;
@@ -373,12 +362,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
" {\n" " {\n"
" for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n"
" {\n"; " {\n";

// NHWC4 NHC4W4 NC4HW4
code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n";
// code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ih * CI_SLICES + ci_slice)); // NHC4W4: HC W\n\n";
// code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ci_slice * IH + ih)); // NC4HW4: CH W\n\n";

code += code +=
" out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n"
" out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n"
@@ -394,21 +378,18 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() {
" }\n" " }\n"
" }\n\n"; " }\n\n";
code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n";

if (param->is_relu_) { if (param->is_relu_) {
code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n";
} else if (param->is_relu6_) { } else if (param->is_relu6_) {
code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n";
} }
// NHWC4 NHC4W4 NC4HW4
if (OW * CO_SLICES < 65536) { if (OW * CO_SLICES < 65536) {
code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}";
} else { } else {
code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}";
} }
// code += " WRITE_FLT4(output, (int2)(ow, oh * CO_SLICES + co_slice), out0_c4_bias);// NHC4W4: HC W\n}";
// code += " WRITE_FLT4(output, (int2)(ow ,co_slice * OH + oh), out0_c4_bias);// NC4HW4: CH W\n}";

// std::cout << code << std::endl;
return code; return code;
} }


@@ -567,86 +548,91 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() {
} }


std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() {
return "//#define TILE_XY 256\n"
"//#define SLICES 20\n"
"//#define OH 16\n"
"//#define OW 256\n"
"\n"
"//#define __global\n"
"__constant sampler_t\n"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"\n"
"constant float At[24] = {\n"
" 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n"
" 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n"
" 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n"
" 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n"
"};\n"
"\n"
"__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
" __write_only image2d_t output,\n"
" __global float4 *bias,\n"
" int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n"
" int4 output_shape) // N H W CO_SLICES\n"
"{\n"
" int tile_xy = get_global_id(0);\n"
" int row = get_global_id(1);\n"
" int slice = get_global_id(2);\n"
"\n"
" int TILE_XY = input_shape.z;\n"
" int SLICES = input_shape.w;\n"
" int OH = output_shape.y;\n"
" int OW = output_shape.z;\n"
"\n"
" if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n"
" {\n"
" return;\n"
" }\n"
"\n"
" constant float *At_row = At + row * 6;\n"
" float4 AtM_row[6] = {0};\n"
" for (int y = 0; y < 6; y++)\n"
" {\n"
" for (int x = 0; x < 6; x++)\n"
" {\n"
" AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + "
"x));\n"
" }\n"
" }\n"
"\n"
" for (int x = 0; x < 4; x++)\n"
" {\n"
" float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
" for (int y = 0; y < 6; y++)\n"
" {\n"
" acc += AtM_row[y] * At[x * 6 + y];\n"
" }\n"
" acc += bias[slice];\n"
"\n"
" int TILE_X = OW / 4;\n"
" int tile_x = tile_xy % TILE_X * 4;\n"
" int tile_y = tile_xy / TILE_X * 4;\n"
"// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n"
" write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H "
"width=WC\n"
" }\n"
"}";
std::string code =
"//#define TILE_XY 256\n"
"//#define SLICES 20\n"
"//#define OH 16\n"
"//#define OW 256\n"
"\n"
"//#define __global\n"
"__constant sampler_t\n"
"smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n"
"\n"
"constant float At[24] = {\n"
" 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n"
" 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n"
" 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n"
" 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n"
"};\n"
"\n"
"__kernel void Winograd36To4x4(__read_only image2d_t input,\n"
" __write_only image2d_t output,\n"
" __global float4 *bias,\n"
" int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n"
" int4 output_shape) // N H W CO_SLICES\n"
"{\n"
" int tile_xy = get_global_id(0);\n"
" int row = get_global_id(1);\n"
" int slice = get_global_id(2);\n"
"\n"
" int TILE_XY = input_shape.z;\n"
" int SLICES = input_shape.w;\n"
" int OH = output_shape.y;\n"
" int OW = output_shape.z;\n"
"\n"
" if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n"
" {\n"
" return;\n"
" }\n"
"\n"
" constant float *At_row = At + row * 6;\n"
" float4 AtM_row[6] = {0};\n"
" for (int y = 0; y < 6; y++)\n"
" {\n"
" for (int x = 0; x < 6; x++)\n"
" {\n"
" AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + "
"x));\n"
" }\n"
" }\n"
"\n"
" for (int x = 0; x < 4; x++)\n"
" {\n"
" float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n"
" for (int y = 0; y < 6; y++)\n"
" {\n"
" acc += AtM_row[y] * At[x * 6 + y];\n"
" }\n"
" acc += bias[slice];\n";

auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
if (param->is_relu_) {
code += " acc = max(acc, (float4)(0.0f));\n";
} else if (param->is_relu6_) {
code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n";
}

code +=
" int TILE_X = OW / 4;\n"
" int tile_x = tile_xy % TILE_X * 4;\n"
" int tile_y = tile_xy / TILE_X * 4;\n"
"// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n"
" write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H "
"width=WC\n"
" }\n"
"}";
return code;
} }


int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) { int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std::vector<size_t> *local) {
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
auto param = reinterpret_cast<ConvParameter *>(op_parameter_);
param->output_h_ = out_tensors_[0]->Height();
param->output_w_ = out_tensors_[0]->Width();
param->output_channel_ = out_tensors_[0]->Channel();

constexpr size_t work_group_size[] = {4, 4, 1}; constexpr size_t work_group_size[] = {4, 4, 1};
auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); auto max_work_item_sizes = ocl_runtime->GetWorkItemSize();
size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())()); size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())());
const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]); const size_t max_z_size = std::min<size_t>(16, max_work_item_sizes[2]);


size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1];
size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0];
size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1];
size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2]; size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2];


size_t local_c = GetBiggestDivider(global_c, max_z_size); size_t local_c = GetBiggestDivider(global_c, max_z_size);
@@ -661,15 +647,13 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector<size_t> *global, std
local_h = global_h / 2; local_h = global_h / 2;
} }


auto output_tensor = out_tensors_[0];
const size_t OW = output_tensor->Width();
if (OW * CO_SLICES > 65536) { if (OW * CO_SLICES > 65536) {
local_w = 4; local_w = 4;
} }


global->clear(); global->clear();
global->push_back(UP_DIV(param->output_h_, local_h) * local_h);
global->push_back(UP_DIV(param->output_w_, local_w) * local_w);
global->push_back(UP_DIV(OH, local_h) * local_h);
global->push_back(UP_DIV(OW, local_w) * local_w);
global->push_back(UP_DIV(CO_SLICES, local_c) * local_c); global->push_back(UP_DIV(CO_SLICES, local_c) * local_c);
local->clear(); local->clear();
local->push_back(local_h); local->push_back(local_h);


+ 6
- 0
mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h View File

@@ -40,6 +40,12 @@ class ConvolutionOpenCLKernel : public OpenCLKernel {
int GetImageSize(size_t idx, std::vector<size_t> *img_size) override; int GetImageSize(size_t idx, std::vector<size_t> *img_size) override;


private: private:
int CI;
int IH;
int IW;
int CO;
int OH;
int OW;
int CI_SLICES; int CI_SLICES;
int CO_SLICES; int CO_SLICES;
float *packed_weight_ = nullptr; float *packed_weight_ = nullptr;


+ 64
- 63
mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc View File

@@ -41,7 +41,7 @@ void LoadData(void *dst, size_t dst_size, const std::string &file_path) {


void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) { void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) {
auto *output_data = reinterpret_cast<float *>(output_tensor->Data()); auto *output_data = reinterpret_cast<float *>(output_tensor->Data());
printf("output[0:10]:");
printf("\noutput[0:10]:");
for (int i = 0; i < 10; i++) { for (int i = 0; i < 10; i++) {
printf("%d:%.3f ", i, output_data[i]); printf("%d:%.3f ", i, output_data[i]);
} }
@@ -58,15 +58,15 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil
return; return;
} }
} }
printf("compare success!\n");
printf("compare success!\n");
printf("compare success!\n\n\n");
printf("COMPARE SUCCESS!\n\n\n");
} }


void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path, void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path,
std::string attr_str) { std::string attr_str) {
assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4);
auto param = new ConvParameter;
auto param = new (std::nothrow) ConvParameter;
if (param == nullptr) {
return;
}
sscanf(attr_str.c_str(), sscanf(attr_str.c_str(),
"inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_" "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_"
"dilationHW_%dx%d", "dilationHW_%dx%d",
@@ -79,67 +79,81 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const
auto weight_file = testcase_path + "weight_OHWI.bin"; auto weight_file = testcase_path + "weight_OHWI.bin";
auto bias_file = testcase_path + "bias_C.bin"; auto bias_file = testcase_path + "bias_C.bin";
auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin"); auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin");
std::cout << "input_file:" << input_file << std::endl;
std::cout << "weight_file:" << weight_file << std::endl;
std::cout << "bias_file:" << bias_file << std::endl;
std::cout << "expect_file:" << expect_file << std::endl;
std::cout << "input_file :" << input_file << std::endl;
std::cout << "weight_file :" << weight_file << std::endl;
std::cout << "bias_file :" << bias_file << std::endl;
std::cout << "expect_file :" << expect_file << std::endl;


std::cout << "initialize OpenCLRuntime";
std::cout << "initialize OpenCLRuntime and OpenCLAllocator";
auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance();
ocl_runtime->Init(); ocl_runtime->Init();
auto allocator = ocl_runtime->GetAllocator(); auto allocator = ocl_runtime->GetAllocator();


std::cout << "create Tensors(framework will do!!!)";
std::cout << "create Tensors";
std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_}; std::vector<int> input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_};
std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector<int> weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_};
std::vector<int> bias_shape = {param->output_channel_}; std::vector<int> bias_shape = {param->output_channel_};
std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; std::vector<int> output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_};
auto data_type = kNumberTypeFloat32; auto data_type = kNumberTypeFloat32;
auto tensor_type = schema::NodeType_ValueNode; auto tensor_type = schema::NodeType_ValueNode;
auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type);
auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type);
auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type);
auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type);
std::vector<lite::tensor::Tensor *> inputs{input_tensor, weight_tensor, bias_tensor};
std::vector<lite::tensor::Tensor *> outputs{output_tensor};

std::cout << "allocate and initialize weight/bias memory by hand here(framework will do!!!)";
std::vector<float> weight_vec(weight_tensor->ElementsNum());
std::vector<float> bias_vec(weight_tensor->ElementsNum());
weight_tensor->SetData(weight_vec.data());
bias_tensor->SetData(bias_vec.data());
LoadData(weight_tensor->Data(), weight_tensor->Size(), weight_file);
LoadData(bias_tensor->Data(), bias_tensor->Size(), bias_file);

std::cout << "create OpenCL Kernel"; // weight/bias has been allcated by framework
auto *conv_kernel = new ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
conv_kernel->Init();

std::cout << "create SubGraphOpenCLKernel";
inputs[0]->MallocData(allocator); // allocate input memory by hand here, framework will do!!!
auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, {conv_kernel}, {conv_kernel}, {conv_kernel});
auto input_tensor = lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type);
auto weight_tensor = lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type);
auto bias_tensor = lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type);
auto output_tensor = lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type);
std::vector<lite::tensor::Tensor *> inputs{&input_tensor, &weight_tensor, &bias_tensor};
std::vector<lite::tensor::Tensor *> outputs{&output_tensor};

std::cout << "allocate memory and initialize weight/bias";
weight_tensor.MallocData();
bias_tensor.MallocData();
LoadData(weight_tensor.Data(), weight_tensor.Size(), weight_file);
LoadData(bias_tensor.Data(), bias_tensor.Size(), bias_file);

std::cout << "create OpenCL Kernel";
auto kernel = ConvolutionOpenCLKernel(reinterpret_cast<OpParameter *>(param), inputs, outputs);
kernel.Init();

std::cout << "create SubGraph";
auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input_tensor}, outputs, {&kernel}, {&kernel}, {&kernel});
if (sub_graph == nullptr) {
return;
}
input_tensor.MallocData(allocator); // before MapBuffer()
sub_graph->Init(); sub_graph->Init();
LoadData(input_tensor->Data(), input_tensor->Size(), input_file); // initialize input Tensors data
printf("input[0] =%.3f\n", reinterpret_cast<float *>(input_tensor->Data())[0]);
printf("weight[0]=%.3f\n", reinterpret_cast<float *>(weight_tensor->Data())[0]);
printf("bias[0] =%.3f\n", reinterpret_cast<float *>(bias_tensor->Data())[0]);
LoadData(input_tensor.Data(), input_tensor.Size(), input_file); // after MapBuffer()
printf("input[0-2] =%.3f\n", reinterpret_cast<float *>(input_tensor.Data())[0]);
printf("weight[0-2]=%.3f\n", reinterpret_cast<float *>(weight_tensor.Data())[0]);
printf("bias[0-2] =%.3f\n", reinterpret_cast<float *>(bias_tensor.Data())[0]);
sub_graph->Run(); sub_graph->Run();

std::cout << "compare result";
MyCompareOutput(output_tensor, expect_file);
// lite::CompareOutput(reinterpret_cast<float *>(output_tensor->Data()), expect_file);

for (auto tensor : inputs) {
delete tensor;
}
for (auto tensor : outputs) {
delete tensor;
}
delete conv_kernel;
MyCompareOutput(&output_tensor, expect_file);

std::cout << "release resources";
weight_tensor.FreeData();
bias_tensor.FreeData();
input_tensor.SetData(nullptr);
output_tensor.SetData(nullptr);
weight_tensor.SetData(nullptr);
bias_tensor.SetData(nullptr);
delete param;
delete sub_graph; delete sub_graph;
lite::opencl::OpenCLRuntime::DeleteInstance(); lite::opencl::OpenCLRuntime::DeleteInstance();
} }


TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
// change W/H
TEST_MAIN(
schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
"inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_"
"1x1");
}

TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) {
// speed up
TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
"inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
"dilationHW_1x1");
}

TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
TEST_MAIN( TEST_MAIN(
schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/", schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/",
@@ -147,13 +161,6 @@ TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) {
"1x1"); "1x1");
} }


// TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) {
// TEST_MAIN(
// schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
// "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_"
// "1x1");
//}

TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) {
TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
"inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
@@ -165,12 +172,6 @@ TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNH
"dilationHW_1x1"); "dilationHW_1x1");
} }


// TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) {
// TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/",
// "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"
// "dilationHW_1x1");
//}

TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) { TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) {
TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/",
"inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"


Loading…
Cancel
Save