diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 788f6fa3fc..48dd48e5fe 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -32,16 +32,21 @@ int ConvolutionOpenCLKernel::Init() { static int init_count = 0; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator = ocl_runtime->GetAllocator(); - auto param = reinterpret_cast(op_parameter_); std::set build_options; init_count++; - CI_SLICES = UP_DIV(param->input_channel_, C4NUM); - CO_SLICES = UP_DIV(param->output_channel_, C4NUM); + CI = in_tensors_[0]->Channel(); + IH = in_tensors_[0]->Height(); + IW = in_tensors_[0]->Width(); + CO = out_tensors_[0]->Channel(); + OH = out_tensors_[0]->Height(); + OW = out_tensors_[0]->Width(); + CI_SLICES = UP_DIV(CI, C4NUM); + CO_SLICES = UP_DIV(CO, C4NUM); // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true - TILES_X = UP_DIV(param->output_w_, 4); - TILES_Y = UP_DIV(param->output_h_, 4); + TILES_X = UP_DIV(OW, 4); + TILES_Y = UP_DIV(OH, 4); TILES_XY = TILES_X * TILES_Y; use_winograd_ = UseWinograd4x4To6x6(); @@ -96,14 +101,9 @@ int ConvolutionOpenCLKernel::InitBuffer() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator = ocl_runtime->GetAllocator(); - auto weight_tensor = in_tensors_[1]; - auto weight_shape = weight_tensor->shape(); - size_t CO = weight_shape[0]; - size_t KH = weight_shape[1]; - size_t KW = weight_shape[2]; - size_t CI = weight_shape[3]; - // size_t CI_SLICES = UP_DIV(CI, C4NUM); - // size_t CO_SLICES = UP_DIV(CO, C4NUM); + auto param = reinterpret_cast(op_parameter_); + size_t KH = param->kernel_h_; + size_t KW = param->kernel_w_; constexpr size_t CI_TILE = C4NUM; constexpr size_t CO_TILE = C4NUM; size_t packed_weight_size; @@ -115,6 +115,7 @@ int ConvolutionOpenCLKernel::InitBuffer() { packed_weight_ = reinterpret_cast(allocator->Malloc(packed_weight_size)); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); memset(packed_weight_, 0x00, packed_weight_size); + auto weight_tensor = in_tensors_[1]; auto origin_weight = reinterpret_cast(weight_tensor->Data()); if (use_winograd_) { @@ -205,7 +206,6 @@ int ConvolutionOpenCLKernel::InitBuffer() { } int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { - // size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM); size_t im_dst_x, im_dst_y; if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { if (out_tensors_[0]->Width() * CO_SLICES < 65536) { @@ -236,12 +236,11 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_s int ConvolutionOpenCLKernel::Run() { MS_LOG(DEBUG) << this->name() << " Running!"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - auto param = reinterpret_cast(op_parameter_); int arg_cn = 0; if (use_winograd_) { arg_cn = 0; - cl_int4 _4x4to36_in_shape = {1, param->input_h_, param->input_w_, CI_SLICES}; + cl_int4 _4x4to36_in_shape = {1, IH, IW, CI_SLICES}; cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES}; ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_); @@ -259,13 +258,14 @@ int ConvolutionOpenCLKernel::Run() { arg_cn = 0; cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES}; - cl_int4 _36to4x4_out_shape = {1, param->output_h_, param->output_w_, CO_SLICES}; + cl_int4 _36to4x4_out_shape = {1, OH, OW, CO_SLICES}; ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape); ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape); } else { + arg_cn = 0; ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data()); ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); @@ -287,19 +287,8 @@ int ConvolutionOpenCLKernel::Run() { std::string ConvolutionOpenCLKernel::CodeGenConvolution() { auto param = reinterpret_cast(op_parameter_); - - auto input_tensor = in_tensors_[0]; - auto output_tensor = out_tensors_[0]; - const size_t CI = input_tensor->Channel(); - // const size_t CI_SLICES = UP_DIV(CI, C4NUM); const size_t CI_ALIGN = CI_SLICES * C4NUM; - const size_t IH = input_tensor->Height(); - const size_t IW = input_tensor->Width(); - const size_t CO = output_tensor->Channel(); - // const size_t CO_SLICES = UP_DIV(CO, C4NUM); const size_t CO_ALIGN = CO_SLICES * C4NUM; - const size_t OH = output_tensor->Height(); - const size_t OW = output_tensor->Width(); const size_t KH = param->kernel_h_; const size_t KW = param->kernel_w_; const size_t strideH = param->stride_h_; @@ -373,12 +362,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { " {\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " {\n"; - - // NHWC4 NHC4W4 NC4HW4 code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; - // code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ih * CI_SLICES + ci_slice)); // NHC4W4: HC W\n\n"; - // code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw, ci_slice * IH + ih)); // NC4HW4: CH W\n\n"; - code += " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" @@ -394,21 +378,18 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { " }\n" " }\n\n"; code += " FLT4 out0_c4_bias = out0_c4 + bias[co_slice];\n"; + if (param->is_relu_) { code += " out0_c4_bias = max(out0_c4_bias, (FLT4)(0.0f));\n"; } else if (param->is_relu6_) { code += " out0_c4_bias = clamp(out0_c4_bias, (FLT4)(0.0f), (FLT4)(6.0f));\n"; } - // NHWC4 NHC4W4 NC4HW4 + if (OW * CO_SLICES < 65536) { code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; } else { code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; } - // code += " WRITE_FLT4(output, (int2)(ow, oh * CO_SLICES + co_slice), out0_c4_bias);// NHC4W4: HC W\n}"; - // code += " WRITE_FLT4(output, (int2)(ow ,co_slice * OH + oh), out0_c4_bias);// NC4HW4: CH W\n}"; - - // std::cout << code << std::endl; return code; } @@ -567,86 +548,91 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { } std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { - return "//#define TILE_XY 256\n" - "//#define SLICES 20\n" - "//#define OH 16\n" - "//#define OW 256\n" - "\n" - "//#define __global\n" - "__constant sampler_t\n" - "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" - "\n" - "constant float At[24] = {\n" - " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" - " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" - " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" - " 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n" - "};\n" - "\n" - "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" - " __write_only image2d_t output,\n" - " __global float4 *bias,\n" - " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" - " int4 output_shape) // N H W CO_SLICES\n" - "{\n" - " int tile_xy = get_global_id(0);\n" - " int row = get_global_id(1);\n" - " int slice = get_global_id(2);\n" - "\n" - " int TILE_XY = input_shape.z;\n" - " int SLICES = input_shape.w;\n" - " int OH = output_shape.y;\n" - " int OW = output_shape.z;\n" - "\n" - " if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n" - " {\n" - " return;\n" - " }\n" - "\n" - " constant float *At_row = At + row * 6;\n" - " float4 AtM_row[6] = {0};\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " for (int x = 0; x < 6; x++)\n" - " {\n" - " AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + " - "x));\n" - " }\n" - " }\n" - "\n" - " for (int x = 0; x < 4; x++)\n" - " {\n" - " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" - " for (int y = 0; y < 6; y++)\n" - " {\n" - " acc += AtM_row[y] * At[x * 6 + y];\n" - " }\n" - " acc += bias[slice];\n" - "\n" - " int TILE_X = OW / 4;\n" - " int tile_x = tile_xy % TILE_X * 4;\n" - " int tile_y = tile_xy / TILE_X * 4;\n" - "// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n" - " write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H " - "width=WC\n" - " }\n" - "}"; + std::string code = + "//#define TILE_XY 256\n" + "//#define SLICES 20\n" + "//#define OH 16\n" + "//#define OW 256\n" + "\n" + "//#define __global\n" + "__constant sampler_t\n" + "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" + "\n" + "constant float At[24] = {\n" + " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" + " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" + " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" + " 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n" + "};\n" + "\n" + "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " __global float4 *bias,\n" + " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" + " int4 output_shape) // N H W CO_SLICES\n" + "{\n" + " int tile_xy = get_global_id(0);\n" + " int row = get_global_id(1);\n" + " int slice = get_global_id(2);\n" + "\n" + " int TILE_XY = input_shape.z;\n" + " int SLICES = input_shape.w;\n" + " int OH = output_shape.y;\n" + " int OW = output_shape.z;\n" + "\n" + " if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n" + " {\n" + " return;\n" + " }\n" + "\n" + " constant float *At_row = At + row * 6;\n" + " float4 AtM_row[6] = {0};\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " for (int x = 0; x < 6; x++)\n" + " {\n" + " AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + " + "x));\n" + " }\n" + " }\n" + "\n" + " for (int x = 0; x < 4; x++)\n" + " {\n" + " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " acc += AtM_row[y] * At[x * 6 + y];\n" + " }\n" + " acc += bias[slice];\n"; + + auto param = reinterpret_cast(op_parameter_); + if (param->is_relu_) { + code += " acc = max(acc, (float4)(0.0f));\n"; + } else if (param->is_relu6_) { + code += " acc = clamp(acc, (float4)(0.0f), (float4)(6.0f));\n"; + } + + code += + " int TILE_X = OW / 4;\n" + " int tile_x = tile_xy % TILE_X * 4;\n" + " int tile_y = tile_xy / TILE_X * 4;\n" + "// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n" + " write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H " + "width=WC\n" + " }\n" + "}"; + return code; } int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std::vector *local) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - auto param = reinterpret_cast(op_parameter_); - param->output_h_ = out_tensors_[0]->Height(); - param->output_w_ = out_tensors_[0]->Width(); - param->output_channel_ = out_tensors_[0]->Channel(); - constexpr size_t work_group_size[] = {4, 4, 1}; auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())()); const size_t max_z_size = std::min(16, max_work_item_sizes[2]); - size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0]; - size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1]; + size_t global_h = UP_DIV(OH, work_group_size[0]) * work_group_size[0]; + size_t global_w = UP_DIV(OW, work_group_size[1]) * work_group_size[1]; size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2]; size_t local_c = GetBiggestDivider(global_c, max_z_size); @@ -661,15 +647,13 @@ int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std local_h = global_h / 2; } - auto output_tensor = out_tensors_[0]; - const size_t OW = output_tensor->Width(); if (OW * CO_SLICES > 65536) { local_w = 4; } global->clear(); - global->push_back(UP_DIV(param->output_h_, local_h) * local_h); - global->push_back(UP_DIV(param->output_w_, local_w) * local_w); + global->push_back(UP_DIV(OH, local_h) * local_h); + global->push_back(UP_DIV(OW, local_w) * local_w); global->push_back(UP_DIV(CO_SLICES, local_c) * local_c); local->clear(); local->push_back(local_h); diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index 8c651740ef..373b703f52 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -40,6 +40,12 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int GetImageSize(size_t idx, std::vector *img_size) override; private: + int CI; + int IH; + int IW; + int CO; + int OH; + int OW; int CI_SLICES; int CO_SLICES; float *packed_weight_ = nullptr; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc index f269d59b54..7a2307036d 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc @@ -41,7 +41,7 @@ void LoadData(void *dst, size_t dst_size, const std::string &file_path) { void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) { auto *output_data = reinterpret_cast(output_tensor->Data()); - printf("output[0:10]:"); + printf("\noutput[0:10]:"); for (int i = 0; i < 10; i++) { printf("%d:%.3f ", i, output_data[i]); } @@ -58,15 +58,15 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil return; } } - printf("compare success!\n"); - printf("compare success!\n"); - printf("compare success!\n\n\n"); + printf("COMPARE SUCCESS!\n\n\n"); } void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path, std::string attr_str) { - assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4); - auto param = new ConvParameter; + auto param = new (std::nothrow) ConvParameter; + if (param == nullptr) { + return; + } sscanf(attr_str.c_str(), "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_" "dilationHW_%dx%d", @@ -79,67 +79,81 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const auto weight_file = testcase_path + "weight_OHWI.bin"; auto bias_file = testcase_path + "bias_C.bin"; auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin"); - std::cout << "input_file:" << input_file << std::endl; - std::cout << "weight_file:" << weight_file << std::endl; - std::cout << "bias_file:" << bias_file << std::endl; - std::cout << "expect_file:" << expect_file << std::endl; + std::cout << "input_file :" << input_file << std::endl; + std::cout << "weight_file :" << weight_file << std::endl; + std::cout << "bias_file :" << bias_file << std::endl; + std::cout << "expect_file :" << expect_file << std::endl; - std::cout << "initialize OpenCLRuntime"; + std::cout << "initialize OpenCLRuntime and OpenCLAllocator"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); auto allocator = ocl_runtime->GetAllocator(); - std::cout << "create Tensors(framework will do!!!)"; + std::cout << "create Tensors"; std::vector input_shape = {param->input_batch_, param->input_h_, param->input_w_, param->input_channel_}; std::vector weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector bias_shape = {param->output_channel_}; std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; - auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type); - auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type); - auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type); - auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type); - std::vector inputs{input_tensor, weight_tensor, bias_tensor}; - std::vector outputs{output_tensor}; - - std::cout << "allocate and initialize weight/bias memory by hand here(framework will do!!!)"; - std::vector weight_vec(weight_tensor->ElementsNum()); - std::vector bias_vec(weight_tensor->ElementsNum()); - weight_tensor->SetData(weight_vec.data()); - bias_tensor->SetData(bias_vec.data()); - LoadData(weight_tensor->Data(), weight_tensor->Size(), weight_file); - LoadData(bias_tensor->Data(), bias_tensor->Size(), bias_file); - - std::cout << "create OpenCL Kernel"; // weight/bias has been allcated by framework - auto *conv_kernel = new ConvolutionOpenCLKernel(reinterpret_cast(param), inputs, outputs); - conv_kernel->Init(); - - std::cout << "create SubGraphOpenCLKernel"; - inputs[0]->MallocData(allocator); // allocate input memory by hand here, framework will do!!! - auto *sub_graph = new SubGraphOpenCLKernel({input_tensor}, outputs, {conv_kernel}, {conv_kernel}, {conv_kernel}); + auto input_tensor = lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type); + auto weight_tensor = lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type); + auto bias_tensor = lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensor_type); + auto output_tensor = lite::tensor::Tensor(data_type, output_shape, output_format, tensor_type); + std::vector inputs{&input_tensor, &weight_tensor, &bias_tensor}; + std::vector outputs{&output_tensor}; + + std::cout << "allocate memory and initialize weight/bias"; + weight_tensor.MallocData(); + bias_tensor.MallocData(); + LoadData(weight_tensor.Data(), weight_tensor.Size(), weight_file); + LoadData(bias_tensor.Data(), bias_tensor.Size(), bias_file); + + std::cout << "create OpenCL Kernel"; + auto kernel = ConvolutionOpenCLKernel(reinterpret_cast(param), inputs, outputs); + kernel.Init(); + + std::cout << "create SubGraph"; + auto sub_graph = new (std::nothrow) SubGraphOpenCLKernel({&input_tensor}, outputs, {&kernel}, {&kernel}, {&kernel}); + if (sub_graph == nullptr) { + return; + } + input_tensor.MallocData(allocator); // before MapBuffer() sub_graph->Init(); - LoadData(input_tensor->Data(), input_tensor->Size(), input_file); // initialize input Tensors data - printf("input[0] =%.3f\n", reinterpret_cast(input_tensor->Data())[0]); - printf("weight[0]=%.3f\n", reinterpret_cast(weight_tensor->Data())[0]); - printf("bias[0] =%.3f\n", reinterpret_cast(bias_tensor->Data())[0]); + LoadData(input_tensor.Data(), input_tensor.Size(), input_file); // after MapBuffer() + printf("input[0-2] =%.3f\n", reinterpret_cast(input_tensor.Data())[0]); + printf("weight[0-2]=%.3f\n", reinterpret_cast(weight_tensor.Data())[0]); + printf("bias[0-2] =%.3f\n", reinterpret_cast(bias_tensor.Data())[0]); sub_graph->Run(); - - std::cout << "compare result"; - MyCompareOutput(output_tensor, expect_file); - // lite::CompareOutput(reinterpret_cast(output_tensor->Data()), expect_file); - - for (auto tensor : inputs) { - delete tensor; - } - for (auto tensor : outputs) { - delete tensor; - } - delete conv_kernel; + MyCompareOutput(&output_tensor, expect_file); + + std::cout << "release resources"; + weight_tensor.FreeData(); + bias_tensor.FreeData(); + input_tensor.SetData(nullptr); + output_tensor.SetData(nullptr); + weight_tensor.SetData(nullptr); + bias_tensor.SetData(nullptr); + delete param; delete sub_graph; lite::opencl::OpenCLRuntime::DeleteInstance(); } +TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { + // change W/H + TEST_MAIN( + schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", + "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_" + "1x1"); +} + +TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) { + // speed up + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", + "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" + "dilationHW_1x1"); +} + TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { TEST_MAIN( schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/", @@ -147,13 +161,6 @@ TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { "1x1"); } -// TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { -// TEST_MAIN( -// schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", -// "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_" -// "1x1"); -//} - TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" @@ -165,12 +172,6 @@ TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNH "dilationHW_1x1"); } -// TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) { -// TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", -// "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" -// "dilationHW_1x1"); -//} - TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) { TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_"