From 925c24653be9e7762f513e04c94bec53135fa5ca Mon Sep 17 00:00:00 2001 From: wangdongxu Date: Tue, 25 Aug 2020 16:27:58 +0800 Subject: [PATCH] opencl convolution support fp16 --- .../kernel/opencl/kernel/convolution.cc | 166 ++++++++---------- .../kernel/opencl/kernel/convolution.h | 6 +- .../kernel/opencl/convolution_tests.cc | 112 +++++------- 3 files changed, 124 insertions(+), 160 deletions(-) diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index 29ef635c42..9557f45c27 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -17,9 +17,11 @@ #include #include #include +#include "src/common/utils.h" #include "src/runtime/kernel/opencl/kernel/convolution.h" #include "src/kernel_registry.h" #include "include/errorcode.h" +#include "nnacl/fp32/common_func.h" using mindspore::kernel::KERNEL_ARCH::kGPU; using mindspore::lite::KernelRegistrar; @@ -31,6 +33,7 @@ namespace mindspore::kernel { int ConvolutionOpenCLKernel::Init() { static int init_count = 0; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + use_fp16_ = ocl_runtime->GetFp16Enable(); auto allocator = ocl_runtime->GetAllocator(); std::set build_options; init_count++; @@ -73,19 +76,15 @@ int ConvolutionOpenCLKernel::Init() { // allocate winograd memory if (use_winograd_) { -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; - size_t sizeof_datatype = 2; -#else - size_t img_dtype = CL_FLOAT; - size_t sizeof_datatype = 4; -#endif - size_t size = TILES_XY * CI_SLICES * 36 * sizeof_datatype; + size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; + size_t sizeof_FLT = use_fp16_ ? 2 : 4; + + size_t size = TILES_XY * CI_SLICES * 36 * sizeof_FLT; size_t width = TILES_XY; size_t height = CI_SLICES * 36; winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); - size = TILES_XY * CO_SLICES * 36 * sizeof_datatype; + size = TILES_XY * CO_SLICES * 36 * sizeof_FLT; width = TILES_XY; height = CO_SLICES * 36; winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); @@ -103,6 +102,7 @@ int ConvolutionOpenCLKernel::Init() { int ConvolutionOpenCLKernel::InitBuffer() { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto allocator = ocl_runtime->GetAllocator(); + size_t sizeof_FLT = use_fp16_ ? 2 : 4; auto param = reinterpret_cast(op_parameter_); size_t KH = param->kernel_h_; @@ -111,15 +111,18 @@ int ConvolutionOpenCLKernel::InitBuffer() { constexpr size_t CO_TILE = C4NUM; size_t packed_weight_size; if (use_winograd_) { - packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof(float); + packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof_FLT; } else { - packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float); + packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof_FLT; } - packed_weight_ = reinterpret_cast(allocator->Malloc(packed_weight_size)); + packed_weight_ = allocator->Malloc(packed_weight_size); + auto packed_weight_fp32 = reinterpret_cast(packed_weight_); + auto packed_weight_fp16 = reinterpret_cast(packed_weight_); allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); memset(packed_weight_, 0x00, packed_weight_size); auto weight_tensor = in_tensors_[1]; - auto origin_weight = reinterpret_cast(weight_tensor->Data()); + auto origin_weight_fp32 = reinterpret_cast(weight_tensor->Data()); + auto origin_weight_fp16 = reinterpret_cast(weight_tensor->Data()); if (use_winograd_) { // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4 @@ -141,7 +144,11 @@ int ConvolutionOpenCLKernel::InitBuffer() { for (int kh = 0; kh < 3; ++kh) { for (int kw = 0; kw < 3; ++kw) { const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci; - in_vals[kh * 3 + kw] = origin_weight[f_index]; + if (use_fp16_) { + in_vals[kh * 3 + kw] = ShortToFloat32(origin_weight_fp16[f_index]); + } else { + in_vals[kh * 3 + kw] = origin_weight_fp32[f_index]; + } } } @@ -169,14 +176,18 @@ int ConvolutionOpenCLKernel::InitBuffer() { (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) * CO_TILE + co_inner; - packed_weight_[dst_idx] = encoded_weight[src_idx++]; + if (use_fp16_) { + packed_weight_fp16[dst_idx] = Float32ToShort(encoded_weight[src_idx++]); + } else { + packed_weight_fp32[dst_idx] = encoded_weight[src_idx++]; + } } } } } } else { // weight: OHWI -> O/4 H W I/4 I4 O4 - for (int co = 0; co < CO; ++co) { + for (int co = 0, src_idx = 0; co < CO; ++co) { for (int kh = 0; kh < KH; ++kh) { for (int kw = 0; kw < KW; ++kw) { for (int ci = 0; ci < CI; ++ci) { @@ -184,8 +195,13 @@ int ConvolutionOpenCLKernel::InitBuffer() { auto co_inner = co % CO_TILE; auto ci_outer = ci / CI_TILE; auto ci_inner = ci % CI_TILE; - packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + - co_inner] = *(origin_weight++); + size_t dst_idx = + ((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + co_inner; + if (use_fp16_) { + packed_weight_fp16[dst_idx] = origin_weight_fp16[src_idx++]; + } else { + packed_weight_fp32[dst_idx] = origin_weight_fp32[src_idx++]; + } } } } @@ -195,14 +211,11 @@ int ConvolutionOpenCLKernel::InitBuffer() { // align bias from C to C4 auto bias_tensor = in_tensors_[2]; - size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float); - packed_bias_ = reinterpret_cast(allocator->Malloc(packed_bias_size)); - packed_bias_ = reinterpret_cast(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); + size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof_FLT; + packed_bias_ = allocator->Malloc(packed_bias_size); + allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true); memset(packed_bias_, 0x00, packed_bias_size); - auto bias_data = reinterpret_cast(bias_tensor->Data()); - for (int co = 0; co < CO; ++co) { - packed_bias_[co] = bias_data[co]; - } + memcpy(packed_bias_, bias_tensor->Data(), CO * sizeof_FLT); allocator->UnmapBuffer(packed_bias_); return RET_OK; @@ -224,11 +237,7 @@ int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_s im_dst_y = out_tensors_[0]->Height() * CO_SLICES; im_dst_x = out_tensors_[0]->Width(); } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else - size_t img_dtype = CL_FLOAT; -#endif + size_t img_dtype = use_fp16_ ? CL_HALF_FLOAT : CL_FLOAT; img_size->clear(); img_size->push_back(im_dst_x); img_size->push_back(im_dst_y); @@ -321,18 +330,9 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { code += "#define CI_SLICES " + std::to_string(CI_SLICES) + "\n"; code += "#define CO_SLICES " + std::to_string(CO_SLICES) + "\n\n"; -#ifdef ENABLE_FP16 - code += - "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" - "#define FLT4 half4\n" - "#define READ_FLT4 read_imageh\n" - "#define WRITE_FLT4 write_imageh\n\n"; -#else - code += - "#define FLT4 float4\n" - "#define READ_FLT4 read_imagef\n" - "#define WRITE_FLT4 write_imagef\n\n"; -#endif + if (use_fp16_) { + code += "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n"; + } code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n"; @@ -365,7 +365,7 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { " {\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " {\n"; - code += "FLT4 in_c4 = READ_FLT4(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; + code += "FLT4 in_c4 = READ_IMAGE(input, smp_zero, (int2)(iw * CI_SLICES + ci_slice, ih)); // NHWC4: H WC\n\n"; code += " out0_c4 += w0_ic1_oc4[0] * in_c4.x;\n" " out0_c4 += w0_ic1_oc4[1] * in_c4.y;\n" @@ -389,21 +389,22 @@ std::string ConvolutionOpenCLKernel::CodeGenConvolution() { } if (OW * CO_SLICES < 65536) { - code += " WRITE_FLT4(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; + code += " WRITE_IMAGE(output, (int2)(ow * CO_SLICES + co_slice, oh), out0_c4_bias);// NHWC4: H WC\n}"; } else { - code += " WRITE_FLT4(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; + code += " WRITE_IMAGE(output, (int2)(oh * CO_SLICES + co_slice, ow), out0_c4_bias);// NHWC4: H WC\n}"; } return code; } std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { - return "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" + return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" "#define PAD 1\n" "\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" - "constant float Bt[36] = {\n" + "constant FLT Bt[36] = {\n" " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" @@ -433,52 +434,40 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { " int tile_x = tile_xy % TILE_X;\n" " int tile_y = tile_xy / TILE_X;\n" "\n" - " constant float *Bt_row = Bt + row * 6;\n" - " float4 BtD_row[6] = {0};\n" + " constant FLT *Bt_row = Bt + row * 6;\n" + " FLT4 BtD_row[6] = {0};\n" " for (int y = 0; y < 6; y++)\n" " {\n" " int y_idx = tile_y * 4 - PAD + y;\n" " for (int x = 0; x < 6; x++)\n" " {\n" " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n" - " BtD_row[x] += Bt_row[y] * read_imagef(input, smp_none, (int2)(x_idx, y_idx));\n" + " BtD_row[x] += Bt_row[y] * READ_IMAGE(input, smp_none, (int2)(x_idx, y_idx));\n" " }\n" " }\n" "\n" " for (int y = 0; y < 6; y++)\n" " {\n" - " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" " for (int x = 0; x < 6; x++)\n" " {\n" " acc += BtD_row[x] * Bt[y * 6 + x];\n" " }\n" - "// write_imagef(output, (int2)((row * 6 + y) * SLICES + slice, tile_xy), acc); // H WC W=36\n" - " write_imagef(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" + " WRITE_IMAGE(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" " }\n" "}"; } std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { - return "#define CI_TILE 4\n" + return "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" + "#define CI_TILE 4\n" "#define H 36\n" - "//#define W 256\n" - "//#define CI 96\n" - "//#define CO 80s\n" - "//#define CI_SLICES 24\n" - "//#define CO_SLICES 20\n" - "\n" - "#define FLT4 float4\n" - "#define READ_FLT4 read_imagef\n" - "#define WRITE_FLT4 write_imagef\n" - "\n" - "//#define __global\n" - "\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" "__kernel void WinogradConvolution(__read_only image2d_t input,\n" " __write_only image2d_t output,\n" - " __global float16 *weight,\n" + " __global FLT16 *weight,\n" " int4 input_shape, // N 36 H/4*W/4 CI_SLICES\n" " int4 output_shape) // N 36 H/4*W/4 CO_SLICES\n" "{\n" @@ -501,14 +490,14 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { " FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" "\n" " int y_idx = h;\n" - " __global float16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n" + " __global FLT16 *weight_ptr = weight + (co_slice / 2 * 36 + h) * CI_SLICES * 2;\n" " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" " {\n" - " FLT4 in0 = READ_FLT4(input, smp_none, (int2)(w + 0, y_idx));\n" - " FLT4 in1 = READ_FLT4(input, smp_none, (int2)(w + 1, y_idx));\n" + " FLT4 in0 = READ_IMAGE(input, smp_none, (int2)(w + 0, y_idx));\n" + " FLT4 in1 = READ_IMAGE(input, smp_none, (int2)(w + 1, y_idx));\n" " y_idx += 36;\n" "\n" - " float16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n" + " FLT16 weight0 = weight_ptr[0], weight1 = weight_ptr[1];\n" " weight_ptr += 2;\n" "\n" "\n" @@ -533,18 +522,18 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { " out11 += in1.w * weight1.scdef;\n" " }\n" "\n" - " WRITE_FLT4(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n" + " WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 0) * H + h), out00);\n" " if (w + 1 < W)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n" + " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 0) * H + h), out01);\n" " }\n" "\n" " if (co_slice + 1 < CO_SLICES)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n" + " WRITE_IMAGE(output, (int2)(w + 0, (co_slice + 1) * H + h), out10);\n" " if (w + 1 < W)\n" " {\n" - " WRITE_FLT4(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n" + " WRITE_IMAGE(output, (int2)(w + 1, (co_slice + 1) * H + h), out11);\n" " }\n" " }\n" "}"; @@ -552,16 +541,11 @@ std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { std::string code = - "//#define TILE_XY 256\n" - "//#define SLICES 20\n" - "//#define OH 16\n" - "//#define OW 256\n" - "\n" - "//#define __global\n" + "#pragma OPENCL EXTENSION cl_khr_fp16 : enable\n" "__constant sampler_t\n" "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" "\n" - "constant float At[24] = {\n" + "constant FLT At[24] = {\n" " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" @@ -570,7 +554,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { "\n" "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" " __write_only image2d_t output,\n" - " __global float4 *bias,\n" + " __global FLT4 *bias,\n" " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" " int4 output_shape) // N H W CO_SLICES\n" "{\n" @@ -588,25 +572,25 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { " return;\n" " }\n" "\n" - " constant float *At_row = At + row * 6;\n" - " float4 AtM_row[6] = {0};\n" + " constant FLT *At_row = At + row * 6;\n" + " FLT4 AtM_row[6] = {0};\n" " for (int y = 0; y < 6; y++)\n" " {\n" " for (int x = 0; x < 6; x++)\n" " {\n" - " AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + " - "x));\n" + " AtM_row[x] += At_row[y] * READ_IMAGE(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + x));\n" " }\n" " }\n" "\n" " for (int x = 0; x < 4; x++)\n" " {\n" - " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 acc = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" " for (int y = 0; y < 6; y++)\n" " {\n" " acc += AtM_row[y] * At[x * 6 + y];\n" " }\n" - " acc += bias[slice];\n"; + " acc += bias[slice];\n" + "\n"; auto param = reinterpret_cast(op_parameter_); if (param->is_relu_) { @@ -619,9 +603,7 @@ std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { " int TILE_X = OW / 4;\n" " int tile_x = tile_xy % TILE_X * 4;\n" " int tile_y = tile_xy / TILE_X * 4;\n" - "// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n" - " write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H " - "width=WC\n" + " WRITE_IMAGE(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H width=WC\n" " }\n" "}"; return code; diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index 4ad51a15a4..851fd09a13 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -40,6 +40,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int GetImageSize(size_t idx, std::vector *img_size) override; private: + bool use_fp16_ = false; + int CI; int IH; int IW; @@ -48,8 +50,8 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int OW; int CI_SLICES; int CO_SLICES; - float *packed_weight_ = nullptr; - float *packed_bias_ = nullptr; + void *packed_weight_ = nullptr; + void *packed_bias_ = nullptr; bool use_winograd_ = false; int TILES_X; diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc index ecf7310701..457021b8a7 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc @@ -18,9 +18,10 @@ #include "common/common_test.h" #include "mindspore/lite/src/common/file_utils.h" #include "mindspore/lite/src/runtime/opencl/opencl_runtime.h" -#include "nnacl/pack.h" #include "mindspore/lite/src/runtime/kernel/opencl/subgraph_opencl_kernel.h" #include "mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h" +#include "nnacl/pack.h" +#include "nnacl/fp32/common_func.h" using mindspore::kernel::ConvolutionOpenCLKernel; using mindspore::kernel::LiteKernel; @@ -34,22 +35,37 @@ void LoadData(void *dst, size_t dst_size, const std::string &file_path) { if (file_path.empty()) { memset(dst, 0x00, dst_size); } else { - auto src_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &dst_size)); + auto src_data = mindspore::lite::ReadFile(file_path.c_str(), &dst_size); memcpy(dst, src_data, dst_size); } } -void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path) { - auto *output_data = reinterpret_cast(output_tensor->Data()); +void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &file_path, const TypeId data_type, + const float atol) { + size_t output_size = output_tensor->Size(); + auto output_data_ori = output_tensor->Data(); + auto expect_data_ori = mindspore::lite::ReadFile(file_path.c_str(), &output_size); + std::vector output_data_vec(output_tensor->ElementsC4Num()); + std::vector expect_data_vec(output_tensor->ElementsC4Num()); + float *output_data, *expect_data; + if (data_type == kNumberTypeFloat16) { + for (int i = 0; i < output_data_vec.size(); ++i) { + output_data_vec[i] = ShortToFloat32(reinterpret_cast(output_data_ori)[i]); + expect_data_vec[i] = ShortToFloat32(reinterpret_cast(expect_data_ori)[i]); + } + output_data = output_data_vec.data(); + expect_data = expect_data_vec.data(); + } else { + output_data = reinterpret_cast(output_data_ori); + expect_data = reinterpret_cast(expect_data_ori); + } + printf("\noutput[0:10]:"); for (int i = 0; i < 10; i++) { printf("%d:%.3f ", i, output_data[i]); } printf("\n"); - size_t output_size = output_tensor->Size(); - auto expect_data = reinterpret_cast(mindspore::lite::ReadFile(file_path.c_str(), &output_size)); - constexpr float atol = 0.5; for (int i = 0; i < output_tensor->ElementsNum(); ++i) { if (std::fabs(output_data[i] - expect_data[i]) > atol) { printf("error at idx[%d] expect=%.3f output=%.3f\n", i, expect_data[i], output_data[i]); @@ -61,8 +77,8 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil printf("COMPARE SUCCESS!\n\n\n"); } -void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path, - std::string attr_str) { +void TEST_MAIN(schema::Format input_format, schema::Format output_format, const TypeId data_type, + const std::string &data_path, std::string attr_str) { auto param = new (std::nothrow) ConvParameter; if (param == nullptr) { return; @@ -87,6 +103,7 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::cout << "initialize OpenCLRuntime and OpenCLAllocator"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); ocl_runtime->Init(); + ocl_runtime->SetFp16Enable(data_type == kNumberTypeFloat16); auto allocator = ocl_runtime->GetAllocator(); std::cout << "create Tensors"; @@ -94,7 +111,6 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::vector weight_shape = {param->output_channel_, param->kernel_h_, param->kernel_w_, param->input_channel_}; std::vector bias_shape = {param->output_channel_}; std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; - auto data_type = kNumberTypeFloat32; auto tensor_type = schema::NodeType_ValueNode; auto input_tensor = lite::tensor::Tensor(data_type, input_shape, input_format, tensor_type); auto weight_tensor = lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensor_type); @@ -121,11 +137,17 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const input_tensor.MallocData(allocator); // before MapBuffer() sub_graph->Init(); LoadData(input_tensor.Data(), input_tensor.Size(), input_file); // after MapBuffer() - printf("input[0-2] =%.3f\n", reinterpret_cast(input_tensor.Data())[0]); - printf("weight[0-2]=%.3f\n", reinterpret_cast(weight_tensor.Data())[0]); - printf("bias[0-2] =%.3f\n", reinterpret_cast(bias_tensor.Data())[0]); + if (data_type == kNumberTypeFloat16) { + printf("input[0] =%.3f\n", ShortToFloat32(reinterpret_cast(input_tensor.Data())[0])); + printf("weight[0]=%.3f\n", ShortToFloat32(reinterpret_cast(weight_tensor.Data())[0])); + printf("bias[0] =%.3f\n", ShortToFloat32(reinterpret_cast(bias_tensor.Data())[0])); + } else { + printf("input[0] =%.3f\n", reinterpret_cast(input_tensor.Data())[0]); + printf("weight[0]=%.3f\n", reinterpret_cast(weight_tensor.Data())[0]); + printf("bias[0] =%.3f\n", reinterpret_cast(bias_tensor.Data())[0]); + } sub_graph->Run(); - MyCompareOutput(&output_tensor, expect_file); + MyCompareOutput(&output_tensor, expect_file, data_type, (data_type == kNumberTypeFloat16 ? 0.7f : 0.1f)); std::cout << "release resources"; weight_tensor.FreeData(); @@ -139,72 +161,30 @@ void TEST_MAIN(schema::Format input_format, schema::Format output_format, const lite::opencl::OpenCLRuntime::DeleteInstance(); } -TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { - // change W/H +TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp32) { TEST_MAIN( - schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_" + schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/mobilenetv2_fp32/", + "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" "1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x1_outputNHWC_1x32x512x50) { - // speed up - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x32x512x1_outputNHWC_1x32x512x50_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { +TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101_fp16) { TEST_MAIN( - schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/", + schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/mobilenetv2_fp16/", "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" "1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", +TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp32) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat32, "testcases/test_fp32/", "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "dilationHW_1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x32x512x50_outputNHWC_1x32x512x48) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x32x512x50_outputNHWC_1x32x512x48_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x250) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x8x128x100_outputNHWC_1x8x128x250_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x8x128x100_outputNHWC_1x8x128x300) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x8x128x100_outputNHWC_1x8x128x300_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} - -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x350) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x4x64x150_outputNHWC_1x4x64x350_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" - "dilationHW_1x1"); -} -TEST_F(TestConvolutionOpenCL, winograd_02_other_inputNHWC_1x4x64x150_outputNHWC_1x4x64x400) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", - "inputNHWC_1x4x64x150_outputNHWC_1x4x64x400_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" +TEST_F(TestConvolutionOpenCL, winograd_02_origin_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_fp16) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, kNumberTypeFloat16, "testcases/test_fp16/", + "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" "dilationHW_1x1"); } -TEST_F(TestConvolutionOpenCL, winograd_08_origin_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) { - TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", - "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_" - "1x1x1x1_dilationHW_1x1"); -} - } // namespace mindspore