diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc index f881b7c6f7..56e9b0330f 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.cc @@ -29,36 +29,275 @@ using mindspore::schema::PrimitiveType_Conv2D; namespace mindspore::kernel { int ConvolutionOpenCLKernel::Init() { - static int count = 0; - std::set build_options; - std::string source = CodeGen(); - std::string program_name = "convolution" + std::to_string(count); - count++; - std::string kernel_name = "convolution"; + static int init_count = 0; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto allocator = ocl_runtime->GetAllocator(); + auto param = reinterpret_cast(op_parameter_); + std::set build_options; + init_count++; + + CI_SLICES = UP_DIV(param->input_channel_, C4NUM); + CO_SLICES = UP_DIV(param->output_channel_, C4NUM); + + // note: TILES_X TILES_Y TILES_XY is only used when use_winograd_=true + TILES_X = UP_DIV(param->output_w_, 4); + TILES_Y = UP_DIV(param->output_h_, 4); + TILES_XY = TILES_X * TILES_Y; + use_winograd_ = UseWinograd4x4To6x6(); + + // build kernel + if (use_winograd_) { + MS_LOG(DEBUG) << "use winograd"; + std::string program_name; + program_name = "Winograd4x4To36" + std::to_string(init_count); + ocl_runtime->LoadSource(program_name, CodeGenWinograd4x4To36()); + ocl_runtime->BuildKernel(kernel_4x4to36, program_name, "Winograd4x4To36", build_options); + + program_name = "WinogradConvolution" + std::to_string(init_count); + ocl_runtime->LoadSource(program_name, CodeGenWinogradConvolution()); + ocl_runtime->BuildKernel(kernel_conv, program_name, "WinogradConvolution", build_options); + + program_name = "Winograd36To4x4" + std::to_string(init_count); + ocl_runtime->LoadSource(program_name, CodeGenWinograd36To4x4()); + ocl_runtime->BuildKernel(kernel_36to4x4, program_name, "Winograd36To4x4", build_options); + } else { + std::string program_name = "convolution" + std::to_string(init_count); + ocl_runtime->LoadSource(program_name, CodeGenConvolution()); + ocl_runtime->BuildKernel(kernel_conv, program_name, "Convolution", build_options); + } + + // allocate winograd memory + if (use_winograd_) { +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; + size_t sizeof_datatype = 2; +#else + size_t img_dtype = CL_FLOAT; + size_t sizeof_datatype = 4; +#endif + size_t size = TILES_XY * CI_SLICES * 36 * sizeof_datatype; + size_t width = TILES_XY; + size_t height = CI_SLICES * 36; + winograd_mem0_ = allocator->Malloc(size, {width, height, img_dtype}); + + size = TILES_XY * CO_SLICES * 36 * sizeof_datatype; + width = TILES_XY; + height = CO_SLICES * 36; + winograd_mem1_ = allocator->Malloc(size, {width, height, img_dtype}); + } - ocl_runtime->LoadSource(program_name, source); - ocl_runtime->BuildKernel(kernel_, program_name, kernel_name, build_options); this->InitBuffer(); ori_format_ = out_tensors_[0]->GetFormat(); out_tensors_[0]->SetFormat(schema::Format_NHWC4); - MS_LOG(DEBUG) << kernel_name << " Init Done!"; return RET_OK; } -std::string ConvolutionOpenCLKernel::CodeGen() { +int ConvolutionOpenCLKernel::InitBuffer() { + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto allocator = ocl_runtime->GetAllocator(); + + auto weight_tensor = in_tensors_[1]; + auto weight_shape = weight_tensor->shape(); + size_t CO = weight_shape[0]; + size_t KH = weight_shape[1]; + size_t KW = weight_shape[2]; + size_t CI = weight_shape[3]; + // size_t CI_SLICES = UP_DIV(CI, C4NUM); + // size_t CO_SLICES = UP_DIV(CO, C4NUM); + constexpr size_t CI_TILE = C4NUM; + constexpr size_t CO_TILE = C4NUM; + size_t packed_weight_size; + if (use_winograd_) { + packed_weight_size = UP_DIV(CO, 8) * 6 * 6 * CI_SLICES * 2 * CI_TILE * CO_TILE * sizeof(float); + } else { + packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float); + } + packed_weight_ = reinterpret_cast(allocator->Malloc(packed_weight_size)); + allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true); + memset(packed_weight_, 0x00, packed_weight_size); + auto origin_weight = reinterpret_cast(weight_tensor->Data()); + + if (use_winograd_) { + // weight: OHWI -> O66I -> O/8 6 6 I/4 O2 I4 O4 + std::vector encoded_weight(CO * 6 * 6 * CI); + std::vector Gt = {1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 1.0000000000, 0.0000000000, + 0.0000000000, 0.7071067691, -0.7071067691, 1.4142135382, -1.4142135382, 0.0000000000, + 0.0000000000, 0.4999999702, 0.4999999702, 1.9999998808, 1.9999998808, 1.0000000000}; + + std::vector G(Gt.size()); + for (int y = 0; y < 3; ++y) { + for (int x = 0; x < 6; ++x) { + G[x * 3 + y] = Gt[y * 6 + x]; + } + } + + for (int co = 0; co < CO; ++co) { + for (int ci = 0; ci < CI; ++ci) { + std::vector in_vals(9); + for (int kh = 0; kh < 3; ++kh) { + for (int kw = 0; kw < 3; ++kw) { + const int f_index = ((co * 3 + kh) * 3 + kw) * CI + ci; + in_vals[kh * 3 + kw] = origin_weight[f_index]; + } + } + + auto temp_vals = MatrixMultiply(G, in_vals, 6, 3, 3); + auto out_vals = MatrixMultiply(temp_vals, Gt, 6, 3, 6); + for (int kh = 0; kh < 6; ++kh) { + for (int kw = 0; kw < 6; ++kw) { + const int f_index = ((co * 6 + kh) * 6 + kw) * CI + ci; + encoded_weight[f_index] = out_vals[kh * 6 + kw]; + } + } + } + } + + for (int co = 0, src_idx = 0; co < CO; ++co) { + for (int kh = 0; kh < 6; ++kh) { + for (int kw = 0; kw < 6; ++kw) { + for (int ci = 0; ci < CI; ++ci) { + int co_outer = co / 8; + int co_inner_group = co % 8 / 4; + int co_inner = co % 8 % 4; + int ci_outer = ci / 4; + int ci_inner = ci % 4; + size_t dst_idx = + (((((co_outer * 6 + kh) * 6 + kw) * CI_SLICES + ci_outer) * 2 + co_inner_group) * CI_TILE + ci_inner) * + CO_TILE + + co_inner; + packed_weight_[dst_idx] = encoded_weight[src_idx++]; + } + } + } + } + } else { + // weight: OHWI -> O/4 H W I/4 I4 O4 + for (int co = 0; co < CO; ++co) { + for (int kh = 0; kh < KH; ++kh) { + for (int kw = 0; kw < KW; ++kw) { + for (int ci = 0; ci < CI; ++ci) { + auto co_outer = co / CO_TILE; + auto co_inner = co % CO_TILE; + auto ci_outer = ci / CI_TILE; + auto ci_inner = ci % CI_TILE; + packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + + co_inner] = *(origin_weight++); + } + } + } + } + } + allocator->UnmapBuffer(packed_weight_); + + // align bias from C to C4 + auto bias_tensor = in_tensors_[2]; + size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float); + packed_bias_ = reinterpret_cast(allocator->Malloc(packed_bias_size)); + packed_bias_ = reinterpret_cast(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); + memset(packed_bias_, 0x00, packed_bias_size); + auto bias_data = reinterpret_cast(bias_tensor->Data()); + for (int co = 0; co < CO; ++co) { + packed_bias_[co] = bias_data[co]; + } + allocator->UnmapBuffer(packed_bias_); + + return RET_OK; +} + +int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { + // size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM); + size_t im_dst_x, im_dst_y; + if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { + if (out_tensors_[0]->Width() * CO_SLICES < 65536) { + { + im_dst_x = out_tensors_[0]->Width() * CO_SLICES; + im_dst_y = out_tensors_[0]->Height(); + } + } else { + im_dst_x = out_tensors_[0]->Height() * CO_SLICES; + im_dst_y = out_tensors_[0]->Width(); + } + } else { + im_dst_y = out_tensors_[0]->Height() * CO_SLICES; + im_dst_x = out_tensors_[0]->Width(); + } +#ifdef ENABLE_FP16 + size_t img_dtype = CL_HALF_FLOAT; +#else + size_t img_dtype = CL_FLOAT; +#endif + img_size->clear(); + img_size->push_back(im_dst_x); + img_size->push_back(im_dst_y); + img_size->push_back(img_dtype); + return RET_OK; +} + +int ConvolutionOpenCLKernel::Run() { + MS_LOG(DEBUG) << this->name() << " Running!"; + auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); + auto param = reinterpret_cast(op_parameter_); + + int arg_cn = 0; + if (use_winograd_) { + arg_cn = 0; + cl_int4 _4x4to36_in_shape = {1, param->input_h_, param->input_w_, CI_SLICES}; + cl_int4 _4x4to36_out_shape = {1, 36, TILES_XY, CI_SLICES}; + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, winograd_mem0_); + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_in_shape); + ocl_runtime->SetKernelArg(kernel_4x4to36, arg_cn++, _4x4to36_out_shape); + + arg_cn = 0; + cl_int4 conv_in_shape = {1, 36, TILES_XY, CI_SLICES}; + cl_int4 conv_out_shape = {1, 36, TILES_XY, CO_SLICES}; + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem0_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, winograd_mem1_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_in_shape); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, conv_out_shape); + + arg_cn = 0; + cl_int4 _36to4x4_in_shape = {1, 16, TILES_XY, CO_SLICES}; + cl_int4 _36to4x4_out_shape = {1, param->output_h_, param->output_w_, CO_SLICES}; + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, winograd_mem1_); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, packed_bias_); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_in_shape); + ocl_runtime->SetKernelArg(kernel_36to4x4, arg_cn++, _36to4x4_out_shape); + } else { + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, in_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, out_tensors_[0]->Data()); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_weight_); + ocl_runtime->SetKernelArg(kernel_conv, arg_cn++, packed_bias_); + } + + if (use_winograd_) { + ocl_runtime->RunKernel(kernel_4x4to36, {size_t(TILES_XY), 6, size_t(CI_SLICES)}, {16, 6, 4}, nullptr); + ocl_runtime->RunKernel(kernel_conv, {size_t(TILES_XY / 2), 36, size_t(CO_SLICES / 2)}, {8, 6, 2}, nullptr); + ocl_runtime->RunKernel(kernel_36to4x4, {size_t(TILES_XY), 4, size_t(CO_SLICES)}, {32, 4, 2}, nullptr); + } else { + std::vector global, local; + SetGlobalLocalConv(&global, &local); + ocl_runtime->RunKernel(kernel_conv, global, local, nullptr); + } + + return RET_OK; +} + +std::string ConvolutionOpenCLKernel::CodeGenConvolution() { auto param = reinterpret_cast(op_parameter_); auto input_tensor = in_tensors_[0]; auto output_tensor = out_tensors_[0]; const size_t CI = input_tensor->Channel(); - const size_t CI_SLICES = UP_DIV(CI, C4NUM); - const size_t CI_ALIGN = UP_DIV(CI, C4NUM) * C4NUM; + // const size_t CI_SLICES = UP_DIV(CI, C4NUM); + const size_t CI_ALIGN = CI_SLICES * C4NUM; const size_t IH = input_tensor->Height(); const size_t IW = input_tensor->Width(); const size_t CO = output_tensor->Channel(); - const size_t CO_SLICES = UP_DIV(CO, C4NUM); - const size_t CO_ALIGN = UP_DIV(CO, C4NUM) * C4NUM; + // const size_t CO_SLICES = UP_DIV(CO, C4NUM); + const size_t CO_ALIGN = CO_SLICES * C4NUM; const size_t OH = output_tensor->Height(); const size_t OW = output_tensor->Width(); const size_t KH = param->kernel_h_; @@ -106,10 +345,10 @@ std::string ConvolutionOpenCLKernel::CodeGen() { code += "__constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;\n\n"; code += - "__kernel void convolution(__read_only image2d_t input,\n" - " __global FLT4 *weight,\n" - " __global FLT4 *bias,\n" - " __write_only image2d_t output)\n" + "__kernel void Convolution(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " __global FLT4 *weight,\n" + " __global FLT4 *bias)" "{\n"; code += @@ -173,67 +412,218 @@ std::string ConvolutionOpenCLKernel::CodeGen() { return code; } -int ConvolutionOpenCLKernel::InitBuffer() { - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - auto allocator = ocl_runtime->GetAllocator(); - - // weight: OHWI -> OHWIIO - auto weight_tensor = in_tensors_[1]; - auto weight_shape = weight_tensor->shape(); - size_t CO = weight_shape[0]; - size_t KH = weight_shape[1]; - size_t KW = weight_shape[2]; - size_t CI = weight_shape[3]; - size_t CI_SLICES = UP_DIV(CI, C4NUM); - size_t CO_SLICES = UP_DIV(CO, C4NUM); - constexpr size_t CI_TILE = C4NUM; - constexpr size_t CO_TILE = C4NUM; - size_t packed_weight_size = CO_SLICES * KH * KW * CI_SLICES * CI_TILE * CO_TILE * sizeof(float); - packed_weight_ = reinterpret_cast(allocator->Malloc(packed_weight_size)); - packed_weight_ = reinterpret_cast(allocator->MapBuffer(packed_weight_, CL_MAP_WRITE, nullptr, true)); - memset(packed_weight_, 0x00, packed_weight_size); - auto weight_data = reinterpret_cast(weight_tensor->Data()); - for (int co = 0; co < CO; ++co) { - for (int kh = 0; kh < KH; ++kh) { - for (int kw = 0; kw < KW; ++kw) { - for (int ci = 0; ci < CI; ++ci) { - auto co_outer = co / CO_TILE; - auto co_inner = co % CO_TILE; - auto ci_outer = ci / CI_TILE; - auto ci_inner = ci % CI_TILE; - packed_weight_[((((co_outer * KH + kh) * KW + kw) * CI_SLICES + ci_outer) * CI_TILE + ci_inner) * CO_TILE + - co_inner] = *(weight_data++); - } - } - } - } - allocator->UnmapBuffer(packed_weight_); - - // align bias - auto bias_tensor = in_tensors_[2]; - size_t packed_bias_size = CO_SLICES * CO_TILE * sizeof(float); - packed_bias_ = reinterpret_cast(allocator->Malloc(packed_bias_size)); - packed_bias_ = reinterpret_cast(allocator->MapBuffer(packed_bias_, CL_MAP_WRITE, nullptr, true)); - memset(packed_bias_, 0x00, packed_bias_size); - auto bias_data = reinterpret_cast(bias_tensor->Data()); - for (int co = 0; co < CO; ++co) { - packed_bias_[co] = bias_data[co]; - } - allocator->UnmapBuffer(packed_bias_); +std::string ConvolutionOpenCLKernel::CodeGenWinograd4x4To36() { + return "#define UP_DIV(x, y) (((x) + (y) - (1)) / (y))\n" + "#define PAD 1\n" + "\n" + "__constant sampler_t\n" + "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" + "\n" + "constant float Bt[36] = {\n" + " 1.0000000000f, 0.0000000000f, -2.5000004768f, -0.0000001192f, 1.0000001192f, 0.0000000000f,\n" + " 0.0000000000f, 0.9428091049f, 1.3333333731f, -0.4714044929f, -0.6666667461f, 0.0000000000f,\n" + " 0.0000000000f, -0.9428089857f, 1.3333334923f, 0.4714045525f, -0.6666667461f, 0.0000000000f,\n" + " 0.0000000000f, -0.1178511307f, -0.0833333358f, 0.2357022613f, 0.1666666865f, 0.0000000000f,\n" + " 0.0000000000f, 0.1178511307f, -0.0833333507f, -0.2357022911f, 0.1666666865f, 0.0000000000f,\n" + " 0.0000000000f, 0.9999998808f, -0.0000000596f, -2.5000000000f, 0.0000000000f, 1.0000000000f,\n" + "};\n" + "\n" + "__kernel void Winograd4x4To36(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " int4 input_shape, // N H W CI_SLICES\n" + " int4 output_shape) // N 36 H/4*W/4 CI_SLICES\n" + "{\n" + " int tile_xy = get_global_id(0);\n" + " int row = get_global_id(1);\n" + " int slice = get_global_id(2);\n" + "\n" + " int TILE_XY = output_shape.z;\n" + " int SLICES = input_shape.w;\n" + " if (tile_xy >= TILE_XY || row >= 6 || slice >= SLICES)\n" + " {\n" + " return;\n" + " }\n" + "\n" + " int IH = input_shape.y, IW = input_shape.z;\n" + " int TILE_X = IW / 4;\n" + " int tile_x = tile_xy % TILE_X;\n" + " int tile_y = tile_xy / TILE_X;\n" + "\n" + " constant float *Bt_row = Bt + row * 6;\n" + " float4 BtD_row[6] = {0};\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " int y_idx = tile_y * 4 - PAD + y;\n" + " for (int x = 0; x < 6; x++)\n" + " {\n" + " int x_idx = (tile_x * 4 - PAD + x) * SLICES + slice;\n" + " BtD_row[x] += Bt_row[y] * read_imagef(input, smp_none, (int2)(x_idx, y_idx));\n" + " }\n" + " }\n" + "\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " for (int x = 0; x < 6; x++)\n" + " {\n" + " acc += BtD_row[x] * Bt[y * 6 + x];\n" + " }\n" + "// write_imagef(output, (int2)((row * 6 + y) * SLICES + slice, tile_xy), acc); // H WC W=36\n" + " write_imagef(output, (int2)(tile_xy, slice * 36 + (row * 6 + y)), acc); // CH W H=36\n" + " }\n" + "}"; +} - return RET_OK; -} // namespace mindspore::kernel +std::string ConvolutionOpenCLKernel::CodeGenWinogradConvolution() { + return "#define CI_TILE 4\n" + "//#define CI 96\n" + "#define IH 36\n" + "//#define IW 256\n" + "//#define CO 80\n" + "#define OH 36\n" + "//#define OW 256\n" + "//#define CI_SLICES 24\n" + "//#define CO_SLICES 20\n" + "\n" + "#define FLT4 float4\n" + "#define READ_FLT4 read_imagef\n" + "#define WRITE_FLT4 write_imagef\n" + "\n" + "//#define __global\n" + "\n" + "__constant sampler_t\n" + "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" + "\n" + "__kernel void WinogradConvolution(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " __global float16 *weight,\n" + " int4 input_shape, // N 36 H/4*W/4 CI_SLICES\n" + " int4 output_shape) // N 36 H/4*W/4 CO_SLICES\n" + "{\n" + " int ow = get_global_id(0) * 2;\n" + " int oh = get_global_id(1);\n" + " int co_slice = get_global_id(2) * 2;\n" + "\n" + " int CI_SLICES = input_shape.w;\n" + " int IW = input_shape.z;\n" + " int CO_SLICES = output_shape.w;\n" + " int OW = IW;\n" + "\n" + " if (oh >= OH || ow >= OW || co_slice >= CO_SLICES)\n" + " {\n" + " return;\n" + " }\n" + "\n" + " __global float16 *w_ptr = weight + (co_slice / 2 * 36 + oh) * CI_SLICES * 2;\n" + " int y_idx = oh;\n" + " FLT4 out00 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 out01 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 out10 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " FLT4 out11 = (FLT4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " for (int ci_slice = 0; ci_slice < CI_SLICES; ci_slice++)\n" + " {\n" + " FLT4 in0 = READ_FLT4(input, smp_none, (int2)(ow + 0, y_idx));\n" + " FLT4 in1 = READ_FLT4(input, smp_none, (int2)(ow + 1, y_idx));\n" + " y_idx += 36;\n" + "\n" + " float16 w0 = w_ptr[0], w1 = w_ptr[1];\n" + " w_ptr += 2;\n" + "\n" + " out00 += in0.x * w0.s0123;\n" + " out00 += in0.y * w0.s4567;\n" + " out00 += in0.z * w0.s89ab;\n" + " out00 += in0.w * w0.scdef;\n" + "\n" + " out01 += in1.x * w0.s0123;\n" + " out01 += in1.y * w0.s4567;\n" + " out01 += in1.z * w0.s89ab;\n" + " out01 += in1.w * w0.scdef;\n" + "\n" + " out10 += in0.x * w1.s0123;\n" + " out10 += in0.y * w1.s4567;\n" + " out10 += in0.z * w1.s89ab;\n" + " out10 += in0.w * w1.scdef;\n" + "\n" + " out11 += in1.x * w1.s0123;\n" + " out11 += in1.y * w1.s4567;\n" + " out11 += in1.z * w1.s89ab;\n" + " out11 += in1.w * w1.scdef;\n" + " }\n" + " WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 0) * 36 + oh), out00);\n" + " WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 0) * 36 + oh), out01);\n" + " WRITE_FLT4(output, (int2)(ow + 0, (co_slice + 1) * 36 + oh), out10);\n" + " WRITE_FLT4(output, (int2)(ow + 1, (co_slice + 1) * 36 + oh), out11);\n" + "}"; +} -static int GetBiggestDivider(int x, int y) { - for (int i = y; i != 0; i--) { - if (x % i == 0) { - return i; - } - } - return 1; +std::string ConvolutionOpenCLKernel::CodeGenWinograd36To4x4() { + return "//#define TILE_XY 256\n" + "//#define SLICES 20\n" + "//#define OH 16\n" + "//#define OW 256\n" + "\n" + "//#define __global\n" + "__constant sampler_t\n" + "smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST;\n" + "\n" + "constant float At[24] = {\n" + " 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 1.0000000000f, 0.0000000000f,\n" + " 0.0000000000f, 0.7071067691f, -0.7071067691f, 1.4142135382f, -1.4142135382f, 0.0000000000f,\n" + " 0.0000000000f, 0.4999999702f, 0.4999999702f, 1.9999998808f, 1.9999998808f, 0.0000000000f,\n" + " 0.0000000000f, 0.3535533845f, -0.3535533845f, 2.8284270763f, -2.8284270763f, 1.0000000000f\n" + "};\n" + "\n" + "__kernel void Winograd36To4x4(__read_only image2d_t input,\n" + " __write_only image2d_t output,\n" + " __global float4 *bias,\n" + " int4 input_shape, // N 36 H/4*W/4 CO_SLICES\n" + " int4 output_shape) // N H W CO_SLICES\n" + "{\n" + " int tile_xy = get_global_id(0);\n" + " int row = get_global_id(1);\n" + " int slice = get_global_id(2);\n" + "\n" + " int TILE_XY = input_shape.z;\n" + " int SLICES = input_shape.w;\n" + " int OH = output_shape.y;\n" + " int OW = output_shape.z;\n" + "\n" + " if (tile_xy >= TILE_XY || row >= 4 || slice >= SLICES)\n" + " {\n" + " return;\n" + " }\n" + "\n" + " constant float *At_row = At + row * 6;\n" + " float4 AtM_row[6] = {0};\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " for (int x = 0; x < 6; x++)\n" + " {\n" + " AtM_row[x] += At_row[y] * read_imagef(input, smp_none, (int2)(tile_xy, slice * 36 + y * 6 + " + "x));\n" + " }\n" + " }\n" + "\n" + " for (int x = 0; x < 4; x++)\n" + " {\n" + " float4 acc = (float4)(0.0f, 0.0f, 0.0f, 0.0f);\n" + " for (int y = 0; y < 6; y++)\n" + " {\n" + " acc += AtM_row[y] * At[x * 6 + y];\n" + " }\n" + " acc += bias[slice];\n" + "\n" + " int TILE_X = OW / 4;\n" + " int tile_x = tile_xy % TILE_X * 4;\n" + " int tile_y = tile_xy / TILE_X * 4;\n" + "// write_imagef(output, (int2)(tile_x + x, slice * OH + tile_y + row), acc); // height=CH width=W\n" + " write_imagef(output, (int2)((tile_x + x) * SLICES + slice, tile_y + row), acc); // height=H " + "width=WC\n" + " }\n" + "}"; } -int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::vector *local) { +int ConvolutionOpenCLKernel::SetGlobalLocalConv(std::vector *global, std::vector *local) { auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); auto param = reinterpret_cast(op_parameter_); param->output_h_ = out_tensors_[0]->Height(); @@ -242,12 +632,12 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::ve constexpr size_t work_group_size[] = {4, 4, 1}; auto max_work_item_sizes = ocl_runtime->GetWorkItemSize(); - size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_(), (*ocl_runtime->Device())()); + size_t max_work_group_size = ocl_runtime->GetKernelMaxWorkGroupSize(kernel_conv(), (*ocl_runtime->Device())()); const size_t max_z_size = std::min(16, max_work_item_sizes[2]); size_t global_h = UP_DIV(param->output_h_, work_group_size[0]) * work_group_size[0]; size_t global_w = UP_DIV(param->output_w_, work_group_size[1]) * work_group_size[1]; - size_t global_c = UP_DIV(UP_DIV(param->output_channel_, C4NUM), work_group_size[2]) * work_group_size[2]; + size_t global_c = UP_DIV(CO_SLICES, work_group_size[2]) * work_group_size[2]; size_t local_c = GetBiggestDivider(global_c, max_z_size); if (local_c == 0) { @@ -262,8 +652,6 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::ve } auto output_tensor = out_tensors_[0]; - const size_t CO = output_tensor->Channel(); - const size_t CO_SLICES = UP_DIV(CO, C4NUM); const size_t OW = output_tensor->Width(); if (OW * CO_SLICES > 65536) { local_w = 4; @@ -272,7 +660,7 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::ve global->clear(); global->push_back(UP_DIV(param->output_h_, local_h) * local_h); global->push_back(UP_DIV(param->output_w_, local_w) * local_w); - global->push_back(UP_DIV(UP_DIV(param->output_channel_, C4NUM), local_c) * local_c); + global->push_back(UP_DIV(CO_SLICES, local_c) * local_c); local->clear(); local->push_back(local_h); local->push_back(local_w); @@ -280,52 +668,6 @@ int ConvolutionOpenCLKernel::GetGlobalLocal(std::vector *global, std::ve return RET_OK; } -int ConvolutionOpenCLKernel::GetImageSize(size_t idx, std::vector *img_size) { - size_t CO_SLICES = UP_DIV(out_tensors_[0]->Channel(), C4NUM); - size_t im_dst_x, im_dst_y; - if (in_tensors_[0]->GetFormat() == schema::Format_NHWC4) { - if (out_tensors_[0]->Width() * CO_SLICES < 65536) { - { - im_dst_x = out_tensors_[0]->Width() * CO_SLICES; - im_dst_y = out_tensors_[0]->Height(); - } - } else { - im_dst_x = out_tensors_[0]->Height() * CO_SLICES; - im_dst_y = out_tensors_[0]->Width(); - } - } else { - im_dst_y = out_tensors_[0]->Height() * CO_SLICES; - im_dst_x = out_tensors_[0]->Width(); - } -#ifdef ENABLE_FP16 - size_t img_dtype = CL_HALF_FLOAT; -#else - size_t img_dtype = CL_FLOAT; -#endif - img_size->clear(); - img_size->push_back(im_dst_x); - img_size->push_back(im_dst_y); - img_size->push_back(img_dtype); - return RET_OK; -} - -int ConvolutionOpenCLKernel::Run() { - MS_LOG(DEBUG) << this->name() << " Running!"; - auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); - - int arg_cn = 0; - ocl_runtime->SetKernelArg(kernel_, arg_cn++, in_tensors_[0]->Data()); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_weight_); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, packed_bias_); - ocl_runtime->SetKernelArg(kernel_, arg_cn++, out_tensors_[0]->Data()); - - std::vector global; - std::vector local; - GetGlobalLocal(&global, &local); - ocl_runtime->RunKernel(kernel_, global, local, nullptr); - return RET_OK; -} - kernel::LiteKernel *OpenCLConvolutionKernelCreator(const std::vector &inputs, const std::vector &outputs, OpParameter *opParameter, const lite::Context *ctx, diff --git a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h index d9da43df49..cfd426b944 100644 --- a/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h +++ b/mindspore/lite/src/runtime/kernel/opencl/kernel/convolution.h @@ -40,12 +40,59 @@ class ConvolutionOpenCLKernel : public OpenCLKernel { int GetImageSize(size_t idx, std::vector *img_size) override; private: + int CI_SLICES; + int CO_SLICES; float *packed_weight_ = nullptr; float *packed_bias_ = nullptr; - cl::Kernel kernel_; - std::string CodeGen(); - int GetGlobalLocal(std::vector *global, std::vector *local); + bool use_winograd_ = false; + int TILES_X; + int TILES_Y; + int TILES_XY; + void *winograd_mem0_ = nullptr; + void *winograd_mem1_ = nullptr; + + cl::Kernel kernel_4x4to36; + cl::Kernel kernel_conv; + cl::Kernel kernel_36to4x4; + + std::string CodeGenConvolution(); + std::string CodeGenWinograd4x4To36(); + std::string CodeGenWinogradConvolution(); + std::string CodeGenWinograd36To4x4(); + int SetGlobalLocalConv(std::vector *global, std::vector *local); + + bool UseWinograd4x4To6x6() { + auto param = reinterpret_cast(op_parameter_); + const bool attr_valid = param->kernel_h_ == 3 && param->kernel_w_ == 3 && param->dilation_h_ == 1 && + param->dilation_w_ == 1 && param->stride_h_ == 1 && param->stride_w_ == 1; + const bool channel_good = CO_SLICES % 4 == 0 && CI_SLICES >= 16 && CO_SLICES >= 16; + const bool hw_good = TILES_X * TILES_Y >= 32; + return attr_valid && channel_good && hw_good; + } + + std::vector MatrixMultiply(const std::vector &A, const std::vector &B, int M, int N, int K) { + std::vector C(M * K); + for (int i = 0; i < M; ++i) { + for (int j = 0; j < K; ++j) { + float s = 0.0f; + for (int k = 0; k < N; ++k) { + s += A[i * N + k] * B[k * K + j]; + } + C[i * K + j] = s; + } + } + return C; + } + + static int GetBiggestDivider(int x, int y) { + for (int i = y; i != 0; i--) { + if (x % i == 0) { + return i; + } + } + return 1; + } }; } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc index cf579ee5ab..0f383ffeeb 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_allocator.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_allocator.cc @@ -113,6 +113,7 @@ void *OpenCLAllocator::Malloc(size_t size, const std::vector &img_size) UnLock(); return nullptr; } + MS_LOG(DEBUG) << "Malloc a new Image2D, width=" << img_size[0] << ", height=" << img_size[1]; image_ptr = static_cast(image); } } diff --git a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc index 5292973f68..93e7aa2e19 100644 --- a/mindspore/lite/src/runtime/opencl/opencl_runtime.cc +++ b/mindspore/lite/src/runtime/opencl/opencl_runtime.cc @@ -71,6 +71,10 @@ void OpenCLRuntime::DeleteInstance() { OpenCLRuntime::OpenCLRuntime() { default_build_opts_ = " -cl-mad-enable -cl-fast-relaxed-math -Werror"; } +void printf_callback(const char *buffer, size_t length, size_t final, void *user_data) { + fwrite(buffer, 1, length, stdout); +} + // Init will get platforms info, get devices info, create opencl context. int OpenCLRuntime::Init() { std::unique_lock lck(g_init_mtx); @@ -147,6 +151,9 @@ int OpenCLRuntime::Init() { } #else MS_LOG(INFO) << "Create common opencl context"; + // cl_context_properties context_prop[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0](), + // CL_PRINTF_CALLBACK_ARM, (cl_context_properties)printf_callback, 0}; + // context_ = std::make_shared(std::vector{*device_}, context_prop, nullptr, nullptr, &err); context_ = std::make_shared(std::vector{*device_}, nullptr, nullptr, nullptr, &err); #endif if (err != CL_SUCCESS) { diff --git a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc index ef8f347025..966bd32af2 100644 --- a/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc +++ b/mindspore/lite/test/ut/src/runtime/kernel/opencl/convolution_tests.cc @@ -63,9 +63,26 @@ void MyCompareOutput(lite::tensor::Tensor *output_tensor, const std::string &fil printf("compare success!\n\n\n"); } -void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::string &input_file, - const std::string &weight_file, const std::string &bias_file, const std::string &expect_file) { +void TEST_MAIN(schema::Format input_format, schema::Format output_format, const std::string &data_path, + std::string attr_str) { assert(data_format == schema::Format_NHWC || data_format == schema::Format_NHWC4); + auto param = new ConvParameter; + sscanf(attr_str.c_str(), + "inputNHWC_%dx%dx%dx%d_outputNHWC_%dx%dx%dx%d_kernelHW_%dx%d_strideHW_%dx%d_padTopBottomLeftRight_%dx%dx%dx%d_" + "dilationHW_%dx%d", + ¶m->input_batch_, ¶m->input_h_, ¶m->input_w_, ¶m->input_channel_, ¶m->output_batch_, + ¶m->output_h_, ¶m->output_w_, ¶m->output_channel_, ¶m->kernel_h_, ¶m->kernel_w_, + ¶m->stride_h_, ¶m->stride_w_, ¶m->pad_u_, ¶m->pad_d_, ¶m->pad_l_, ¶m->pad_r_, + ¶m->dilation_h_, ¶m->dilation_w_); + auto testcase_path = data_path + "/" + attr_str + "/"; + auto input_file = testcase_path + (input_format == schema::Format_NHWC4 ? "input_NHWC4.bin" : "input_NHWC.bin"); + auto weight_file = testcase_path + "weight_OHWI.bin"; + auto bias_file = testcase_path + "bias_C4.bin"; + auto expect_file = testcase_path + (output_format == schema::Format_NHWC4 ? "expect_NHWC4.bin" : "expect_NHWC.bin"); + std::cout << input_file << std::endl; + std::cout << weight_file << std::endl; + std::cout << bias_file << std::endl; + std::cout << expect_file << std::endl; std::cout << "initialize OpenCLRuntime"; auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); @@ -79,10 +96,10 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri std::vector output_shape = {param->output_batch_, param->output_h_, param->output_w_, param->output_channel_}; auto data_type = kNumberTypeFloat32; auto tensorType = schema::NodeType_ValueNode; - auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, data_format, tensorType); + auto input_tensor = new lite::tensor::Tensor(data_type, input_shape, input_format, tensorType); auto weight_tensor = new lite::tensor::Tensor(data_type, weight_shape, schema::Format_KHWC, tensorType); auto bias_tensor = new lite::tensor::Tensor(data_type, bias_shape, schema::Format_KHWC, tensorType); - auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, data_format, tensorType); + auto output_tensor = new lite::tensor::Tensor(data_type, output_shape, output_format, tensorType); std::vector inputs{input_tensor, weight_tensor, bias_tensor}; std::vector outputs{output_tensor}; @@ -114,7 +131,6 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri std::cout << "sub_graph->Run()"; sub_graph->Run(); - printf("output_tensor->Size() =%zu\n", output_tensor->Size()); std::cout << "compare result"; MyCompareOutput(output_tensor, expect_file); @@ -131,57 +147,35 @@ void TEST_MAIN(ConvParameter *param, schema::Format data_format, const std::stri mindspore::lite::opencl::OpenCLRuntime::DeleteInstance(); } -std::array GenFilenames(ConvParameter *param, schema::Format data_format, const std::string &path) { - auto full_path = path + "inputNHWC_" + std::to_string(param->input_batch_) + "x" + std::to_string(param->input_h_) + - "x" + std::to_string(param->input_w_) + "x" + std::to_string(param->input_channel_) + - "_outputNHWC_" + std::to_string(param->output_batch_) + "x" + std::to_string(param->output_h_) + - "x" + std::to_string(param->output_w_) + "x" + std::to_string(param->output_channel_) + - "_kernelHW_" + std::to_string(param->kernel_h_) + "x" + std::to_string(param->kernel_w_) + - "_strideHW_" + std::to_string(param->stride_h_) + "x" + std::to_string(param->stride_w_) + - "_padTopBottomLeftRight_" + std::to_string(param->pad_u_) + "x" + std::to_string(param->pad_d_) + - "x" + std::to_string(param->pad_l_) + "x" + std::to_string(param->pad_r_) + "_dilationHW_1x1/"; - - if (data_format == schema::Format_NHWC4) { - return std::array{full_path + "input_NHWC4.bin", full_path + "weight_OHWI.bin", - full_path + "bias_C4.bin", full_path + "expect_NHWC4.bin"}; - } else { - return std::array{full_path + "input_NHWC.bin", full_path + "weight_OHWI.bin", - full_path + "bias_C.bin", full_path + "expect_NHWC.bin"}; - } +TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { + TEST_MAIN( + schema::Format_NHWC, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/", + "inputNHWC_1x224x224x3_outputNHWC_1x112x112x32_kernelHW_3x3_strideHW_2x2_padTopBottomLeftRight_0x1x0x1_dilationHW_" + "1x1"); } -TEST_F(TestConvolutionOpenCL, in1x224x224x3_out1x112x112x32_k33_s22_p0101) { - auto param = new ConvParameter; - param->input_batch_ = 1, param->input_h_ = 224, param->input_w_ = 224, param->input_channel_ = 3; - param->output_batch_ = 1, param->output_h_ = 112, param->output_w_ = 112, param->output_channel_ = 32; - param->kernel_h_ = 3, param->kernel_w_ = 3; - param->stride_h_ = 2, param->stride_w_ = 2; - param->pad_u_ = 0, param->pad_d_ = 1, param->pad_l_ = 0, param->pad_r_ = 1; - - auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/mobilenetv2_fp32/"); - // std::cout << filenames[0] << std::endl; - // std::cout << filenames[1] << std::endl; - // std::cout << filenames[2] << std::endl; - // std::cout << filenames[3] << std::endl; - TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]); - lite::opencl::OpenCLRuntime::DeleteInstance(); +// TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { +// TEST_MAIN( +// schema::Format_NHWC, schema::Format_NHWC4, "testcases/02_fp32/", +// "inputNHWC_1x1x64x512_outputNHWC_1x1x64x7358_kernelHW_1x1_strideHW_1x1_padTopBottomLeftRight_0x0x0x0_dilationHW_" +// "1x1"); +//} + +TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x96_outputNHWC_1x16x256x80) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", + "inputNHWC_1x16x256x96_outputNHWC_1x16x256x80_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" + "dilationHW_1x1"); +} +TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x16x256x100_outputNHWC_1x16x256x96) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", + "inputNHWC_1x16x256x100_outputNHWC_1x16x256x96_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_1x1x1x1_" + "dilationHW_1x1"); } -TEST_F(TestConvolutionOpenCL, in1x1x64x512_out1x1x64x7358_k11_s11_p0000) { - auto param = new ConvParameter; - param->input_batch_ = 1, param->input_h_ = 1, param->input_w_ = 64, param->input_channel_ = 512; - param->output_batch_ = 1, param->output_h_ = 1, param->output_w_ = 64, param->output_channel_ = 7358; - param->kernel_h_ = 1, param->kernel_w_ = 1; - param->stride_h_ = 1, param->stride_w_ = 1; - param->pad_u_ = 0, param->pad_d_ = 0, param->pad_l_ = 0, param->pad_r_ = 0; - - auto filenames = GenFilenames(param, schema::Format_NHWC4, "testcases/02_fp32/"); - // std::cout << filenames[0] << std::endl; - // std::cout << filenames[1] << std::endl; - // std::cout << filenames[2] << std::endl; - // std::cout << filenames[3] << std::endl; - TEST_MAIN(param, schema::Format_NHWC4, filenames[0], filenames[1], filenames[2], filenames[3]); - lite::opencl::OpenCLRuntime::DeleteInstance(); +TEST_F(TestConvolutionOpenCL, winograd_inputNHWC_1x480x480x128_outputNHWC_1x480x480x128) { + TEST_MAIN(schema::Format_NHWC, schema::Format_NHWC4, "testcases/test_fp32/", + "inputNHWC_1x480x480x128_outputNHWC_1x480x480x128_kernelHW_3x3_strideHW_1x1_padTopBottomLeftRight_" + "1x1x1x1_dilationHW_1x1"); } } // namespace mindspore