| @@ -1,5 +1,5 @@ | |||
| __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| @@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | |||
| __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||
| const int2 padding) { | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| // axis to dst tensor coordinate | |||
| int X = get_global_id(0); | |||
| int Y = get_global_id(1); | |||
| @@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside_x = x_c < 0 || x_c >= input_shape.x; | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; | |||
| r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z)); | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||
| r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| window_size += !outside ? 1.0f : 0.0f; | |||
| } | |||
| } | |||
| float4 result = convert_float4(r / window_size); | |||
| write_imagef(output, (int2)(X, Y * output_shape.w + Z), result); | |||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), result); | |||
| } | |||
| @@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, | |||
| output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; | |||
| } | |||
| __constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||
| __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | |||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||
| @@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||
| float4 maximum = (float4)(-10000.0f); | |||
| int xs = X * stride.x + padding.x; | |||
| int ys = Y * stride.y + padding.y; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) { | |||
| continue; | |||
| } | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) { | |||
| continue; | |||
| } | |||
| float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z)); | |||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||
| int y_c = ys + ky; | |||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||
| int x_c = xs + kx; | |||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||
| float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||
| maximum = max(src, maximum); | |||
| } | |||
| } | |||
| write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum); | |||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||
| } | |||
| @@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() { | |||
| std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | |||
| const size_t global_x = outputs_[0]->Height(); | |||
| const size_t global_y = outputs_[0]->Width(); | |||
| const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4); | |||
| const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| std::vector<size_t> global = {global_x, global_y, global_z}; | |||
| return global; | |||
| } | |||
| @@ -90,13 +90,8 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | |||
| int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | |||
| size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| size_t im_dst_x, im_dst_y; | |||
| if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { | |||
| im_dst_x = outputs_[0]->Height(); | |||
| im_dst_y = outputs_[0]->Width() * CO4; | |||
| } else { | |||
| im_dst_y = outputs_[0]->Width(); | |||
| im_dst_x = outputs_[0]->Height() * CO4; | |||
| } | |||
| im_dst_x = outputs_[0]->Width() * CO4; | |||
| im_dst_y = outputs_[0]->Height(); | |||
| #ifdef ENABLE_FP16 | |||
| size_t img_dtype = CL_HALF_FLOAT; | |||
| #else | |||
| @@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() { | |||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | |||
| // attribute | |||
| int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4); | |||
| int slices = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||
| cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices}; | |||
| cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices}; | |||
| cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; | |||