| @@ -1,5 +1,5 @@ | |||||
| __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, const int4 input_shape, | ||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -32,8 +32,7 @@ __kernel void AvgPooling2d_BUF(__global float4 *input, __global float4 *output, | |||||
| __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | __constant sampler_t smp_zero = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; | ||||
| __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | ||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, | |||||
| const int2 padding) { | |||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | |||||
| // axis to dst tensor coordinate | // axis to dst tensor coordinate | ||||
| int X = get_global_id(0); | int X = get_global_id(0); | ||||
| int Y = get_global_id(1); | int Y = get_global_id(1); | ||||
| @@ -49,17 +48,16 @@ __kernel void AvgPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||||
| int xs = X * stride.x + padding.x; | int xs = X * stride.x + padding.x; | ||||
| int ys = Y * stride.y + padding.y; | int ys = Y * stride.y + padding.y; | ||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| bool outside_x = x_c < 0 || x_c >= input_shape.x; | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| bool outside = outside_x || y_c < 0 || y_c >= input_shape.y; | |||||
| r += read_imagef(input, smp_zero, (int2)(x_c, y_c * input_shape.w + Z)); | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| bool outside_y = y_c < 0 || y_c >= input_shape.y; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| bool outside = outside_y || x_c < 0 || x_c >= input_shape.x; | |||||
| r += read_imagef(input, smp_zero, (int2)(y_c * input_shape.w + Z, x_c)); | |||||
| window_size += !outside ? 1.0f : 0.0f; | window_size += !outside ? 1.0f : 0.0f; | ||||
| } | } | ||||
| } | } | ||||
| float4 result = convert_float4(r / window_size); | float4 result = convert_float4(r / window_size); | ||||
| write_imagef(output, (int2)(X, Y * output_shape.w + Z), result); | |||||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), result); | |||||
| } | } | ||||
| @@ -31,7 +31,7 @@ __kernel void MaxPooling2d_BUF(__global float4 *input, __global float4 *output, | |||||
| output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; | output[(output_shape.y * X + Y) * output_shape.w + Z] = maximum; | ||||
| } | } | ||||
| __constant sampler_t sample_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||||
| __constant sampler_t smp_none = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_NONE | CLK_FILTER_NEAREST; | |||||
| __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d_t output, const int4 input_shape, | ||||
| const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | const int4 output_shape, const int2 stride, const int2 kernel_size, const int2 padding) { | ||||
| @@ -48,20 +48,15 @@ __kernel void MaxPooling2d_IMG(__read_only image2d_t input, __write_only image2d | |||||
| float4 maximum = (float4)(-10000.0f); | float4 maximum = (float4)(-10000.0f); | ||||
| int xs = X * stride.x + padding.x; | int xs = X * stride.x + padding.x; | ||||
| int ys = Y * stride.y + padding.y; | int ys = Y * stride.y + padding.y; | ||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| if (x_c < 0 || x_c >= input_shape.x) { | |||||
| continue; | |||||
| } | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| if (y_c < 0 || y_c >= input_shape.y) { | |||||
| continue; | |||||
| } | |||||
| float4 src = read_imagef(input, sample_none, (int2)(x_c, y_c * input_shape.w + Z)); | |||||
| for (int ky = 0; ky < kernel_size.y; ++ky) { | |||||
| int y_c = ys + ky; | |||||
| if (y_c < 0 || y_c >= input_shape.y) continue; | |||||
| for (int kx = 0; kx < kernel_size.x; ++kx) { | |||||
| int x_c = xs + kx; | |||||
| if (x_c < 0 || x_c >= input_shape.x) continue; | |||||
| float4 src = read_imagef(input, smp_none, (int2)(y_c * input_shape.w + Z, x_c)); | |||||
| maximum = max(src, maximum); | maximum = max(src, maximum); | ||||
| } | } | ||||
| } | } | ||||
| write_imagef(output, (int2)(X, Y * output_shape.w + Z), maximum); | |||||
| write_imagef(output, (int2)(Y * output_shape.w + Z, X), maximum); | |||||
| } | } | ||||
| @@ -82,7 +82,7 @@ int PoolingOpenCLKernel::Init() { | |||||
| std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | ||||
| const size_t global_x = outputs_[0]->Height(); | const size_t global_x = outputs_[0]->Height(); | ||||
| const size_t global_y = outputs_[0]->Width(); | const size_t global_y = outputs_[0]->Width(); | ||||
| const size_t global_z = UP_ROUND_DIV(outputs_[0]->Channel(), 4); | |||||
| const size_t global_z = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||||
| std::vector<size_t> global = {global_x, global_y, global_z}; | std::vector<size_t> global = {global_x, global_y, global_z}; | ||||
| return global; | return global; | ||||
| } | } | ||||
| @@ -90,13 +90,8 @@ std::vector<size_t> PoolingOpenCLKernel::InitGlobalSize() const { | |||||
| int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | int PoolingOpenCLKernel::GetImageSize(size_t idx, std::vector<size_t> *img_size) { | ||||
| size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); | size_t CO4 = UP_DIV(outputs_[0]->Channel(), C4NUM); | ||||
| size_t im_dst_x, im_dst_y; | size_t im_dst_x, im_dst_y; | ||||
| if (inputs_[0]->GetFormat() == schema::Format_NHWC4) { | |||||
| im_dst_x = outputs_[0]->Height(); | |||||
| im_dst_y = outputs_[0]->Width() * CO4; | |||||
| } else { | |||||
| im_dst_y = outputs_[0]->Width(); | |||||
| im_dst_x = outputs_[0]->Height() * CO4; | |||||
| } | |||||
| im_dst_x = outputs_[0]->Width() * CO4; | |||||
| im_dst_y = outputs_[0]->Height(); | |||||
| #ifdef ENABLE_FP16 | #ifdef ENABLE_FP16 | ||||
| size_t img_dtype = CL_HALF_FLOAT; | size_t img_dtype = CL_HALF_FLOAT; | ||||
| #else | #else | ||||
| @@ -117,7 +112,7 @@ int PoolingOpenCLKernel::Run() { | |||||
| auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | auto ocl_runtime = lite::opencl::OpenCLRuntime::GetInstance(); | ||||
| // attribute | // attribute | ||||
| int slices = UP_ROUND_DIV(outputs_[0]->Channel(), 4); | |||||
| int slices = UP_DIV(outputs_[0]->Channel(), C4NUM); | |||||
| cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices}; | cl_int4 input_shape = {inputs_[0]->Height(), inputs_[0]->Width(), inputs_[0]->Channel(), slices}; | ||||
| cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices}; | cl_int4 output_shape = {outputs_[0]->Height(), outputs_[0]->Width(), outputs_[0]->Channel(), slices}; | ||||
| cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; | cl_int2 stride = {parameter_->stride_h_, parameter_->stride_w_}; | ||||