From 4c381bee47cb99806ffb2c19b032a65351344e00 Mon Sep 17 00:00:00 2001 From: nihui Date: Tue, 21 Jan 2020 14:22:54 +0800 Subject: [PATCH] reduce reshape pack4 bandwidth, add some pack8 shaders --- .../vulkan/shader/interp_bicubic_pack8.comp | 115 +++++++++++ src/layer/vulkan/shader/interp_pack8.comp | 120 ++++++++++++ .../shader/lrn_norm_across_channel_pack8.comp | 98 ++++++++++ .../shader/lrn_norm_within_channel_pack8.comp | 91 +++++++++ .../lrn_square_pad_across_channel_pack8.comp | 85 ++++++++ .../lrn_square_pad_within_channel_pack8.comp | 82 ++++++++ src/layer/vulkan/shader/reshape_pack1to8.comp | 92 +++++++++ src/layer/vulkan/shader/reshape_pack4.comp | 70 +++++-- src/layer/vulkan/shader/reshape_pack4to1.comp | 16 +- src/layer/vulkan/shader/reshape_pack4to8.comp | 184 ++++++++++++++++++ src/layer/vulkan/shader/reshape_pack8.comp | 184 ++++++++++++++++++ src/layer/vulkan/shader/reshape_pack8to1.comp | 105 ++++++++++ src/layer/vulkan/shader/reshape_pack8to4.comp | 144 ++++++++++++++ 13 files changed, 1360 insertions(+), 26 deletions(-) create mode 100644 src/layer/vulkan/shader/interp_bicubic_pack8.comp create mode 100644 src/layer/vulkan/shader/interp_pack8.comp create mode 100644 src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp create mode 100644 src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp create mode 100644 src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp create mode 100644 src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp create mode 100644 src/layer/vulkan/shader/reshape_pack1to8.comp create mode 100644 src/layer/vulkan/shader/reshape_pack4to8.comp create mode 100644 src/layer/vulkan/shader/reshape_pack8.comp create mode 100644 src/layer/vulkan/shader/reshape_pack8to1.comp create mode 100644 src/layer/vulkan/shader/reshape_pack8to4.comp diff --git a/src/layer/vulkan/shader/interp_bicubic_pack8.comp b/src/layer/vulkan/shader/interp_bicubic_pack8.comp new file mode 100644 index 000000000..6d7794f3c --- 
/dev/null
+++ b/src/layer/vulkan/shader/interp_bicubic_pack8.comp
@@ -0,0 +1,115 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; // one 4-weight vector per output column (indexed by gx)
+layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; // source column per output column (indexed by gx)
+layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; // one 4-weight vector per output row (indexed by gy)
+layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; // source row per output row (indexed by gy)
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w; // used below as the source row stride
+    int h;
+    int c;
+    int cstep; // used below as the source stride between gz slices
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation writes one 8-lane element of top_blob at (gx, gy, gz) from a 4x4 source neighbourhood
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the output extents
+        return;
+
+    int sx = xofs_blob_data[gx];
+    int sy = yofs_blob_data[gy];
+
+    int v_offset_0 = gz * p.cstep + (sy - 1) * p.w + sx; // four consecutive source rows: sy - 1 .. sy + 2
+    int v_offset_1 = gz * p.cstep + (sy + 0) * p.w + sx;
+    int v_offset_2 = gz * p.cstep + (sy + 1) * p.w + sx;
+    int v_offset_3 = gz * p.cstep + (sy + 2) * p.w + sx;
+
+    afpvec4 alpha = buffer_ld4(alpha_blob_data, gx); // horizontal weights; .r/.g/.b/.a pair with columns sx - 1 .. sx + 2
+
+    afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0 - 1); // NOTE(review): no clamping here - presumably xofs/yofs keep sx - 1 .. sx + 2 and sy - 1 .. sy + 2 in range; confirm on the host side
+    afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 0);
+    afpvec8 a2 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
+    afpvec8 a3 = buffer_ld8(bottom_blob_data, v_offset_0 + 2);
+
+    afpvec8 a; // row sy - 1 blended horizontally, one statement per 4-lane half
+    a[0] = a0[0] * alpha.r + a1[0] * alpha.g + a2[0] * alpha.b + a3[0] * alpha.a;
+    a[1] = a0[1] * alpha.r + a1[1] * alpha.g + a2[1] * alpha.b + a3[1] * alpha.a;
+
+    afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1 - 1);
+    afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 0);
+    afpvec8 b2 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);
+    afpvec8 b3 = buffer_ld8(bottom_blob_data, v_offset_1 + 2);
+
+    afpvec8 b; // row sy
+    b[0] = b0[0] * alpha.r + b1[0] * alpha.g + b2[0] * alpha.b + b3[0] * alpha.a;
+    b[1] = b0[1] * alpha.r + b1[1] * alpha.g + b2[1] * alpha.b + b3[1] * alpha.a;
+
+    afpvec8 c0 = buffer_ld8(bottom_blob_data, v_offset_2 - 1);
+    afpvec8 c1 = buffer_ld8(bottom_blob_data, v_offset_2 + 0);
+    afpvec8 c2 = buffer_ld8(bottom_blob_data, v_offset_2 + 1);
+    afpvec8 c3 = buffer_ld8(bottom_blob_data, v_offset_2 + 2);
+
+    afpvec8 c; // row sy + 1
+    c[0] = c0[0] * alpha.r + c1[0] * alpha.g + c2[0] * alpha.b + c3[0] * alpha.a;
+    c[1] = c0[1] * alpha.r + c1[1] * alpha.g + c2[1] * alpha.b + c3[1] * alpha.a;
+
+    afpvec8 d0 = buffer_ld8(bottom_blob_data, v_offset_3 - 1);
+    afpvec8 d1 = buffer_ld8(bottom_blob_data, v_offset_3 + 0);
+    afpvec8 d2 = buffer_ld8(bottom_blob_data, v_offset_3 + 1);
+    afpvec8 d3 = buffer_ld8(bottom_blob_data, v_offset_3 + 2);
+
+    afpvec8 d; // row sy + 2
+    d[0] = d0[0] * alpha.r + d1[0] * alpha.g + d2[0] * alpha.b + d3[0] * alpha.a;
+    d[1] = d0[1] * alpha.r + d1[1] * alpha.g + d2[1] * alpha.b + d3[1] * alpha.a;
+
+    afpvec4 beta = buffer_ld4(beta_blob_data, gy); // vertical weights; .r/.g/.b/.a pair with rows a, b, c, d
+
+    afpvec8 v; // the four row results blended vertically
+    v[0] = a[0] * beta.r + b[0] * beta.g + c[0] * beta.b + d[0] * beta.a;
+    v[1] = a[1] * beta.r + b[1] * beta.g + c[1] * beta.b + d[1] * beta.a;
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx; // linear index of the output element
+
+    buffer_st8(top_blob_data, gi, v);
+}
diff --git a/src/layer/vulkan/shader/interp_pack8.comp b/src/layer/vulkan/shader/interp_pack8.comp
new file mode 100644
index 000000000..8ed2f1ddd
--- /dev/null
+++ b/src/layer/vulkan/shader/interp_pack8.comp
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int resize_type = 0; // 1 = nearest, 2 = bilinear (see main)
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w; // used below as the source row stride
+    int h;
+    int c;
+    int cstep; // used below as the source stride between gz slices
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    float scale_x; // multiplies output x to obtain the source x coordinate
+    float scale_y; // multiplies output y to obtain the source y coordinate
+} p;
+
+void main() // each invocation resizes one 8-lane output element; any other resize_type writes nothing
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the output extents
+        return;
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx; // linear index of the output element
+
+    if (resize_type == 1) // nearest
+    {
+        afpvec2 gxy = afpvec2(gx, gy);
+        ivec2 sxy_max = ivec2(p.w - 1, p.h - 1);
+        ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max); // clamp to the last source column/row
+
+        int sx = sxy.r;
+        int sy = sxy.g;
+
+        int v_offset = gz * p.cstep + sy * p.w + sx;
+
+        buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset);
+    }
+    else if (resize_type == 2) // bilinear
+    {
+        afpvec2 gxy = afpvec2(gx, gy);
+        afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f); // source coordinate with half-pixel offset
+
+        ivec2 sxy = ivec2(floor(fxy)); // top-left sample of the 2x2 footprint
+
+        fxy -= afpvec2(sxy); // fractional part = blend weights
+
+        ivec2 sxy_max = ivec2(p.w - 2, p.h - 2); // keeps sx + 1 / sy + 1 inside the source
+
+        bvec2 underflow = lessThan(sxy, ivec2(0));
+        bvec2 overflow = greaterThan(sxy, sxy_max);
+
+        sxy = clamp(sxy, ivec2(0), sxy_max);
+
+        fxy = mix(fxy, afpvec2(0.f), underflow); // past the low edge: take the first sample only
+        fxy = mix(fxy, afpvec2(1.f), overflow); // past the high edge: take the last sample only
+
+        int sx = sxy.r;
+        int sy = sxy.g;
+
+        int v_offset_0 = gz * p.cstep + sy * p.w + sx;
+        int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;
+
+        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0);
+        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
+        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1);
+        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);
+
+        afp fx = fxy.r;
+        afp fy = fxy.g;
+
+        afpvec8 a = afpvec8(a0[0] * (afp(1.f) - fx) + a1[0] * fx, // row sy: blend BOTH 4-lane halves so that
+                            a0[1] * (afp(1.f) - fx) + a1[1] * fx); // res[1] below never reads an unset a[1]
+        afpvec8 b = afpvec8(b0[0] * (afp(1.f) - fx) + b1[0] * fx, // row sy + 1: likewise, res[0] below needs b[0]
+                            b0[1] * (afp(1.f) - fx) + b1[1] * fx);
+
+        afpvec8 res; // vertical blend of the two horizontally blended rows
+        res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy;
+        res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy;
+
+        buffer_st8(top_blob_data, gi, res);
+    }
+}
diff --git a/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp b/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
new file mode 100644
index 000000000..8e6fe072f
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
@@ -0,0 +1,98 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int local_size = 0; // number of consecutive workspace channels summed per lane
+layout (constant_id = 2) const float alpha = 0;
+layout (constant_id = 3) const float beta = 0;
+layout (constant_id = 4) const float bias_constant = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer square_workspace { float square_workspace_data[]; }; // scalar (unpacked) workspace
+layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; // scaled in place
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w; // used below as the workspace row stride
+    int h;
+    int c;
+    int cstep; // used below as the workspace channel stride
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation rescales one 8-lane element of bottom_top_blob in place
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the blob extents
+        return;
+
+    // support region_type == 0 only
+
+    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
+
+    ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3); // first workspace channel of the window for lanes 0-3
+    ivec4 zz4 = z4 + 4; // same for lanes 4-7
+    ivec4 v_offset = z4 * p.cstep + gy * p.w + gx;
+    ivec4 vv_offset = zz4 * p.cstep + gy * p.w + gx;
+
+    for (int z = 0; z < local_size; z++)
+    {
+        sum[0].r += afp(square_workspace_data[v_offset.r]);
+        sum[0].g += afp(square_workspace_data[v_offset.g]);
+        sum[0].b += afp(square_workspace_data[v_offset.b]);
+        sum[0].a += afp(square_workspace_data[v_offset.a]);
+        sum[1].r += afp(square_workspace_data[vv_offset.r]);
+        sum[1].g += afp(square_workspace_data[vv_offset.g]);
+        sum[1].b += afp(square_workspace_data[vv_offset.b]);
+        sum[1].a += afp(square_workspace_data[vv_offset.a]);
+        v_offset += p.cstep; // step lanes 0-3 to the next workspace channel
+        vv_offset += p.cstep; // lanes 4-7 must step as well, otherwise they add the same sample local_size times
+    }
+
+    const afp alpha_div_size = afp(alpha / local_size);
+    afpvec8 scale; // (bias_constant + alpha / local_size * sum) ^ -beta, per lane
+    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
+    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));
+
+    int gi = gz * p.outcstep + gy * p.outw + gx; // linear index of the element being scaled
+
+    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);
+
+    v[0] *= scale[0];
+    v[1] *= scale[1];
+
+    buffer_st8(bottom_top_blob_data, gi, v);
+}
diff --git a/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp b/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
new file mode 100644
index 000000000..9f3621444
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
@@ -0,0 +1,91 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int local_size = 0; // side length of the square window summed below
+layout (constant_id = 2) const float alpha = 0;
+layout (constant_id = 3) const float beta = 0;
+layout (constant_id = 4) const float bias_constant = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer square_workspace { mat2x4 square_workspace_data[]; }; // 8 floats per element
+layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; // scaled in place
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w; // used below as the workspace row stride
+    int h;
+    int c;
+    int cstep; // used below as the workspace stride between gz slices
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation rescales one 8-lane element of bottom_top_blob in place
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the blob extents
+        return;
+
+    // support region_type == 1 only
+
+    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
+
+    int v_offset = gz * p.cstep + gy * p.w + gx; // window starts at workspace (gx, gy); NOTE(review): presumably the workspace is padded so the window stays in range - confirm
+
+    for (int y = 0; y < local_size; y++)
+    {
+        for (int x = 0; x < local_size; x++)
+        {
+            sum += afpvec8(square_workspace_data[v_offset + x]);
+        }
+
+        v_offset += p.w; // next workspace row
+    }
+
+    const afp alpha_div_size = afp(alpha / (local_size * local_size));
+    afpvec8 scale; // (bias_constant + alpha / local_size^2 * sum) ^ -beta, per lane
+    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
+    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));
+
+    int gi = gz * p.outcstep + gy * p.outw + gx; // linear index of the element being scaled
+
+    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);
+
+    v[0] *= scale[0];
+    v[1] *= scale[1];
+
+    buffer_st8(bottom_top_blob_data, gi, v);
+}
diff --git
a/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp b/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
new file mode 100644
index 000000000..fcfdf9c8d
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int pad_head = 0; // number of zero channels written before the first source channel
+layout (constant_id = 2) const int pad_tail = 0;
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; // 8 channels per element
+layout (binding = 1) writeonly buffer square_workspace { float square_workspace_data[]; }; // one scalar per channel
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c; // number of 8-channel source elements along z
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation writes one workspace scalar: the squared source value, or 0 in the channel padding
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the workspace extents
+        return;
+
+    // support region_type == 0 only
+
+    afp res;
+
+    int z = gz - pad_head; // unpacked source channel; negative inside the head padding
+
+    if (z >= 0 && z < p.c * 8) // range-check BEFORE dividing: -1 / 8 is 0 under truncating division and would pass a post-division check
+    {
+        int v_offset = (z / 8) * p.cstep + gy * p.w + gx; // packed element that holds channel z
+        afpvec8 v8 = buffer_ld8(bottom_blob_data, v_offset);
+
+        int lane = z % 8; // z >= 0 here, so the remainder is in 0..7
+
+        afp v = v8[lane / 4][lane % 4]; // pick the 4-lane half, then the component
+
+        res = v * v;
+    }
+    else
+    {
+        res = afp(0.f); // head or tail padding
+    }
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx;
+
+    square_workspace_data[gi] = float(res);
+}
diff --git a/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp b/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
new file mode 100644
index 000000000..49118dbd3
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
@@ -0,0 +1,82 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int pad_head = 0; // offset applied to both x and y below
+layout (constant_id = 2) const int pad_tail = 0; // not read by this shader
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer square_workspace { mat2x4 square_workspace_data[]; }; // 8 floats per element
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation writes one workspace element: the lane-wise square of the source, or 0 in the border
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the workspace extents
+        return;
+
+    // support region_type == 1 only
+
+    afpvec8 res;
+
+    int x = gx - pad_head; // source coordinates; outside 0..w-1 / 0..h-1 means border
+    int y = gy - pad_head;
+
+    if (x >= 0 && x < p.w && y >= 0 && y < p.h)
+    {
+        int v_offset = gz * p.cstep + y * p.w + x;
+        afpvec8 v = buffer_ld8(bottom_blob_data, v_offset);
+        res[0] = v[0] * v[0]; // component-wise square of each 4-lane half
+        res[1] = v[1] * v[1];
+    }
+    else
+    {
+        res = afpvec8(afpvec4(0.f), afpvec4(0.f)); // border
+    }
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx;
+
+    square_workspace_data[gi] = mat2x4(res);
+}
diff --git a/src/layer/vulkan/shader/reshape_pack1to8.comp b/src/layer/vulkan/shader/reshape_pack1to8.comp
new file mode 100644
index 000000000..d0d381885
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack1to8.comp
@@ -0,0 +1,92 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // 1, 2 or 3; selects how the output index is linearised below
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // scalar source
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; // 8-lane destination
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation fills one 8-lane output element from eight scalar source elements
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the output extents
+        return;
+
+    ivec4 i4; // flat logical indices of output lanes 0-3
+    ivec4 ii4; // flat logical indices of output lanes 4-7
+
+    if (ndim == 1)
+    {
+        i4 = gx * 8 + ivec4(0, 1, 2, 3); // the 8 lanes are consecutive along x
+        ii4 = i4 + 4;
+    }
+    if (ndim == 2)
+    {
+        i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; // the 8 lanes are consecutive rows
+        ii4 = i4 + 4 * p.outw;
+    }
+    if (ndim == 3)
+    {
+        i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; // the 8 lanes are consecutive channels
+        ii4 = i4 + 4 * p.outh * p.outw;
+    }
+
+    int size = p.w * p.h;
+
+    ivec4 z4 = i4 / size; // split each flat index into source (z, y, x)
+    ivec4 y4 = i4 % size / p.w;
+    ivec4 x4 = i4 % size % p.w;
+    ivec4 zz4 = ii4 / size;
+    ivec4 yy4 = ii4 % size / p.w;
+    ivec4 xx4 = ii4 % size % p.w;
+
+    ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; // source buffer offsets (p.cstep between z slices)
+    ivec4 vv_offset = zz4 * p.cstep + yy4 * p.w + xx4;
+
+    int gi = gz * p.outcstep + gy * p.outw + gx;
+
+    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); // helper defined outside this patch; presumably gathers the 8 scalars into element gi
+}
diff --git a/src/layer/vulkan/shader/reshape_pack4.comp b/src/layer/vulkan/shader/reshape_pack4.comp
index ff294d2ac..daee2ed61 100644
--- a/src/layer/vulkan/shader/reshape_pack4.comp
+++ b/src/layer/vulkan/shader/reshape_pack4.comp
@@ -27,7 +27,11 @@
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;
-layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; // source viewed as 2-lane elements
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // source viewed as scalars
+#endif
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 
 layout (push_constant) uniform parameter
@@ -60,21 +64,22 @@ void main()
     if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
     if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
 
+#if NCNN_fp16_packed
     ivec4 v_offset;
-    ivec4 lane4;
+    ivec4 lane2; // component (0 or 1) inside the 2-lane element at v_offset
 
     if (p.dims == 1)
     {
-        v_offset = i4 / 4;
-        lane4 = i4 % 4;
+        v_offset = i4 / 2;
+        lane2 = i4 % 2;
     }
     else if (p.dims == 2)
     {
         ivec4 y4 = i4 / p.w;
         ivec4 x4 = i4 % p.w;
 
-        v_offset = (y4 / 4) * p.w + x4;
-        lane4 = y4 % 4;
+        v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2; // two 2-lane elements per 4-lane source element
+        lane2 = y4 % 2;
     }
     else // if (p.dims == 3)
     {
@@ -84,30 +89,55 @@ void main()
         ivec4 y4 = i4 % size / p.w;
         ivec4 x4 = i4 % size % p.w;
 
-        v_offset = (z4 / 4) * p.cstep + y4 * p.w + x4;
-        lane4 = z4 % 4;
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2; // two 2-lane elements per 4-lane source element
+        lane2 = z4 % 2;
     }
 
     int gi;
-    if (ndim == 1)
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); // one 2-lane load per output lane
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); // keep only the wanted component of each load
+
+    buffer_st4(top_blob_data, gi, v);
+#else
+    ivec4 v_offset; // scalar offsets into the source, one per output lane
+
+    if (p.dims == 1)
     {
-        gi = gx;
+        v_offset = i4;
     }
-    if (ndim == 2)
+    else if (p.dims == 2)
     {
-        gi = gy * p.outw + gx;
+        ivec4 y4 = i4 / p.w;
+        ivec4 x4 = i4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4; // 4-lane element index * 4 + lane
     }
-    if (ndim == 3)
+    else // if (p.dims == 3)
     {
-        gi = gz * p.outcstep + gy * p.outw + gx;
+        int size = p.w * p.h;
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / p.w;
+        ivec4 x4 = i4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4; // 4-lane element index * 4 + lane
     }
 
-    afpvec4 vr = buffer_ld4(bottom_blob_data, v_offset.r);
-    afpvec4 vg = buffer_ld4(bottom_blob_data, v_offset.g);
-    afpvec4 vb = buffer_ld4(bottom_blob_data, v_offset.b);
-    afpvec4 va = buffer_ld4(bottom_blob_data, v_offset.a);
+    int gi;
 
-    afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]);
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
 
-    buffer_st4(top_blob_data, gi, v);
+    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); // helper defined outside this patch; presumably gathers four scalars into element gi
+#endif
 }
diff --git a/src/layer/vulkan/shader/reshape_pack4to1.comp b/src/layer/vulkan/shader/reshape_pack4to1.comp
index 3644e91cb..8e5efd381 100644
--- a/src/layer/vulkan/shader/reshape_pack4to1.comp
+++ b/src/layer/vulkan/shader/reshape_pack4to1.comp
@@ -54,11 +54,15 @@ void main()
     if (gx >= p.w || gy >= p.h || gz >= p.c)
         return;
 
-    ivec4 i4;
+    ivec3 gxyz = ivec3(gx, gy, gz);
 
-    if (p.dims == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3);
-    if (p.dims == 2) i4 = (gy * 4) * p.w + gx + ivec4(0, 1, 2, 3) * p.w;
-    if (p.dims == 3) i4 = (gz * 4) * p.h * p.w + gy * p.w + gx + ivec4(0, 1, 2, 3) * p.h * p.w;
+    gxyz[p.dims - 1] *= 4; // the axis selected by p.dims holds four lanes per element
+
+    int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x; // flat logical index of lane 0
+
+    ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w); // flat-index stride of each axis
+
+    ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1]; // lanes step along the selected axis
 
     ivec4 v_offset;
 
@@ -66,14 +70,14 @@ void main()
     {
         v_offset = i4;
     }
-    else if (ndim == 2)
+    if (ndim == 2)
     {
         ivec4 y4 = i4 / p.outw;
         ivec4 x4 = i4 % p.outw;
 
         v_offset = y4 * p.outw + x4;
     }
-    else // if (ndim == 3)
+    if (ndim == 3)
     {
         int size = p.outw * p.outh;
 
diff --git a/src/layer/vulkan/shader/reshape_pack4to8.comp
b/src/layer/vulkan/shader/reshape_pack4to8.comp
new file mode 100644
index 000000000..4f3ff5fba
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack4to8.comp
@@ -0,0 +1,184 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // 1, 2 or 3; selects how the output index is linearised below
+
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; // source viewed as 2-lane elements
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // source viewed as scalars
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main() // each invocation fills one 8-lane output element from eight lanes of the 4-lane-packed source
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocation falls outside the output extents
+        return;
+
+    ivec4 i4; // flat logical indices of output lanes 0-3
+    ivec4 ii4; // flat logical indices of output lanes 4-7
+
+    if (ndim == 1)
+    {
+        i4 = gx * 8 + ivec4(0, 1, 2, 3);
+        ii4 = i4 + 4;
+    }
+    if (ndim == 2)
+    {
+        i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
+        ii4 = i4 + 4 * p.outw;
+    }
+    if (ndim == 3)
+    {
+        i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
+        ii4 = i4 + 4 * p.outh * p.outw;
+    }
+
+#if NCNN_fp16_packed
+    ivec4 v_offset; // 2-lane element offsets for output lanes 0-3
+    ivec4 vv_offset; // 2-lane element offsets for output lanes 4-7
+    ivec4 lane2; // component (0 or 1) to keep from each load at v_offset
+    ivec4 lane4; // component (0 or 1) to keep from each load at vv_offset
+
+    if (p.dims == 1)
+    {
+        v_offset = i4 / 2;
+        lane2 = i4 % 2;
+        vv_offset = ii4 / 2;
+        lane4 = ii4 % 2;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w;
+        ivec4 x4 = i4 % p.w;
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2; // two 2-lane elements per 4-lane source element
+        lane2 = y4 % 2;
+        vv_offset = ((yy4 / 4) * p.w + xx4) * 2 + (yy4 % 4) / 2;
+        lane4 = yy4 % 2;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h;
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / p.w;
+        ivec4 x4 = i4 % size % p.w;
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2; // two 2-lane elements per 4-lane source element
+        lane2 = z4 % 2;
+        vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 2 + (zz4 % 4) / 2;
+        lane4 = zz4 % 2;
+    }
+
+    int gi;
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); // one 2-lane load per output lane
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
+    afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
+    afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
+    afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
+
+    afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); // keep only the wanted component of each load
+
+    buffer_st8(top_blob_data, gi, v);
+#else
+    ivec4 v_offset; // scalar offsets for output lanes 0-3
+    ivec4 vv_offset; // scalar offsets for output lanes 4-7
+
+    if (p.dims == 1)
+    {
+        v_offset = i4;
+        vv_offset = ii4;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w;
+        ivec4 x4 = i4 % p.w;
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4; // 4-lane element index * 4 + lane
+        vv_offset = ((yy4 / 4) * p.w + xx4) * 4 + yy4 % 4;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h;
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / p.w;
+        ivec4 x4 = i4 % size % p.w;
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4; // 4-lane element index * 4 + lane
+        vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 4 + zz4 % 4;
+    }
+
+    int gi;
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); // helper defined outside this patch; presumably gathers the 8 scalars into element gi
+#endif
+}
diff --git a/src/layer/vulkan/shader/reshape_pack8.comp b/src/layer/vulkan/shader/reshape_pack8.comp
new file mode 100644
index 000000000..9002f38e0
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack8.comp
@@ -0,0 +1,184 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= p.outc) + return; + + ivec4 i4; + ivec4 ii4; + + if (ndim == 1) + { + i4 = gx * 8 + ivec4(0, 1, 2, 3); + ii4 = i4 + 4; + } + if (ndim == 2) + { + i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; + ii4 = i4 + 4 * p.outw; + } + if (ndim == 3) + { + i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; + ii4 = i4 + 4 * p.outh * p.outw; + } + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 vv_offset; + ivec4 lane2; + ivec4 lane4; + + if (p.dims == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + vv_offset = ii4 / 2; + lane4 = ii4 % 2; + } + else if (p.dims == 2) + { + ivec4 y4 = i4 / p.w; + ivec4 x4 = i4 % p.w; + ivec4 yy4 = ii4 / p.w; + ivec4 xx4 = ii4 % p.w; + + v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + vv_offset = ((yy4 / 8) * p.w + xx4) * 4 + (yy4 % 8) / 2; + lane4 = yy4 % 2; + } + else // if (p.dims == 3) + { + int size = p.w * p.h; + + ivec4 z4 = i4 / size; + 
ivec4 y4 = i4 % size / p.w; + ivec4 x4 = i4 % size % p.w; + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / p.w; + ivec4 xx4 = ii4 % size % p.w; + + v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 4 + (zz4 % 8) / 2; + lane4 = zz4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * p.outw + gx; + if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r); + afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g); + afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b); + afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a); + + afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); + + buffer_st8(top_blob_data, gi, v); +#else + ivec4 v_offset; + ivec4 vv_offset; + + if (p.dims == 1) + { + v_offset = i4; + vv_offset = ii4; + } + else if (p.dims == 2) + { + ivec4 y4 = i4 / p.w; + ivec4 x4 = i4 % p.w; + ivec4 yy4 = ii4 / p.w; + ivec4 xx4 = ii4 % p.w; + + v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8; + vv_offset = ((yy4 / 8) * p.w + xx4) * 8 + yy4 % 8; + } + else // if (p.dims == 3) + { + int size = p.w * p.h; + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / p.w; + ivec4 x4 = i4 % size % p.w; + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / p.w; + ivec4 xx4 = ii4 % size % p.w; + + v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 8 + z4 % 8; + vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 8 + zz4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * p.outw + gx; + if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; + + buffer_cp1to8(top_blob_data, gi, 
bottom_blob_data, v_offset, vv_offset); +#endif +} diff --git a/src/layer/vulkan/shader/reshape_pack8to1.comp b/src/layer/vulkan/shader/reshape_pack8to1.comp new file mode 100644 index 000000000..0307721a0 --- /dev/null +++ b/src/layer/vulkan/shader/reshape_pack8to1.comp @@ -0,0 +1,105 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; +layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.w || gy >= p.h || gz >= p.c) + return; + + ivec3 gxyz = ivec3(gx, gy, gz); + + gxyz[p.dims - 1] *= 8; + + int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x; + + ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w); + + ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1]; + ivec4 ii4 = i4 + 4 * gxyz4[p.dims - 1]; + + ivec4 v_offset; + ivec4 vv_offset; + + if (ndim == 1) + { + v_offset = i4; + vv_offset = ii4; + } + if (ndim == 2) + { + ivec4 y4 = i4 / p.outw; + ivec4 x4 = i4 % p.outw; + ivec4 yy4 = ii4 / p.outw; + ivec4 xx4 = ii4 % p.outw; + + v_offset = y4 * p.outw + x4; + vv_offset = yy4 * p.outw + xx4; + } + if (ndim == 3) + { + int size = p.outw * p.outh; + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / p.outw; + ivec4 x4 = i4 % size % p.outw; + ivec4 zz4 = ii4 / size; + ivec4 yy4 = ii4 % size / p.outw; + ivec4 xx4 = ii4 % size % p.outw; + + v_offset = z4 * p.outcstep + y4 * p.outw + x4; + vv_offset = zz4 * p.outcstep + yy4 * p.outw + xx4; + } + + int gi = gz * p.cstep + gy * p.w + gx; + + buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); +} diff --git 
a/src/layer/vulkan/shader/reshape_pack8to4.comp b/src/layer/vulkan/shader/reshape_pack8to4.comp new file mode 100644 index 000000000..e98feb988 --- /dev/null +++ b/src/layer/vulkan/shader/reshape_pack8to4.comp @@ -0,0 +1,144 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. + +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +layout (constant_id = 0) const int ndim = 0; + +layout (local_size_x_id = 233) in; +layout (local_size_y_id = 234) in; +layout (local_size_z_id = 235) in; + +#if NCNN_fp16_packed +layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; +#else +layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; +#endif +layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; + +layout (push_constant) uniform parameter +{ + int dims; + int w; + int h; + int c; + int cstep; + + int outdims; + int outw; + int outh; + int outc; + int outcstep; +} p; + +void main() +{ + int gx = int(gl_GlobalInvocationID.x); + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + + if (gx >= p.outw || gy >= p.outh || gz >= 
p.outc) + return; + + ivec4 i4; + + if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); + if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; + if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; + +#if NCNN_fp16_packed + ivec4 v_offset; + ivec4 lane2; + + if (p.dims == 1) + { + v_offset = i4 / 2; + lane2 = i4 % 2; + } + else if (p.dims == 2) + { + ivec4 y4 = i4 / p.w; + ivec4 x4 = i4 % p.w; + + v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2; + lane2 = y4 % 2; + } + else // if (p.dims == 3) + { + int size = p.w * p.h; + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / p.w; + ivec4 x4 = i4 % size % p.w; + + v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 4 + (z4 % 8) / 2; + lane2 = z4 % 2; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * p.outw + gx; + if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; + + afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); + afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); + afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); + afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); + + afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); + + buffer_st4(top_blob_data, gi, v); +#else + ivec4 v_offset; + + if (p.dims == 1) + { + v_offset = i4; + } + else if (p.dims == 2) + { + ivec4 y4 = i4 / p.w; + ivec4 x4 = i4 % p.w; + + v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8; + } + else // if (p.dims == 3) + { + int size = p.w * p.h; + + ivec4 z4 = i4 / size; + ivec4 y4 = i4 % size / p.w; + ivec4 x4 = i4 % size % p.w; + + v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 8 + z4 % 8; + } + + int gi; + + if (ndim == 1) gi = gx; + if (ndim == 2) gi = gy * p.outw + gx; + if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; + + buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); +#endif +}