| @@ -0,0 +1,115 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; }; | |||
| layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; }; | |||
| layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; }; | |||
| layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Loads the four horizontally adjacent elements at anchor-1 .. anchor+2 from
// bottom_blob_data and returns their weighted sum, applying wgt.r, wgt.g,
// wgt.b and wgt.a to them in that order.
afpvec8 blend_row_taps(int anchor, afpvec4 wgt)
{
    afpvec8 t0 = buffer_ld8(bottom_blob_data, anchor - 1);
    afpvec8 t1 = buffer_ld8(bottom_blob_data, anchor);
    afpvec8 t2 = buffer_ld8(bottom_blob_data, anchor + 1);
    afpvec8 t3 = buffer_ld8(bottom_blob_data, anchor + 2);

    afpvec8 acc;
    acc[0] = t0[0] * wgt.r + t1[0] * wgt.g + t2[0] * wgt.b + t3[0] * wgt.a;
    acc[1] = t0[1] * wgt.r + t1[1] * wgt.g + t2[1] * wgt.b + t3[1] * wgt.a;
    return acc;
}

// Separable 4x4 weighted resample of one output element (8 packed lanes).
// The source anchor (sx, sy) for output (gx, gy) is read from the xofs/yofs
// tables; taps sit at offsets -1, 0, +1, +2 around it on each axis.
// Horizontal weights come from alpha_blob_data[gx], vertical weights from
// beta_blob_data[gy] (presumably cubic coefficients prepared by the host --
// not visible in this shader).
// NOTE(review): nothing is clamped here, so the offset tables are assumed to
// keep every tap inside the bottom blob -- confirm against the caller.
void main()
{
    const ivec3 gid = ivec3(gl_GlobalInvocationID);

    // Drop invocations that fall outside the output extent.
    if (!(gid.x < p.outw && gid.y < p.outh && gid.z < p.outc))
        return;

    const int src_x = xofs_blob_data[gid.x];
    const int src_y = yofs_blob_data[gid.y];

    const afpvec4 weights_x = buffer_ld4(alpha_blob_data, gid.x);
    const afpvec4 weights_y = buffer_ld4(beta_blob_data, gid.y);

    // Index of the anchor column inside each of the four source rows.
    const int column_base = gid.z * p.cstep + src_x;
    const int anchor_up = column_base + (src_y - 1) * p.w;
    const int anchor_mid = column_base + src_y * p.w;
    const int anchor_down = column_base + (src_y + 1) * p.w;
    const int anchor_down2 = column_base + (src_y + 2) * p.w;

    // Horizontal pass: collapse each source row to a single value.
    afpvec8 row_up = blend_row_taps(anchor_up, weights_x);
    afpvec8 row_mid = blend_row_taps(anchor_mid, weights_x);
    afpvec8 row_down = blend_row_taps(anchor_down, weights_x);
    afpvec8 row_down2 = blend_row_taps(anchor_down2, weights_x);

    // Vertical pass: combine the four row results.
    afpvec8 result;
    result[0] = row_up[0] * weights_y.r + row_mid[0] * weights_y.g + row_down[0] * weights_y.b + row_down2[0] * weights_y.a;
    result[1] = row_up[1] * weights_y.r + row_mid[1] * weights_y.g + row_down[1] * weights_y.b + row_down2[1] * weights_y.a;

    const int dst_index = gid.z * p.outcstep + gid.y * p.outw + gid.x;
    buffer_st8(top_blob_data, dst_index, result);
}
| @@ -0,0 +1,120 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int resize_type = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| float scale_x; | |||
| float scale_y; | |||
| } p; | |||
// Resizes one output element (8 packed lanes) from bottom_blob to top_blob.
//   resize_type == 1: copy the nearest source element.
//   resize_type == 2: bilinear blend of the 2x2 source neighbourhood.
// Any other resize_type leaves the output element unwritten.
// p.scale_x / p.scale_y convert output coordinates to source coordinates.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    if (resize_type == 1) // nearest
    {
        afpvec2 gxy = afpvec2(gx, gy);
        ivec2 sxy_max = ivec2(p.w - 1, p.h - 1);

        // min() keeps the scaled coordinate from stepping past the last source element.
        ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset = gz * p.cstep + sy * p.w + sx;

        buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset);
    }
    else if (resize_type == 2) // bilinear
    {
        // Sample position uses pixel centres: (g + 0.5) * scale - 0.5.
        afpvec2 gxy = afpvec2(gx, gy);
        afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f);

        // Split into the integer top-left tap and the fractional blend weight.
        ivec2 sxy = ivec2(floor(fxy));
        fxy -= afpvec2(sxy);

        // The top-left tap is limited to [0, w-2] x [0, h-2] so the +1 taps stay in range.
        // NOTE(review): for w == 1 or h == 1 this upper bound is negative -- confirm
        // the host never dispatches the bilinear path for such inputs.
        ivec2 sxy_max = ivec2(p.w - 2, p.h - 2);

        bvec2 underflow = lessThan(sxy, ivec2(0));
        bvec2 overflow = greaterThan(sxy, sxy_max);

        sxy = clamp(sxy, ivec2(0), sxy_max);

        // Where the tap was clamped, pin the weight to the edge element (0 = first, 1 = last).
        fxy = mix(fxy, afpvec2(0.f), underflow);
        fxy = mix(fxy, afpvec2(1.f), overflow);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset_0 = gz * p.cstep + sy * p.w + sx;
        int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;

        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0);
        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1);
        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);

        afp fx = fxy.r;
        afp fy = fxy.g;

        // Horizontal blend of the upper (a) and lower (b) rows. Both 4-lane halves
        // of each row must be computed: the vertical blend below reads all four.
        afpvec8 a;
        afpvec8 b;
        a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx;
        a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx;
        b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx;
        b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx;

        // Vertical blend between the two rows.
        afpvec8 res;
        res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy;
        res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy;

        buffer_st8(top_blob_data, gi, res);
    }
}
| @@ -0,0 +1,98 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int region_type = 0; | |||
| layout (constant_id = 1) const int local_size = 0; | |||
| layout (constant_id = 2) const float alpha = 0; | |||
| layout (constant_id = 3) const float beta = 0; | |||
| layout (constant_id = 4) const float bias_constant = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer square_workspace { float square_workspace_data[]; }; | |||
| layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Scales one element (8 packed lanes) of bottom_top_blob in place by
//   pow(bias_constant + alpha / local_size * sum, -beta)
// where, for each lane, sum adds local_size consecutive channels of the
// scalar square workspace starting at workspace channel gz * 8 + lane.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));

    // First workspace channel of the window for lanes 0..3 (z4) and lanes 4..7 (zz4).
    ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3);
    ivec4 zz4 = z4 + 4;

    ivec4 v_offset = z4 * p.cstep + gy * p.w + gx;
    ivec4 vv_offset = zz4 * p.cstep + gy * p.w + gx;

    for (int z = 0; z < local_size; z++)
    {
        sum[0].r += afp(square_workspace_data[v_offset.r]);
        sum[0].g += afp(square_workspace_data[v_offset.g]);
        sum[0].b += afp(square_workspace_data[v_offset.b]);
        sum[0].a += afp(square_workspace_data[v_offset.a]);

        sum[1].r += afp(square_workspace_data[vv_offset.r]);
        sum[1].g += afp(square_workspace_data[vv_offset.g]);
        sum[1].b += afp(square_workspace_data[vv_offset.b]);
        sum[1].a += afp(square_workspace_data[vv_offset.a]);

        // Step BOTH lane groups to the next workspace channel; without advancing
        // vv_offset, lanes 4..7 would add the same value local_size times.
        v_offset += p.cstep;
        vv_offset += p.cstep;
    }

    const afp alpha_div_size = afp(alpha / local_size);

    afpvec8 scale;
    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));

    int gi = gz * p.outcstep + gy * p.outw + gx;

    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);

    v[0] *= scale[0];
    v[1] *= scale[1];

    buffer_st8(bottom_top_blob_data, gi, v);
}
| @@ -0,0 +1,91 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int region_type = 0; | |||
| layout (constant_id = 1) const int local_size = 0; | |||
| layout (constant_id = 2) const float alpha = 0; | |||
| layout (constant_id = 3) const float beta = 0; | |||
| layout (constant_id = 4) const float bias_constant = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer square_workspace { mat2x4 square_workspace_data[]; }; | |||
| layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Scales one element (8 packed lanes) of bottom_top_blob in place by
//   pow(bias_constant + alpha / (local_size * local_size) * sum, -beta)
// where sum accumulates a local_size x local_size window of the square
// workspace whose top-left corner is at (gx, gy) in workspace channel gz.
void main()
{
    const ivec3 gid = ivec3(gl_GlobalInvocationID);

    if (!(gid.x < p.outw && gid.y < p.outh && gid.z < p.outc))
        return;

    // support region_type == 1 only

    // Workspace index of the window's top-left element.
    const int window_origin = gid.z * p.cstep + gid.y * p.w + gid.x;

    // Accumulate row by row, left to right.
    afpvec8 acc = afpvec8(afpvec4(0.f), afpvec4(0.f));
    for (int wy = 0; wy < local_size; wy++)
    {
        const int row_start = window_origin + wy * p.w;
        for (int wx = 0; wx < local_size; wx++)
        {
            acc += afpvec8(square_workspace_data[row_start + wx]);
        }
    }

    const afp alpha_div_size = afp(alpha / (local_size * local_size));
    const afpvec4 exponent = afpvec4(-beta);

    afpvec8 factor;
    factor[0] = pow(afp(bias_constant) + alpha_div_size * acc[0], exponent);
    factor[1] = pow(afp(bias_constant) + alpha_div_size * acc[1], exponent);

    const int elem_index = gid.z * p.outcstep + gid.y * p.outw + gid.x;

    afpvec8 value = buffer_ld8(bottom_top_blob_data, elem_index);
    value[0] = value[0] * factor[0];
    value[1] = value[1] * factor[1];
    buffer_st8(bottom_top_blob_data, elem_index, value);
}
| @@ -0,0 +1,85 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int region_type = 0; | |||
| layout (constant_id = 1) const int pad_head = 0; | |||
| layout (constant_id = 2) const int pad_tail = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer square_workspace { float square_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Writes one scalar element of the square workspace.
// Workspace channel gz corresponds to source scalar channel gz - pad_head.
// The source stores 8 scalar channels per element, so that channel lives in
// packed channel (gz - pad_head) / 8, lane (gz - pad_head) % 8.
// Channels that fall outside the source are written as zero.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afp res;

    // Scalar source channel; negative inside the head padding.
    const int zi = gz - pad_head;

    // Range-check the undivided index: dividing first would round -7..-1 to
    // packed channel 0 and then index the loaded value with a negative lane.
    if (zi >= 0 && zi < p.c * 8)
    {
        int v_offset = (zi / 8) * p.cstep + gy * p.w + gx;
        afpvec8 v8 = buffer_ld8(bottom_blob_data, v_offset);

        // lane / 4 picks the 4-lane half, lane % 4 the component inside it.
        int lane = zi % 8;
        afp v = v8[lane / 4][lane % 4];

        res = v * v;
    }
    else
    {
        res = afp(0.f);
    }

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    square_workspace_data[gi] = float(res);
}
| @@ -0,0 +1,82 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int region_type = 0; | |||
| layout (constant_id = 1) const int pad_head = 0; | |||
| layout (constant_id = 2) const int pad_tail = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer square_workspace { mat2x4 square_workspace_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Writes one element (8 lanes) of the square workspace.
// Workspace position (gx, gy) corresponds to source position
// (gx - pad_head, gy - pad_head) in the same channel gz. Positions inside
// the source get the lane-wise square of the source value; positions
// outside it get zero.
void main()
{
    const ivec3 gid = ivec3(gl_GlobalInvocationID);

    if (!(gid.x < p.outw && gid.y < p.outh && gid.z < p.outc))
        return;

    // support region_type == 1 only

    const int src_x = gid.x - pad_head;
    const int src_y = gid.y - pad_head;
    const bool inside = src_x >= 0 && src_x < p.w && src_y >= 0 && src_y < p.h;

    // Start from the padding value and overwrite it when a source element exists.
    afpvec8 squared = afpvec8(afpvec4(0.f), afpvec4(0.f));
    if (inside)
    {
        const int src_index = gid.z * p.cstep + src_y * p.w + src_x;
        afpvec8 value = buffer_ld8(bottom_blob_data, src_index);

        squared[0] = value[0] * value[0];
        squared[1] = value[1] * value[1];
    }

    const int dst_index = gid.z * p.outcstep + gid.y * p.outw + gid.x;
    square_workspace_data[dst_index] = mat2x4(squared);
}
| @@ -0,0 +1,92 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int ndim = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
// Gathers eight scalar source values into one 8-lane output element.
// The eight lanes of output element (gx, gy, gz) are given linear indices
// along the packed axis selected by ndim (x for 1, y for 2, z for 3); each
// linear index is then split into (z, y, x) using the source extents p.w and
// p.h and turned into a source buffer index using p.cstep.
void main()
{
    const int gx = int(gl_GlobalInvocationID.x);
    const int gy = int(gl_GlobalInvocationID.y);
    const int gz = int(gl_GlobalInvocationID.z);

    if (!(gx < p.outw && gy < p.outh && gz < p.outc))
        return;

    const ivec4 lane_steps = ivec4(0, 1, 2, 3);

    // Linear indices for lanes 0..3 (lin_lo) and lanes 4..7 (lin_hi).
    // The three tests are kept as independent statements, as in the other
    // ndim-specialised shaders of this family.
    ivec4 lin_lo;
    ivec4 lin_hi;

    if (ndim == 1)
    {
        lin_lo = gx * 8 + lane_steps;
        lin_hi = lin_lo + 4;
    }
    if (ndim == 2)
    {
        const int lane_stride = p.outw;
        lin_lo = (gy * 8) * lane_stride + gx + lane_steps * lane_stride;
        lin_hi = lin_lo + 4 * lane_stride;
    }
    if (ndim == 3)
    {
        const int lane_stride = p.outh * p.outw;
        lin_lo = (gz * 8) * lane_stride + gy * p.outw + gx + lane_steps * lane_stride;
        lin_hi = lin_lo + 4 * lane_stride;
    }

    // Split each linear index into source channel / row / column.
    const int plane_size = p.w * p.h;

    const ivec4 lo_in_plane = lin_lo % plane_size;
    const ivec4 lo_chan = lin_lo / plane_size;
    const ivec4 lo_row = lo_in_plane / p.w;
    const ivec4 lo_col = lo_in_plane % p.w;

    const ivec4 hi_in_plane = lin_hi % plane_size;
    const ivec4 hi_chan = lin_hi / plane_size;
    const ivec4 hi_row = hi_in_plane / p.w;
    const ivec4 hi_col = hi_in_plane % p.w;

    ivec4 src_lo = lo_chan * p.cstep + lo_row * p.w + lo_col;
    ivec4 src_hi = hi_chan * p.cstep + hi_row * p.w + hi_col;

    int dst_index = gz * p.outcstep + gy * p.outw + gx;

    buffer_cp1to8(top_blob_data, dst_index, bottom_blob_data, src_lo, src_hi);
}
| @@ -27,7 +27,11 @@ layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| #endif | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| @@ -60,21 +64,22 @@ void main() | |||
| if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; | |||
| if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; | |||
| #if NCNN_fp16_packed | |||
| ivec4 v_offset; | |||
| ivec4 lane4; | |||
| ivec4 lane2; | |||
| if (p.dims == 1) | |||
| { | |||
| v_offset = i4 / 4; | |||
| lane4 = i4 % 4; | |||
| v_offset = i4 / 2; | |||
| lane2 = i4 % 2; | |||
| } | |||
| else if (p.dims == 2) | |||
| { | |||
| ivec4 y4 = i4 / p.w; | |||
| ivec4 x4 = i4 % p.w; | |||
| v_offset = (y4 / 4) * p.w + x4; | |||
| lane4 = y4 % 4; | |||
| v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2; | |||
| lane2 = y4 % 2; | |||
| } | |||
| else // if (p.dims == 3) | |||
| { | |||
| @@ -84,30 +89,55 @@ void main() | |||
| ivec4 y4 = i4 % size / p.w; | |||
| ivec4 x4 = i4 % size % p.w; | |||
| v_offset = (z4 / 4) * p.cstep + y4 * p.w + x4; | |||
| lane4 = z4 % 4; | |||
| v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2; | |||
| lane2 = z4 % 2; | |||
| } | |||
| int gi; | |||
| if (ndim == 1) | |||
| if (ndim == 1) gi = gx; | |||
| if (ndim == 2) gi = gy * p.outw + gx; | |||
| if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; | |||
| afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); | |||
| afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g); | |||
| afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b); | |||
| afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a); | |||
| afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); | |||
| buffer_st4(top_blob_data, gi, v); | |||
| #else | |||
| ivec4 v_offset; | |||
| if (p.dims == 1) | |||
| { | |||
| gi = gx; | |||
| v_offset = i4; | |||
| } | |||
| if (ndim == 2) | |||
| else if (p.dims == 2) | |||
| { | |||
| gi = gy * p.outw + gx; | |||
| ivec4 y4 = i4 / p.w; | |||
| ivec4 x4 = i4 % p.w; | |||
| v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4; | |||
| } | |||
| if (ndim == 3) | |||
| else // if (p.dims == 3) | |||
| { | |||
| gi = gz * p.outcstep + gy * p.outw + gx; | |||
| int size = p.w * p.h; | |||
| ivec4 z4 = i4 / size; | |||
| ivec4 y4 = i4 % size / p.w; | |||
| ivec4 x4 = i4 % size % p.w; | |||
| v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4; | |||
| } | |||
| afpvec4 vr = buffer_ld4(bottom_blob_data, v_offset.r); | |||
| afpvec4 vg = buffer_ld4(bottom_blob_data, v_offset.g); | |||
| afpvec4 vb = buffer_ld4(bottom_blob_data, v_offset.b); | |||
| afpvec4 va = buffer_ld4(bottom_blob_data, v_offset.a); | |||
| int gi; | |||
| afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]); | |||
| if (ndim == 1) gi = gx; | |||
| if (ndim == 2) gi = gy * p.outw + gx; | |||
| if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx; | |||
| buffer_st4(top_blob_data, gi, v); | |||
| buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); | |||
| #endif | |||
| } | |||
| @@ -54,11 +54,15 @@ void main() | |||
| if (gx >= p.w || gy >= p.h || gz >= p.c) | |||
| return; | |||
| ivec4 i4; | |||
| ivec3 gxyz = ivec3(gx, gy, gz); | |||
| if (p.dims == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); | |||
| if (p.dims == 2) i4 = (gy * 4) * p.w + gx + ivec4(0, 1, 2, 3) * p.w; | |||
| if (p.dims == 3) i4 = (gz * 4) * p.h * p.w + gy * p.w + gx + ivec4(0, 1, 2, 3) * p.h * p.w; | |||
| gxyz[p.dims - 1] *= 4; | |||
| int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x; | |||
| ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w); | |||
| ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1]; | |||
| ivec4 v_offset; | |||
| @@ -66,14 +70,14 @@ void main() | |||
| { | |||
| v_offset = i4; | |||
| } | |||
| else if (ndim == 2) | |||
| if (ndim == 2) | |||
| { | |||
| ivec4 y4 = i4 / p.outw; | |||
| ivec4 x4 = i4 % p.outw; | |||
| v_offset = y4 * p.outw + x4; | |||
| } | |||
| else // if (ndim == 3) | |||
| if (ndim == 3) | |||
| { | |||
| int size = p.outw * p.outh; | |||
| @@ -0,0 +1,184 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| layout (constant_id = 0) const int ndim = 0; | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| #endif | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
#if NCNN_fp16_packed
// Maps four linear source indices to positions in bottom_blob_data when it
// is declared as an array of 2-component elements. Returns the element
// index for each input and writes the component to read (0 or 1) to `lane`.
// For dims 2 and 3 the source is addressed in groups of four along y / z.
ivec4 locate_source(ivec4 lin, out ivec4 lane)
{
    ivec4 elem;

    if (p.dims == 1)
    {
        elem = lin / 2;
        lane = lin % 2;
    }
    else if (p.dims == 2)
    {
        const ivec4 src_row = lin / p.w;
        const ivec4 src_col = lin % p.w;

        elem = ((src_row / 4) * p.w + src_col) * 2 + (src_row % 4) / 2;
        lane = src_row % 2;
    }
    else // if (p.dims == 3)
    {
        const int plane_size = p.w * p.h;
        const ivec4 in_plane = lin % plane_size;

        const ivec4 src_chan = lin / plane_size;
        const ivec4 src_row = in_plane / p.w;
        const ivec4 src_col = in_plane % p.w;

        elem = ((src_chan / 4) * p.cstep + src_row * p.w + src_col) * 2 + (src_chan % 4) / 2;
        lane = src_chan % 2;
    }

    return elem;
}
#else
// Maps four linear source indices to scalar positions in bottom_blob_data.
// For dims 2 and 3 the source is addressed in groups of four along y / z:
// group index * 4 plus the position inside the group.
ivec4 locate_source(ivec4 lin)
{
    ivec4 elem;

    if (p.dims == 1)
    {
        elem = lin;
    }
    else if (p.dims == 2)
    {
        const ivec4 src_row = lin / p.w;
        const ivec4 src_col = lin % p.w;

        elem = ((src_row / 4) * p.w + src_col) * 4 + src_row % 4;
    }
    else // if (p.dims == 3)
    {
        const int plane_size = p.w * p.h;
        const ivec4 in_plane = lin % plane_size;

        const ivec4 src_chan = lin / plane_size;
        const ivec4 src_row = in_plane / p.w;
        const ivec4 src_col = in_plane % p.w;

        elem = ((src_chan / 4) * p.cstep + src_row * p.w + src_col) * 4 + src_chan % 4;
    }

    return elem;
}
#endif

// Gathers eight scalar source values into one 8-lane output element.
// The eight lanes of output element (gx, gy, gz) are given linear indices
// along the packed axis selected by ndim (x for 1, y for 2, z for 3); each
// linear index is then located in the source according to p.dims.
void main()
{
    const int gx = int(gl_GlobalInvocationID.x);
    const int gy = int(gl_GlobalInvocationID.y);
    const int gz = int(gl_GlobalInvocationID.z);

    if (!(gx < p.outw && gy < p.outh && gz < p.outc))
        return;

    const ivec4 lane_steps = ivec4(0, 1, 2, 3);

    // Linear indices for lanes 0..3 (lin_lo) and 4..7 (lin_hi), plus the
    // destination element index. The ndim tests stay independent statements.
    ivec4 lin_lo;
    ivec4 lin_hi;
    int dst_index;

    if (ndim == 1)
    {
        lin_lo = gx * 8 + lane_steps;
        lin_hi = lin_lo + 4;
        dst_index = gx;
    }
    if (ndim == 2)
    {
        const int lane_stride = p.outw;
        lin_lo = (gy * 8) * lane_stride + gx + lane_steps * lane_stride;
        lin_hi = lin_lo + 4 * lane_stride;
        dst_index = gy * p.outw + gx;
    }
    if (ndim == 3)
    {
        const int lane_stride = p.outh * p.outw;
        lin_lo = (gz * 8) * lane_stride + gy * p.outw + gx + lane_steps * lane_stride;
        lin_hi = lin_lo + 4 * lane_stride;
        dst_index = gz * p.outcstep + gy * p.outw + gx;
    }

#if NCNN_fp16_packed
    ivec4 lane_lo;
    ivec4 lane_hi;
    const ivec4 elem_lo = locate_source(lin_lo, lane_lo);
    const ivec4 elem_hi = locate_source(lin_hi, lane_hi);

    // Each load fetches a 2-component element; the lane index picks one half.
    afpvec2 s0 = buffer_ld2(bottom_blob_data, elem_lo.r);
    afpvec2 s1 = buffer_ld2(bottom_blob_data, elem_lo.g);
    afpvec2 s2 = buffer_ld2(bottom_blob_data, elem_lo.b);
    afpvec2 s3 = buffer_ld2(bottom_blob_data, elem_lo.a);

    afpvec2 s4 = buffer_ld2(bottom_blob_data, elem_hi.r);
    afpvec2 s5 = buffer_ld2(bottom_blob_data, elem_hi.g);
    afpvec2 s6 = buffer_ld2(bottom_blob_data, elem_hi.b);
    afpvec2 s7 = buffer_ld2(bottom_blob_data, elem_hi.a);

    afpvec8 gathered = afpvec8(s0[lane_lo.r], s1[lane_lo.g], s2[lane_lo.b], s3[lane_lo.a], s4[lane_hi.r], s5[lane_hi.g], s6[lane_hi.b], s7[lane_hi.a]);

    buffer_st8(top_blob_data, dst_index, gathered);
#else
    ivec4 elem_lo = locate_source(lin_lo);
    ivec4 elem_hi = locate_source(lin_hi);

    buffer_cp1to8(top_blob_data, dst_index, bottom_blob_data, elem_lo, elem_hi);
#endif
}
| @@ -0,0 +1,184 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| // With fp16 storage enabled, the 8-wide storage type is declared as two f16vec4 halves. | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| // Specialization constant 0 (default 0); main() compares it against 1, 2 and 3 to pick the output index math. | |||
| layout (constant_id = 0) const int ndim = 0; | |||
| // Workgroup dimensions are supplied through specialization constants 233, 234 and 235. | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| // Binding 0: read-only source buffer; its element type is sfpvec2 when NCNN_fp16_packed is set, sfp otherwise. | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| #endif | |||
| // Binding 1: write-only destination buffer of sfpvec8 elements. | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; | |||
| // Push constants, accessed as p.<field>. | |||
| // main() reads dims, w, h and cstep when computing source offsets, | |||
| // and outw, outh, outc and outcstep for the bounds check and destination index. | |||
| // c and outdims are not read by main() in this shader. | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation assembles one 8-lane element of top_blob_data from eight scalars of bottom_blob_data. | |||
| int ox = int(gl_GlobalInvocationID.x); | |||
| int oy = int(gl_GlobalInvocationID.y); | |||
| int oz = int(gl_GlobalInvocationID.z); | |||
| // Discard invocations that fall outside the output extent. | |||
| if (!(ox < p.outw && oy < p.outh && oz < p.outc)) | |||
| return; | |||
| // lo4 / hi4: flat indices, expressed in the output shape, of lanes 0-3 and lanes 4-7. | |||
| ivec4 lo4; | |||
| ivec4 hi4; | |||
| // dst: index of the element written to top_blob_data. | |||
| int dst; | |||
| if (ndim == 1) | |||
| { | |||
| lo4 = ivec4(0, 1, 2, 3) + ox * 8; | |||
| hi4 = lo4 + 4; | |||
| dst = ox; | |||
| } | |||
| else if (ndim == 2) | |||
| { | |||
| lo4 = ivec4(0, 1, 2, 3) * p.outw + ((oy * 8) * p.outw + ox); | |||
| hi4 = lo4 + 4 * p.outw; | |||
| dst = oy * p.outw + ox; | |||
| } | |||
| else if (ndim == 3) | |||
| { | |||
| int out_plane = p.outh * p.outw; | |||
| lo4 = ivec4(0, 1, 2, 3) * out_plane + ((oz * 8) * out_plane + oy * p.outw + ox); | |||
| hi4 = lo4 + 4 * out_plane; | |||
| dst = oz * p.outcstep + oy * p.outw + ox; | |||
| } | |||
| #if NCNN_fp16_packed | |||
| // Packed path: the source is an array of sfpvec2, so keep the pair index and the lane inside the pair. | |||
| ivec4 lo_pair; | |||
| ivec4 hi_pair; | |||
| ivec4 lo_lane; | |||
| ivec4 hi_lane; | |||
| if (p.dims == 1) | |||
| { | |||
| lo_pair = lo4 / 2; | |||
| hi_pair = hi4 / 2; | |||
| lo_lane = lo4 % 2; | |||
| hi_lane = hi4 % 2; | |||
| } | |||
| else if (p.dims == 2) | |||
| { | |||
| // Split the flat index into source row and column; source rows are interleaved eight at a time. | |||
| ivec4 lo_row = lo4 / p.w; | |||
| ivec4 lo_col = lo4 % p.w; | |||
| ivec4 hi_row = hi4 / p.w; | |||
| ivec4 hi_col = hi4 % p.w; | |||
| lo_pair = ((lo_row / 8) * p.w + lo_col) * 4 + (lo_row % 8) / 2; | |||
| hi_pair = ((hi_row / 8) * p.w + hi_col) * 4 + (hi_row % 8) / 2; | |||
| lo_lane = lo_row % 2; | |||
| hi_lane = hi_row % 2; | |||
| } | |||
| else // if (p.dims == 3) | |||
| { | |||
| // Split the flat index into source channel, row and column; source channels are interleaved eight at a time. | |||
| int in_plane = p.w * p.h; | |||
| ivec4 lo_rem = lo4 % in_plane; | |||
| ivec4 hi_rem = hi4 % in_plane; | |||
| ivec4 lo_chn = lo4 / in_plane; | |||
| ivec4 lo_row = lo_rem / p.w; | |||
| ivec4 lo_col = lo_rem % p.w; | |||
| ivec4 hi_chn = hi4 / in_plane; | |||
| ivec4 hi_row = hi_rem / p.w; | |||
| ivec4 hi_col = hi_rem % p.w; | |||
| lo_pair = ((lo_chn / 8) * p.cstep + lo_row * p.w + lo_col) * 4 + (lo_chn % 8) / 2; | |||
| hi_pair = ((hi_chn / 8) * p.cstep + hi_row * p.w + hi_col) * 4 + (hi_chn % 8) / 2; | |||
| lo_lane = lo_chn % 2; | |||
| hi_lane = hi_chn % 2; | |||
| } | |||
| // Load the eight pairs, then pick the wanted lane out of each one. | |||
| afpvec2 pair0 = buffer_ld2(bottom_blob_data, lo_pair.x); | |||
| afpvec2 pair1 = buffer_ld2(bottom_blob_data, lo_pair.y); | |||
| afpvec2 pair2 = buffer_ld2(bottom_blob_data, lo_pair.z); | |||
| afpvec2 pair3 = buffer_ld2(bottom_blob_data, lo_pair.w); | |||
| afpvec2 pair4 = buffer_ld2(bottom_blob_data, hi_pair.x); | |||
| afpvec2 pair5 = buffer_ld2(bottom_blob_data, hi_pair.y); | |||
| afpvec2 pair6 = buffer_ld2(bottom_blob_data, hi_pair.z); | |||
| afpvec2 pair7 = buffer_ld2(bottom_blob_data, hi_pair.w); | |||
| afpvec8 gathered = afpvec8(pair0[lo_lane.x], pair1[lo_lane.y], pair2[lo_lane.z], pair3[lo_lane.w], pair4[hi_lane.x], pair5[hi_lane.y], pair6[hi_lane.z], pair7[hi_lane.w]); | |||
| buffer_st8(top_blob_data, dst, gathered); | |||
| #else | |||
| // Scalar path: compute eight source indices and copy them in a single call. | |||
| ivec4 lo_src; | |||
| ivec4 hi_src; | |||
| if (p.dims == 1) | |||
| { | |||
| lo_src = lo4; | |||
| hi_src = hi4; | |||
| } | |||
| else if (p.dims == 2) | |||
| { | |||
| ivec4 lo_row = lo4 / p.w; | |||
| ivec4 lo_col = lo4 % p.w; | |||
| ivec4 hi_row = hi4 / p.w; | |||
| ivec4 hi_col = hi4 % p.w; | |||
| lo_src = ((lo_row / 8) * p.w + lo_col) * 8 + lo_row % 8; | |||
| hi_src = ((hi_row / 8) * p.w + hi_col) * 8 + hi_row % 8; | |||
| } | |||
| else // if (p.dims == 3) | |||
| { | |||
| int in_plane = p.w * p.h; | |||
| ivec4 lo_rem = lo4 % in_plane; | |||
| ivec4 hi_rem = hi4 % in_plane; | |||
| ivec4 lo_chn = lo4 / in_plane; | |||
| ivec4 lo_row = lo_rem / p.w; | |||
| ivec4 lo_col = lo_rem % p.w; | |||
| ivec4 hi_chn = hi4 / in_plane; | |||
| ivec4 hi_row = hi_rem / p.w; | |||
| ivec4 hi_col = hi_rem % p.w; | |||
| lo_src = ((lo_chn / 8) * p.cstep + lo_row * p.w + lo_col) * 8 + lo_chn % 8; | |||
| hi_src = ((hi_chn / 8) * p.cstep + hi_row * p.w + hi_col) * 8 + hi_chn % 8; | |||
| } | |||
| buffer_cp1to8(top_blob_data, dst, bottom_blob_data, lo_src, hi_src); | |||
| #endif | |||
| } | |||
| @@ -0,0 +1,105 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| // With fp16 storage enabled, the 8-wide storage type is declared as two f16vec4 halves. | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| // Specialization constant 0 (default 0); main() compares it against 1, 2 and 3 to pick the destination offset math. | |||
| layout (constant_id = 0) const int ndim = 0; | |||
| // Workgroup dimensions are supplied through specialization constants 233, 234 and 235. | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| // Binding 0: read-only source buffer of sfpvec8 elements. | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; | |||
| // Binding 1: write-only destination buffer of sfp elements. | |||
| layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; | |||
| // Push constants, accessed as p.<field>. | |||
| // main() reads dims, w, h, c and cstep for the bounds check and the source index, | |||
| // and outw, outh and outcstep when computing destination offsets. | |||
| // outdims and outc are not read by main() in this shader. | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation copies the eight lanes of one bottom_blob_data element to eight scalars of top_blob_data. | |||
| int ix = int(gl_GlobalInvocationID.x); | |||
| int iy = int(gl_GlobalInvocationID.y); | |||
| int iz = int(gl_GlobalInvocationID.z); | |||
| // Discard invocations that fall outside the input extent. | |||
| if (!(ix < p.w && iy < p.h && iz < p.c)) | |||
| return; | |||
| // The coordinate on axis (p.dims - 1) is scaled by 8 to get the flat index of lane 0. | |||
| int axis = p.dims - 1; | |||
| ivec3 coord = ivec3(ix, iy, iz); | |||
| coord[axis] = coord[axis] * 8; | |||
| int base = coord.z * p.h * p.w + coord.y * p.w + coord.x; | |||
| // Distance, in flat elements, between consecutive lanes along that axis. | |||
| ivec3 strides = ivec3(1, p.w, p.h * p.w); | |||
| int lane_step = strides[axis]; | |||
| // lo4 / hi4: flat indices, in the input shape, of lanes 0-3 and lanes 4-7. | |||
| ivec4 lo4 = ivec4(0, 1, 2, 3) * lane_step + base; | |||
| ivec4 hi4 = lo4 + 4 * lane_step; | |||
| // Translate the flat indices into offsets inside top_blob_data according to the output rank. | |||
| ivec4 lo_dst; | |||
| ivec4 hi_dst; | |||
| if (ndim == 1) | |||
| { | |||
| lo_dst = lo4; | |||
| hi_dst = hi4; | |||
| } | |||
| else if (ndim == 2) | |||
| { | |||
| ivec4 lo_row = lo4 / p.outw; | |||
| ivec4 lo_col = lo4 % p.outw; | |||
| ivec4 hi_row = hi4 / p.outw; | |||
| ivec4 hi_col = hi4 % p.outw; | |||
| lo_dst = lo_row * p.outw + lo_col; | |||
| hi_dst = hi_row * p.outw + hi_col; | |||
| } | |||
| else if (ndim == 3) | |||
| { | |||
| int out_plane = p.outw * p.outh; | |||
| ivec4 lo_rem = lo4 % out_plane; | |||
| ivec4 hi_rem = hi4 % out_plane; | |||
| ivec4 lo_chn = lo4 / out_plane; | |||
| ivec4 lo_row = lo_rem / p.outw; | |||
| ivec4 lo_col = lo_rem % p.outw; | |||
| ivec4 hi_chn = hi4 / out_plane; | |||
| ivec4 hi_row = hi_rem / p.outw; | |||
| ivec4 hi_col = hi_rem % p.outw; | |||
| lo_dst = lo_chn * p.outcstep + lo_row * p.outw + lo_col; | |||
| hi_dst = hi_chn * p.outcstep + hi_row * p.outw + hi_col; | |||
| } | |||
| // Index of the 8-lane source element handled by this invocation. | |||
| int src = iz * p.cstep + iy * p.w + ix; | |||
| buffer_cp8to1(top_blob_data, lo_dst, hi_dst, bottom_blob_data, src); | |||
| } | |||
| @@ -0,0 +1,144 @@ | |||
| // Tencent is pleased to support the open source community by making ncnn available. | |||
| // | |||
| // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. | |||
| // | |||
| // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except | |||
| // in compliance with the License. You may obtain a copy of the License at | |||
| // | |||
| // https://opensource.org/licenses/BSD-3-Clause | |||
| // | |||
| // Unless required by applicable law or agreed to in writing, software distributed | |||
| // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR | |||
| // CONDITIONS OF ANY KIND, either express or implied. See the License for the | |||
| // specific language governing permissions and limitations under the License. | |||
| #version 450 | |||
| // With fp16 storage enabled, the 8-wide storage type is declared as two f16vec4 halves. | |||
| #if NCNN_fp16_storage | |||
| #extension GL_EXT_shader_16bit_storage: require | |||
| struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; | |||
| #endif | |||
| #if NCNN_fp16_arithmetic | |||
| #extension GL_EXT_shader_explicit_arithmetic_types_float16: require | |||
| #endif | |||
| // Specialization constant 0 (default 0); main() compares it against 1, 2 and 3 to pick the output index math. | |||
| layout (constant_id = 0) const int ndim = 0; | |||
| // Workgroup dimensions are supplied through specialization constants 233, 234 and 235. | |||
| layout (local_size_x_id = 233) in; | |||
| layout (local_size_y_id = 234) in; | |||
| layout (local_size_z_id = 235) in; | |||
| // Binding 0: read-only source buffer; its element type is sfpvec2 when NCNN_fp16_packed is set, sfp otherwise. | |||
| #if NCNN_fp16_packed | |||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; | |||
| #else | |||
| layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; | |||
| #endif | |||
| // Binding 1: write-only destination buffer of sfpvec4 elements. | |||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | |||
| // Push constants, accessed as p.<field>. | |||
| // main() reads dims, w, h and cstep when computing source offsets, | |||
| // and outw, outh, outc and outcstep for the bounds check and destination index. | |||
| // c and outdims are not read by main() in this shader. | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int dims; | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outdims; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() | |||
| { | |||
| // Each invocation assembles one 4-lane element of top_blob_data from four scalars of bottom_blob_data. | |||
| int ox = int(gl_GlobalInvocationID.x); | |||
| int oy = int(gl_GlobalInvocationID.y); | |||
| int oz = int(gl_GlobalInvocationID.z); | |||
| // Discard invocations that fall outside the output extent. | |||
| if (!(ox < p.outw && oy < p.outh && oz < p.outc)) | |||
| return; | |||
| // idx4: flat indices, expressed in the output shape, of the four lanes. | |||
| ivec4 idx4; | |||
| // dst: index of the element written to top_blob_data. | |||
| int dst; | |||
| if (ndim == 1) | |||
| { | |||
| idx4 = ivec4(0, 1, 2, 3) + ox * 4; | |||
| dst = ox; | |||
| } | |||
| else if (ndim == 2) | |||
| { | |||
| idx4 = ivec4(0, 1, 2, 3) * p.outw + ((oy * 4) * p.outw + ox); | |||
| dst = oy * p.outw + ox; | |||
| } | |||
| else if (ndim == 3) | |||
| { | |||
| int out_plane = p.outh * p.outw; | |||
| idx4 = ivec4(0, 1, 2, 3) * out_plane + ((oz * 4) * out_plane + oy * p.outw + ox); | |||
| dst = oz * p.outcstep + oy * p.outw + ox; | |||
| } | |||
| #if NCNN_fp16_packed | |||
| // Packed path: the source is an array of sfpvec2, so keep the pair index and the lane inside the pair. | |||
| ivec4 pair_idx; | |||
| ivec4 lane; | |||
| if (p.dims == 1) | |||
| { | |||
| pair_idx = idx4 / 2; | |||
| lane = idx4 % 2; | |||
| } | |||
| else if (p.dims == 2) | |||
| { | |||
| // Split the flat index into source row and column; source rows are interleaved eight at a time. | |||
| ivec4 src_row = idx4 / p.w; | |||
| ivec4 src_col = idx4 % p.w; | |||
| pair_idx = ((src_row / 8) * p.w + src_col) * 4 + (src_row % 8) / 2; | |||
| lane = src_row % 2; | |||
| } | |||
| else // if (p.dims == 3) | |||
| { | |||
| // Split the flat index into source channel, row and column; source channels are interleaved eight at a time. | |||
| int in_plane = p.w * p.h; | |||
| ivec4 rem = idx4 % in_plane; | |||
| ivec4 src_chn = idx4 / in_plane; | |||
| ivec4 src_row = rem / p.w; | |||
| ivec4 src_col = rem % p.w; | |||
| pair_idx = ((src_chn / 8) * p.cstep + src_row * p.w + src_col) * 4 + (src_chn % 8) / 2; | |||
| lane = src_chn % 2; | |||
| } | |||
| // Load the four pairs, then pick the wanted lane out of each one. | |||
| afpvec2 pair0 = buffer_ld2(bottom_blob_data, pair_idx.x); | |||
| afpvec2 pair1 = buffer_ld2(bottom_blob_data, pair_idx.y); | |||
| afpvec2 pair2 = buffer_ld2(bottom_blob_data, pair_idx.z); | |||
| afpvec2 pair3 = buffer_ld2(bottom_blob_data, pair_idx.w); | |||
| afpvec4 gathered = afpvec4(pair0[lane.x], pair1[lane.y], pair2[lane.z], pair3[lane.w]); | |||
| buffer_st4(top_blob_data, dst, gathered); | |||
| #else | |||
| // Scalar path: compute four source indices and copy them in a single call. | |||
| ivec4 src_idx; | |||
| if (p.dims == 1) | |||
| { | |||
| src_idx = idx4; | |||
| } | |||
| else if (p.dims == 2) | |||
| { | |||
| ivec4 src_row = idx4 / p.w; | |||
| ivec4 src_col = idx4 % p.w; | |||
| src_idx = ((src_row / 8) * p.w + src_col) * 8 + src_row % 8; | |||
| } | |||
| else // if (p.dims == 3) | |||
| { | |||
| int in_plane = p.w * p.h; | |||
| ivec4 rem = idx4 % in_plane; | |||
| ivec4 src_chn = idx4 / in_plane; | |||
| ivec4 src_row = rem / p.w; | |||
| ivec4 src_col = rem % p.w; | |||
| src_idx = ((src_chn / 8) * p.cstep + src_row * p.w + src_col) * 8 + src_chn % 8; | |||
| } | |||
| buffer_cp1to4(top_blob_data, dst, bottom_blob_data, src_idx); | |||
| #endif | |||
| } | |||