Browse Source

reduce reshape pack4 bandwidth, add some pack8 shaders

tags/20200226
nihui 6 years ago
parent
commit
4c381bee47
13 changed files with 1360 additions and 26 deletions
  1. +115
    -0
      src/layer/vulkan/shader/interp_bicubic_pack8.comp
  2. +120
    -0
      src/layer/vulkan/shader/interp_pack8.comp
  3. +98
    -0
      src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
  4. +91
    -0
      src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
  5. +85
    -0
      src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
  6. +82
    -0
      src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
  7. +92
    -0
      src/layer/vulkan/shader/reshape_pack1to8.comp
  8. +50
    -20
      src/layer/vulkan/shader/reshape_pack4.comp
  9. +10
    -6
      src/layer/vulkan/shader/reshape_pack4to1.comp
  10. +184
    -0
      src/layer/vulkan/shader/reshape_pack4to8.comp
  11. +184
    -0
      src/layer/vulkan/shader/reshape_pack8.comp
  12. +105
    -0
      src/layer/vulkan/shader/reshape_pack8to1.comp
  13. +144
    -0
      src/layer/vulkan/shader/reshape_pack8to4.comp

+ 115
- 0
src/layer/vulkan/shader/interp_bicubic_pack8.comp View File

@@ -0,0 +1,115 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// bottom_blob is the read-only source, top_blob the write-only destination.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
// main() indexes alpha/xofs by output x and beta/yofs by output y: each entry
// holds the four interpolation weights / the source coordinate for that
// output column or row.
layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; };
layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; };
layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; };
layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by destination
// shape (out*). main() uses cstep / outcstep as the element stride between
// consecutive z slices when computing buffer offsets.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Bicubic interpolation for 8-packed elements. Each invocation produces the
// output element at (x, y, z) = gl_GlobalInvocationID: a 4x4 neighbourhood of
// the source is reduced along x with the alpha weights, then the four row
// results are reduced along y with the beta weights.
void main()
{
    const int ox = int(gl_GlobalInvocationID.x);
    const int oy = int(gl_GlobalInvocationID.y);
    const int oz = int(gl_GlobalInvocationID.z);

    // invocations outside the output extent have nothing to write
    if (ox >= p.outw || oy >= p.outh || oz >= p.outc)
        return;

    // precomputed source coordinate and weights for this output column / row
    const int src_x = xofs_blob_data[ox];
    const int src_y = yofs_blob_data[oy];

    afpvec4 wx = buffer_ld4(alpha_blob_data, ox);
    afpvec4 wy = buffer_ld4(beta_blob_data, oy);

    // offsets of column src_x in source rows src_y - 1 .. src_y + 2
    const int row0 = oz * p.cstep + (src_y - 1) * p.w + src_x;
    const int row1 = oz * p.cstep + (src_y + 0) * p.w + src_x;
    const int row2 = oz * p.cstep + (src_y + 1) * p.w + src_x;
    const int row3 = oz * p.cstep + (src_y + 2) * p.w + src_x;

    // horizontal pass over row src_y - 1
    afpvec8 s00 = buffer_ld8(bottom_blob_data, row0 - 1);
    afpvec8 s01 = buffer_ld8(bottom_blob_data, row0 + 0);
    afpvec8 s02 = buffer_ld8(bottom_blob_data, row0 + 1);
    afpvec8 s03 = buffer_ld8(bottom_blob_data, row0 + 2);

    afpvec8 h0;
    h0[0] = s00[0] * wx.x + s01[0] * wx.y + s02[0] * wx.z + s03[0] * wx.w;
    h0[1] = s00[1] * wx.x + s01[1] * wx.y + s02[1] * wx.z + s03[1] * wx.w;

    // horizontal pass over row src_y
    afpvec8 s10 = buffer_ld8(bottom_blob_data, row1 - 1);
    afpvec8 s11 = buffer_ld8(bottom_blob_data, row1 + 0);
    afpvec8 s12 = buffer_ld8(bottom_blob_data, row1 + 1);
    afpvec8 s13 = buffer_ld8(bottom_blob_data, row1 + 2);

    afpvec8 h1;
    h1[0] = s10[0] * wx.x + s11[0] * wx.y + s12[0] * wx.z + s13[0] * wx.w;
    h1[1] = s10[1] * wx.x + s11[1] * wx.y + s12[1] * wx.z + s13[1] * wx.w;

    // horizontal pass over row src_y + 1
    afpvec8 s20 = buffer_ld8(bottom_blob_data, row2 - 1);
    afpvec8 s21 = buffer_ld8(bottom_blob_data, row2 + 0);
    afpvec8 s22 = buffer_ld8(bottom_blob_data, row2 + 1);
    afpvec8 s23 = buffer_ld8(bottom_blob_data, row2 + 2);

    afpvec8 h2;
    h2[0] = s20[0] * wx.x + s21[0] * wx.y + s22[0] * wx.z + s23[0] * wx.w;
    h2[1] = s20[1] * wx.x + s21[1] * wx.y + s22[1] * wx.z + s23[1] * wx.w;

    // horizontal pass over row src_y + 2
    afpvec8 s30 = buffer_ld8(bottom_blob_data, row3 - 1);
    afpvec8 s31 = buffer_ld8(bottom_blob_data, row3 + 0);
    afpvec8 s32 = buffer_ld8(bottom_blob_data, row3 + 1);
    afpvec8 s33 = buffer_ld8(bottom_blob_data, row3 + 2);

    afpvec8 h3;
    h3[0] = s30[0] * wx.x + s31[0] * wx.y + s32[0] * wx.z + s33[0] * wx.w;
    h3[1] = s30[1] * wx.x + s31[1] * wx.y + s32[1] * wx.z + s33[1] * wx.w;

    // vertical pass over the four row results
    afpvec8 result;
    result[0] = h0[0] * wy.x + h1[0] * wy.y + h2[0] * wy.z + h3[0] * wy.w;
    result[1] = h0[1] * wy.x + h1[1] * wy.y + h2[1] * wy.z + h3[1] * wy.w;

    const int dst_index = oz * p.outcstep + oy * p.outw + ox;

    buffer_st8(top_blob_data, dst_index, result);
}

+ 120
- 0
src/layer/vulkan/shader/interp_pack8.comp View File

@@ -0,0 +1,120 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Interpolation mode; main() handles 1 (nearest) and 2 (bilinear).
layout (constant_id = 0) const int resize_type = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// bottom_blob is the read-only source, top_blob the write-only destination.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

// Push constants: source shape (dims, w, h, c, cstep), destination shape
// (out*), and the factors main() multiplies output coordinates by to obtain
// source coordinates (scale_x, scale_y).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;

float scale_x;
float scale_y;
} p;

// Resizes 8-packed elements with nearest (resize_type == 1) or bilinear
// (resize_type == 2) interpolation. Each invocation produces the output
// element at (x, y, z) = gl_GlobalInvocationID. Any other resize_type value
// writes nothing.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the output extent have nothing to write
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    if (resize_type == 1) // nearest
    {
        // scale the output coordinate, round down, and clamp to the last source texel
        afpvec2 gxy = afpvec2(gx, gy);
        ivec2 sxy_max = ivec2(p.w - 1, p.h - 1);
        ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset = gz * p.cstep + sy * p.w + sx;

        buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset);
    }
    else if (resize_type == 2) // bilinear
    {
        // half-pixel-centred source coordinate
        afpvec2 gxy = afpvec2(gx, gy);
        afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f);

        ivec2 sxy = ivec2(floor(fxy));

        // keep only the fractional part as the blend factor
        fxy -= afpvec2(sxy);

        // the 2x2 footprint starts at sxy, so its origin may be at most (w - 2, h - 2)
        ivec2 sxy_max = ivec2(p.w - 2, p.h - 2);

        bvec2 underflow = lessThan(sxy, ivec2(0));
        bvec2 overflow = greaterThan(sxy, sxy_max);

        sxy = clamp(sxy, ivec2(0), sxy_max);

        // when the origin was clamped, pin the blend factor to the matching edge
        fxy = mix(fxy, afpvec2(0.f), underflow);
        fxy = mix(fxy, afpvec2(1.f), overflow);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset_0 = gz * p.cstep + sy * p.w + sx;
        int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;

        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0);
        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1);
        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);

        afp fx = fxy.r;
        afp fy = fxy.g;

        // horizontal blend: both 4-lane halves of both rows must be computed,
        // because the vertical blend below reads a[0], a[1], b[0] and b[1]
        afpvec8 a;
        afpvec8 b;
        a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx;
        a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx;
        b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx;
        b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx;

        // vertical blend
        afpvec8 res;
        res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy;
        res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy;

        buffer_st8(top_blob_data, gi, res);
    }
}

+ 98
- 0
src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp View File

@@ -0,0 +1,98 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants. main() computes
//   scale = pow(bias_constant + alpha / local_size * sum, -beta)
// where sum runs over local_size workspace entries; region_type is not read
// by main() (the shader supports region_type == 0 only).
layout (constant_id = 0) const int region_type = 0;
layout (constant_id = 1) const int local_size = 0;
layout (constant_id = 2) const float alpha = 0;
layout (constant_id = 3) const float beta = 0;
layout (constant_id = 4) const float bias_constant = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// square_workspace holds one float per entry and is read only;
// bottom_top_blob is scaled in place.
layout (binding = 0) readonly buffer square_workspace { float square_workspace_data[]; };
layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };

// Push constants. main() addresses the workspace with (w, cstep) and the
// in-place blob with (outw, outcstep); the bounds check uses outw/outh/outc.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Across-channel LRN normalisation for 8-packed elements. For every lane of
// the element at (x, y, z) = gl_GlobalInvocationID, sums local_size
// consecutive workspace channels starting at that lane's own channel, turns
// the sum into pow(bias_constant + alpha / local_size * sum, -beta), and
// scales the element in place.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the blob extent have nothing to do
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));

    // workspace channel indices for lanes 0..3 (z4) and lanes 4..7 (zz4)
    ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3);
    ivec4 zz4 = z4 + 4;
    ivec4 v_offset = z4 * p.cstep + gy * p.w + gx;
    ivec4 vv_offset = zz4 * p.cstep + gy * p.w + gx;

    for (int z = 0; z < local_size; z++)
    {
        sum[0].r += afp(square_workspace_data[v_offset.r]);
        sum[0].g += afp(square_workspace_data[v_offset.g]);
        sum[0].b += afp(square_workspace_data[v_offset.b]);
        sum[0].a += afp(square_workspace_data[v_offset.a]);
        sum[1].r += afp(square_workspace_data[vv_offset.r]);
        sum[1].g += afp(square_workspace_data[vv_offset.g]);
        sum[1].b += afp(square_workspace_data[vv_offset.b]);
        sum[1].a += afp(square_workspace_data[vv_offset.a]);

        // step BOTH halves to the next workspace channel; without advancing
        // vv_offset, lanes 4..7 would add the same value local_size times
        v_offset += p.cstep;
        vv_offset += p.cstep;
    }

    const afp alpha_div_size = afp(alpha / local_size);
    afpvec8 scale;
    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));

    int gi = gz * p.outcstep + gy * p.outw + gx;

    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);

    v[0] *= scale[0];
    v[1] *= scale[1];

    buffer_st8(bottom_top_blob_data, gi, v);
}

+ 91
- 0
src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp View File

@@ -0,0 +1,91 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants. main() computes
//   scale = pow(bias_constant + alpha / (local_size * local_size) * sum, -beta)
// where sum runs over a local_size x local_size window; region_type is not
// read by main() (the shader supports region_type == 1 only).
layout (constant_id = 0) const int region_type = 0;
layout (constant_id = 1) const int local_size = 0;
layout (constant_id = 2) const float alpha = 0;
layout (constant_id = 3) const float beta = 0;
layout (constant_id = 4) const float bias_constant = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// square_workspace holds one mat2x4 (eight floats) per entry and is read
// only; bottom_top_blob is scaled in place.
layout (binding = 0) readonly buffer square_workspace { mat2x4 square_workspace_data[]; };
layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };

// Push constants. main() addresses the workspace with (w, cstep) and the
// in-place blob with (outw, outcstep); the bounds check uses outw/outh/outc.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Within-channel LRN normalisation for 8-packed elements. Sums a
// local_size x local_size window of the workspace whose top-left entry is
// (x, y) = gl_GlobalInvocationID.xy in slice z, converts the sum into
// pow(bias_constant + alpha / local_size^2 * sum, -beta), and scales the
// element at (x, y, z) in place.
void main()
{
    const int px = int(gl_GlobalInvocationID.x);
    const int py = int(gl_GlobalInvocationID.y);
    const int pz = int(gl_GlobalInvocationID.z);

    // invocations outside the blob extent have nothing to do
    if (px >= p.outw || py >= p.outh || pz >= p.outc)
        return;

    // support region_type == 1 only

    // workspace offset of the window's first entry
    const int window_origin = pz * p.cstep + py * p.w + px;

    afpvec8 acc = afpvec8(afpvec4(0.f), afpvec4(0.f));

    for (int wy = 0; wy < local_size; wy++)
    {
        // each window row starts p.w entries after the previous one
        const int row_start = window_origin + wy * p.w;

        for (int wx = 0; wx < local_size; wx++)
        {
            acc += afpvec8(square_workspace_data[row_start + wx]);
        }
    }

    const afp alpha_div_size = afp(alpha / (local_size * local_size));

    afpvec8 factor;
    factor[0] = pow(afp(bias_constant) + alpha_div_size * acc[0], afpvec4(-beta));
    factor[1] = pow(afp(bias_constant) + alpha_div_size * acc[1], afpvec4(-beta));

    const int elem_index = pz * p.outcstep + py * p.outw + px;

    afpvec8 elem = buffer_ld8(bottom_top_blob_data, elem_index);

    elem[0] = elem[0] * factor[0];
    elem[1] = elem[1] * factor[1];

    buffer_st8(bottom_top_blob_data, elem_index, elem);
}

+ 85
- 0
src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp View File

@@ -0,0 +1,85 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants. main() subtracts pad_head from the workspace
// channel index to find the source channel; region_type and pad_tail are not
// read by main().
layout (constant_id = 0) const int region_type = 0;
layout (constant_id = 1) const int pad_head = 0;
layout (constant_id = 2) const int pad_tail = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// bottom_blob is the read-only 8-packed source; square_workspace receives
// one float per invocation.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer square_workspace { float square_workspace_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by workspace
// shape (out*).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Fills the channel-padded, unpacked square workspace used by across-channel
// LRN. Workspace channel gz corresponds to unpacked source channel
// gz - pad_head; entries that fall outside the source (the padding) are
// written as zero, all others as the square of the source value.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // invocations outside the workspace extent have nothing to write
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afp res;

    // Unpacked source channel. The range check must happen BEFORE dividing by
    // the pack size: integer division truncates toward zero, so values in
    // -7..-1 would otherwise map to packed channel 0 and a negative lane.
    int cz = gz - pad_head;

    // p.c counts 8-packed channels, i.e. p.c * 8 unpacked ones
    if (cz >= 0 && cz < p.c * 8)
    {
        int v_offset = (cz / 8) * p.cstep + gy * p.w + gx;
        afpvec8 v8 = buffer_ld8(bottom_blob_data, v_offset);

        int lane = cz % 8;

        // lanes 0..3 live in column 0 of the pair, lanes 4..7 in column 1
        afp v = v8[lane / 4][lane % 4];

        res = v * v;
    }
    else
    {
        res = afp(0.f);
    }

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    square_workspace_data[gi] = float(res);
}

+ 82
- 0
src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp View File

@@ -0,0 +1,82 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants. main() subtracts pad_head from the workspace x
// and y coordinates to find the source texel; region_type and pad_tail are
// not read by main().
layout (constant_id = 0) const int region_type = 0;
layout (constant_id = 1) const int pad_head = 0;
layout (constant_id = 2) const int pad_tail = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// bottom_blob is the read-only 8-packed source; square_workspace receives
// one mat2x4 (eight floats) per invocation.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer square_workspace { mat2x4 square_workspace_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by workspace
// shape (out*).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Fills the spatially padded square workspace used by within-channel LRN.
// Workspace position (x, y) corresponds to source position
// (x - pad_head, y - pad_head); positions outside the source are written as
// zero, all others as the lane-wise square of the source element.
void main()
{
    const ivec3 g = ivec3(gl_GlobalInvocationID);

    // invocations outside the workspace extent have nothing to write
    if (g.x >= p.outw || g.y >= p.outh || g.z >= p.outc)
        return;

    // support region_type == 1 only

    const int src_x = g.x - pad_head;
    const int src_y = g.y - pad_head;

    const bool inside = src_x >= 0 && src_x < p.w && src_y >= 0 && src_y < p.h;

    afpvec8 squared;

    if (!inside)
    {
        // padding region
        squared = afpvec8(afpvec4(0.f), afpvec4(0.f));
    }
    else
    {
        const int src_index = g.z * p.cstep + src_y * p.w + src_x;
        afpvec8 elem = buffer_ld8(bottom_blob_data, src_index);

        squared[0] = elem[0] * elem[0];
        squared[1] = elem[1] * elem[1];
    }

    const int dst_index = g.z * p.outcstep + g.y * p.outw + g.x;

    square_workspace_data[dst_index] = mat2x4(squared);
}

+ 92
- 0
src/layer/vulkan/shader/reshape_pack1to8.comp View File

@@ -0,0 +1,92 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Dimensionality of the output; main() selects the linear-index formula
// for 1, 2 or 3.
layout (constant_id = 0) const int ndim = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// bottom_blob: read-only scalar source; top_blob: write-only 8-packed destination.
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by destination
// shape (out*).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Reshape from an unpacked source to an 8-packed destination. Each invocation
// writes the output element at (x, y, z) = gl_GlobalInvocationID: it derives
// the flat logical index of each of the eight lanes, maps that index to a
// source (x, y, z) position, and gathers the eight scalars.
void main()
{
    const int ox = int(gl_GlobalInvocationID.x);
    const int oy = int(gl_GlobalInvocationID.y);
    const int oz = int(gl_GlobalInvocationID.z);

    // invocations outside the output extent have nothing to write
    if (ox >= p.outw || oy >= p.outh || oz >= p.outc)
        return;

    // flat logical indices of lanes 0..3 (lo) and lanes 4..7 (hi); consecutive
    // lanes are one step apart along the packed (outermost) output axis
    ivec4 flat_lo;
    ivec4 flat_hi;

    if (ndim == 1)
    {
        flat_lo = ox * 8 + ivec4(0, 1, 2, 3);
        flat_hi = flat_lo + 4;
    }
    if (ndim == 2)
    {
        flat_lo = (oy * 8) * p.outw + ox + ivec4(0, 1, 2, 3) * p.outw;
        flat_hi = flat_lo + 4 * p.outw;
    }
    if (ndim == 3)
    {
        flat_lo = (oz * 8) * p.outh * p.outw + oy * p.outw + ox + ivec4(0, 1, 2, 3) * p.outh * p.outw;
        flat_hi = flat_lo + 4 * p.outh * p.outw;
    }

    // split each flat index into source slice / row / column
    const int plane = p.w * p.h;

    ivec4 in_plane_lo = flat_lo % plane;
    ivec4 in_plane_hi = flat_hi % plane;

    ivec4 src_z_lo = flat_lo / plane;
    ivec4 src_y_lo = in_plane_lo / p.w;
    ivec4 src_x_lo = in_plane_lo % p.w;

    ivec4 src_z_hi = flat_hi / plane;
    ivec4 src_y_hi = in_plane_hi / p.w;
    ivec4 src_x_hi = in_plane_hi % p.w;

    ivec4 src_lo = src_z_lo * p.cstep + src_y_lo * p.w + src_x_lo;
    ivec4 src_hi = src_z_hi * p.cstep + src_y_hi * p.w + src_x_hi;

    const int dst_index = oz * p.outcstep + oy * p.outw + ox;

    buffer_cp1to8(top_blob_data, dst_index, bottom_blob_data, src_lo, src_hi);
}

+ 50
- 20
src/layer/vulkan/shader/reshape_pack4.comp View File

@@ -27,7 +27,11 @@ layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

layout (push_constant) uniform parameter
@@ -60,21 +64,22 @@ void main()
if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;

#if NCNN_fp16_packed
ivec4 v_offset;
ivec4 lane4;
ivec4 lane2;

if (p.dims == 1)
{
v_offset = i4 / 4;
lane4 = i4 % 4;
v_offset = i4 / 2;
lane2 = i4 % 2;
}
else if (p.dims == 2)
{
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;

v_offset = (y4 / 4) * p.w + x4;
lane4 = y4 % 4;
v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2;
lane2 = y4 % 2;
}
else // if (p.dims == 3)
{
@@ -84,30 +89,55 @@ void main()
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;

v_offset = (z4 / 4) * p.cstep + y4 * p.w + x4;
lane4 = z4 % 4;
v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2;
lane2 = z4 % 2;
}

int gi;
if (ndim == 1)

if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);

buffer_st4(top_blob_data, gi, v);
#else
ivec4 v_offset;

if (p.dims == 1)
{
gi = gx;
v_offset = i4;
}
if (ndim == 2)
else if (p.dims == 2)
{
gi = gy * p.outw + gx;
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;

v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4;
}
if (ndim == 3)
else // if (p.dims == 3)
{
gi = gz * p.outcstep + gy * p.outw + gx;
int size = p.w * p.h;

ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;

v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4;
}

afpvec4 vr = buffer_ld4(bottom_blob_data, v_offset.r);
afpvec4 vg = buffer_ld4(bottom_blob_data, v_offset.g);
afpvec4 vb = buffer_ld4(bottom_blob_data, v_offset.b);
afpvec4 va = buffer_ld4(bottom_blob_data, v_offset.a);
int gi;

afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]);
if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

buffer_st4(top_blob_data, gi, v);
buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
}

+ 10
- 6
src/layer/vulkan/shader/reshape_pack4to1.comp View File

@@ -54,11 +54,15 @@ void main()
if (gx >= p.w || gy >= p.h || gz >= p.c)
return;

ivec4 i4;
ivec3 gxyz = ivec3(gx, gy, gz);

if (p.dims == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3);
if (p.dims == 2) i4 = (gy * 4) * p.w + gx + ivec4(0, 1, 2, 3) * p.w;
if (p.dims == 3) i4 = (gz * 4) * p.h * p.w + gy * p.w + gx + ivec4(0, 1, 2, 3) * p.h * p.w;
gxyz[p.dims - 1] *= 4;

int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x;

ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w);

ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1];

ivec4 v_offset;

@@ -66,14 +70,14 @@ void main()
{
v_offset = i4;
}
else if (ndim == 2)
if (ndim == 2)
{
ivec4 y4 = i4 / p.outw;
ivec4 x4 = i4 % p.outw;

v_offset = y4 * p.outw + x4;
}
else // if (ndim == 3)
if (ndim == 3)
{
int size = p.outw * p.outh;



+ 184
- 0
src/layer/vulkan/shader/reshape_pack4to8.comp View File

@@ -0,0 +1,184 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Dimensionality of the output; main() selects the linear-index formula
// for 1, 2 or 3.
layout (constant_id = 0) const int ndim = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// The source is viewed at a finer granularity than its packing so main() can
// fetch individual lanes: pairs (sfpvec2) when NCNN_fp16_packed is set,
// single scalars (sfp) otherwise.
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by destination
// shape (out*).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Reshape from a 4-packed source to an 8-packed destination. Each invocation
// writes the output element at (x, y, z) = gl_GlobalInvocationID: it derives
// the flat logical index of each of the eight lanes, maps that index to the
// position of the matching lane inside the 4-packed source, and gathers the
// eight values.
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

// invocations outside the output extent have nothing to write
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return;

// flat logical indices of output lanes 0..3 (i4) and lanes 4..7 (ii4);
// consecutive lanes are one step apart along the packed output axis
ivec4 i4;
ivec4 ii4;

if (ndim == 1)
{
i4 = gx * 8 + ivec4(0, 1, 2, 3);
ii4 = i4 + 4;
}
if (ndim == 2)
{
i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
ii4 = i4 + 4 * p.outw;
}
if (ndim == 3)
{
i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
ii4 = i4 + 4 * p.outh * p.outw;
}

#if NCNN_fp16_packed
// Source viewed as pairs: each 4-packed element spans two pairs, so the pair
// index is element * 2 + (lane / 2) and the component within the pair is
// lane % 2. Despite its name, lane4 is the within-pair component (0 or 1)
// for output lanes 4..7, the counterpart of lane2 for lanes 0..3.
ivec4 v_offset;
ivec4 vv_offset;
ivec4 lane2;
ivec4 lane4;

if (p.dims == 1)
{
v_offset = i4 / 2;
lane2 = i4 % 2;
vv_offset = ii4 / 2;
lane4 = ii4 % 2;
}
else if (p.dims == 2)
{
// the packed source axis is y: row y lives in packed row y / 4, lane y % 4
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;
ivec4 yy4 = ii4 / p.w;
ivec4 xx4 = ii4 % p.w;

v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2;
lane2 = y4 % 2;
vv_offset = ((yy4 / 4) * p.w + xx4) * 2 + (yy4 % 4) / 2;
lane4 = yy4 % 2;
}
else // if (p.dims == 3)
{
// the packed source axis is z: slice z lives in packed slice z / 4, lane z % 4
int size = p.w * p.h;

ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;
ivec4 zz4 = ii4 / size;
ivec4 yy4 = ii4 % size / p.w;
ivec4 xx4 = ii4 % size % p.w;

v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2;
lane2 = z4 % 2;
vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 2 + (zz4 % 4) / 2;
lane4 = zz4 % 2;
}

int gi;

if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

// fetch the eight pairs, then keep the selected component of each
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

buffer_st8(top_blob_data, gi, v);
#else
// Source viewed as scalars: the scalar index is element * 4 + lane.
ivec4 v_offset;
ivec4 vv_offset;

if (p.dims == 1)
{
v_offset = i4;
vv_offset = ii4;
}
else if (p.dims == 2)
{
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;
ivec4 yy4 = ii4 / p.w;
ivec4 xx4 = ii4 % p.w;

v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4;
vv_offset = ((yy4 / 4) * p.w + xx4) * 4 + yy4 % 4;
}
else // if (p.dims == 3)
{
int size = p.w * p.h;

ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;
ivec4 zz4 = ii4 / size;
ivec4 yy4 = ii4 % size / p.w;
ivec4 xx4 = ii4 % size % p.w;

v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4;
vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 4 + zz4 % 4;
}

int gi;

if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
}

+ 184
- 0
src/layer/vulkan/shader/reshape_pack8.comp View File

@@ -0,0 +1,184 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, sfpvec8 is declared here as a pair of f16vec4 members.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Dimensionality of the output; main() selects the linear-index formula
// for 1, 2 or 3.
layout (constant_id = 0) const int ndim = 0;

// Workgroup size is supplied through specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// The source is viewed at a finer granularity than its packing so main() can
// fetch individual lanes: pairs (sfpvec2) when NCNN_fp16_packed is set,
// single scalars (sfp) otherwise.
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

// Push constants: source shape (dims, w, h, c, cstep) followed by destination
// shape (out*).
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Reshape from an 8-packed source to an 8-packed destination. Each invocation
// writes the output element at (x, y, z) = gl_GlobalInvocationID: it derives
// the flat logical index of each of the eight lanes, maps that index to the
// position of the matching lane inside the 8-packed source, and gathers the
// eight values.
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

// invocations outside the output extent have nothing to write
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return;

// flat logical indices of output lanes 0..3 (i4) and lanes 4..7 (ii4);
// consecutive lanes are one step apart along the packed output axis
ivec4 i4;
ivec4 ii4;

if (ndim == 1)
{
i4 = gx * 8 + ivec4(0, 1, 2, 3);
ii4 = i4 + 4;
}
if (ndim == 2)
{
i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
ii4 = i4 + 4 * p.outw;
}
if (ndim == 3)
{
i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
ii4 = i4 + 4 * p.outh * p.outw;
}

#if NCNN_fp16_packed
// Source viewed as pairs: each 8-packed element spans four pairs, so the pair
// index is element * 4 + (lane / 2) and the component within the pair is
// lane % 2. Despite its name, lane4 is the within-pair component (0 or 1)
// for output lanes 4..7, the counterpart of lane2 for lanes 0..3.
ivec4 v_offset;
ivec4 vv_offset;
ivec4 lane2;
ivec4 lane4;

if (p.dims == 1)
{
v_offset = i4 / 2;
lane2 = i4 % 2;
vv_offset = ii4 / 2;
lane4 = ii4 % 2;
}
else if (p.dims == 2)
{
// the packed source axis is y: row y lives in packed row y / 8, lane y % 8
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;
ivec4 yy4 = ii4 / p.w;
ivec4 xx4 = ii4 % p.w;

v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2;
lane2 = y4 % 2;
vv_offset = ((yy4 / 8) * p.w + xx4) * 4 + (yy4 % 8) / 2;
lane4 = yy4 % 2;
}
else // if (p.dims == 3)
{
// the packed source axis is z: slice z lives in packed slice z / 8, lane z % 8
int size = p.w * p.h;

ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;
ivec4 zz4 = ii4 / size;
ivec4 yy4 = ii4 % size / p.w;
ivec4 xx4 = ii4 % size % p.w;

v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 4 + (z4 % 8) / 2;
lane2 = z4 % 2;
vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 4 + (zz4 % 8) / 2;
lane4 = zz4 % 2;
}

int gi;

if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

// fetch the eight pairs, then keep the selected component of each
afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);

afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]);

buffer_st8(top_blob_data, gi, v);
#else
// Source viewed as scalars: the scalar index is element * 8 + lane.
ivec4 v_offset;
ivec4 vv_offset;

if (p.dims == 1)
{
v_offset = i4;
vv_offset = ii4;
}
else if (p.dims == 2)
{
ivec4 y4 = i4 / p.w;
ivec4 x4 = i4 % p.w;
ivec4 yy4 = ii4 / p.w;
ivec4 xx4 = ii4 % p.w;

v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8;
vv_offset = ((yy4 / 8) * p.w + xx4) * 8 + yy4 % 8;
}
else // if (p.dims == 3)
{
int size = p.w * p.h;

ivec4 z4 = i4 / size;
ivec4 y4 = i4 % size / p.w;
ivec4 x4 = i4 % size % p.w;
ivec4 zz4 = ii4 / size;
ivec4 yy4 = ii4 % size / p.w;
ivec4 xx4 = ii4 % size % p.w;

v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 8 + z4 % 8;
vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 8 + zz4 % 8;
}

int gi;

if (ndim == 1) gi = gx;
if (ndim == 2) gi = gy * p.outw + gx;
if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset);
#endif
}

+ 105
- 0
src/layer/vulkan/shader/reshape_pack8to1.comp View File

@@ -0,0 +1,105 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, the 8-lane storage type is declared here as two
// half-precision vec4 halves. In other configurations sfpvec8 is not declared
// in this file; presumably it comes from a preamble added by the host code
// (TODO confirm).
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constant 0: main() branches on ndim == 1 / 2 / 3 to choose how
// flat indices are turned into output offsets. The default 0 matches none of
// those branches.
layout (constant_id = 0) const int ndim = 0;

// Workgroup size is taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Binding 0 (read-only input) holds 8-lane elements; binding 1 (write-only
// output) holds scalars.
layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };

// Push constants. main() uses dims / w / h / c / cstep to bound and address
// bottom_blob_data and outw / outh / outcstep to address top_blob_data.
// outdims and outc are not read by main() in this shader.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Reshape an 8-lane-packed input blob into an unpacked (scalar) output blob.
//
// One invocation handles one 8-lane input element at (gx, gy, gz):
//   1. Compute the flat (dense, row-major) element index of each of its 8
//      lanes. The packed input axis is selected by p.dims; consecutive lanes
//      are one stride apart along that axis. i4 holds lanes 0..3, ii4 holds
//      lanes 4..7.
//   2. Turn each flat index into a scalar offset in the output, whose
//      dimensionality is selected by the ndim specialization constant.
//   3. Hand the 8 offsets and the source element index gi to buffer_cp8to1,
//      which is not defined in this shader source (presumably it copies lane k
//      of the source element to the k-th offset - TODO confirm).
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

    // scale the coordinate on the packed axis to get the unpacked position of lane 0
    ivec3 gxyz = ivec3(gx, gy, gz);

    gxyz[p.dims - 1] *= 8;

    int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x;

    // flat-index stride of one step along x, y and z respectively
    ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w);

    ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1];
    ivec4 ii4 = i4 + 4 * gxyz4[p.dims - 1];

    ivec4 v_offset;
    ivec4 vv_offset;

    if (ndim == 1 || ndim == 2)
    {
        // The 2-D case addresses rows as y * p.outw + x with
        // y = i / p.outw and x = i % p.outw, which is the flat index itself,
        // so both cases use the flat index directly.
        v_offset = i4;
        vv_offset = ii4;
    }
    if (ndim == 3)
    {
        int size = p.outw * p.outh;

        // channel z = i / size starts at z * p.outcstep; the position inside
        // the channel, y * p.outw + x, is exactly i % size
        v_offset = (i4 / size) * p.outcstep + i4 % size;
        vv_offset = (ii4 / size) * p.outcstep + ii4 % size;
    }

    int gi = gz * p.cstep + gy * p.w + gx;

    buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi);
}

+ 144
- 0
src/layer/vulkan/shader/reshape_pack8to4.comp View File

@@ -0,0 +1,144 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// With fp16 storage enabled, an 8-lane storage type is declared here as two
// half-precision vec4 halves. No buffer in this shader is declared with it.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constant 0: main() branches on ndim == 1 / 2 / 3 to choose how
// output coordinates are turned into flat indices. The default 0 matches none
// of those branches.
layout (constant_id = 0) const int ndim = 0;

// Workgroup size is taken from specialization constants 233, 234 and 235.
layout (local_size_x_id = 233) in;
layout (local_size_y_id = 234) in;
layout (local_size_z_id = 235) in;

// Binding 0 (read-only input) is viewed as an array of 2-lane elements in the
// fp16-packed configuration and as an array of scalars otherwise; main() has a
// separate addressing path for each view.
#if NCNN_fp16_packed
layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
#else
layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
#endif
// Binding 1 (write-only output) holds 4-lane elements.
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

// Push constants. main() uses dims / w / h / cstep to address bottom_blob_data
// and outw / outh / outc / outcstep to bound and address top_blob_data.
// c and outdims are not read by main() in this shader.
layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

// Reshape an 8-lane-packed input blob into a 4-lane-packed output blob.
//
// One invocation writes one 4-lane output element at (gx, gy, gz).
//   1. For each of the 4 output lanes, compute the flat (dense, row-major)
//      element index it represents (i4). Lanes step along the output axis
//      selected by the ndim specialization constant (x for 1, y for 2, z for 3).
//   2. Map every flat index back to a storage offset in the input, whose
//      packed axis is selected by p.dims and which is grouped by 8.
//   3. Gather the 4 values and store them at output element gi.
//
// buffer_ld2 / buffer_st4 / buffer_cp1to4 and the afp*/sfp* types are not
// defined in this shader source; they are expected to be provided externally.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    ivec4 i4;

    // Index of the destination element in top_blob_data. It is identical for
    // both storage configurations, so it is computed once here.
    int gi;

    if (ndim == 1)
    {
        i4 = gx * 4 + ivec4(0, 1, 2, 3);
        gi = gx;
    }
    if (ndim == 2)
    {
        i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
        gi = gy * p.outw + gx;
    }
    if (ndim == 3)
    {
        i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
        gi = gz * p.outcstep + gy * p.outw + gx;
    }

#if NCNN_fp16_packed
    // Input is viewed as 2-lane elements: each 8-lane group spans 4 of them.
    // v_offset selects the 2-lane element, lane2 the component (0 or 1) in it.
    ivec4 v_offset;
    ivec4 lane2;

    if (p.dims == 1)
    {
        v_offset = i4 / 2;
        lane2 = i4 % 2;
    }
    else if (p.dims == 2)
    {
        // rows are the packed axis: row y lives in group y / 8, lane y % 8
        ivec4 y4 = i4 / p.w;
        ivec4 x4 = i4 % p.w;

        v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2;
        lane2 = y4 % 2;
    }
    else // if (p.dims == 3)
    {
        int size = p.w * p.h;

        // channels are the packed axis; the in-channel position y * p.w + x
        // is exactly i % size, so no separate y / x split is needed
        ivec4 z4 = i4 / size;
        ivec4 yx4 = i4 % size;

        v_offset = ((z4 / 8) * p.cstep + yx4) * 4 + (z4 % 8) / 2;
        lane2 = z4 % 2;
    }

    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);

    buffer_st4(top_blob_data, gi, v);
#else
    // Input is viewed as scalars: each 8-lane group spans 8 of them.
    ivec4 v_offset;

    if (p.dims == 1)
    {
        v_offset = i4;
    }
    else if (p.dims == 2)
    {
        ivec4 y4 = i4 / p.w;
        ivec4 x4 = i4 % p.w;

        v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8;
    }
    else // if (p.dims == 3)
    {
        int size = p.w * p.h;

        // in-channel position y * p.w + x is exactly i % size
        ivec4 z4 = i4 / size;
        ivec4 yx4 = i4 % size;

        v_offset = ((z4 / 8) * p.cstep + yx4) * 8 + z4 % 8;
    }

    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
#endif
}

Loading…
Cancel
Save