reduce reshape pack4 bandwidth, add some pack8 shaders

6 years ago · 4c381bee47
--- a/src/layer/vulkan/shader/interp_bicubic_pack8.comp
+++ b/src/layer/vulkan/shader/interp_bicubic_pack8.comp
@@ -0,0 +1,115 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: source elements, binding 1: destination elements.
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
 // alpha / xofs are indexed by the output x coordinate in main(): four horizontal
 // weights and the source column for that output column.
 layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; };
 layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; };
 // beta / yofs are indexed by the output y coordinate in main(): four vertical
 // weights and the source row for that output row.
 layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; };
 layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w and cstep to address bottom_blob_data; the other fields of this
    // group are not read by this shader.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address top_blob_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Blends the four source elements at row_offset - 1 .. row_offset + 2 with the four
 // horizontal weights in wx and returns the weighted sum for both 4-lane halves.
 afpvec8 interp_bicubic_row_pack8(int row_offset, afpvec4 wx)
 {
    afpvec8 t0 = buffer_ld8(bottom_blob_data, row_offset - 1);
    afpvec8 t1 = buffer_ld8(bottom_blob_data, row_offset + 0);
    afpvec8 t2 = buffer_ld8(bottom_blob_data, row_offset + 1);
    afpvec8 t3 = buffer_ld8(bottom_blob_data, row_offset + 2);

    afpvec8 acc;
    acc[0] = t0[0] * wx.r + t1[0] * wx.g + t2[0] * wx.b + t3[0] * wx.a;
    acc[1] = t0[1] * wx.r + t1[1] * wx.g + t2[1] * wx.b + t3[1] * wx.a;

    return acc;
 }

 // One invocation produces one pack8 output element from a 4x4 source neighbourhood:
 // each of the four rows is blended horizontally with the alpha weights, then the
 // four row results are blended vertically with the beta weights.
 void main()
 {
    int ox = int(gl_GlobalInvocationID.x);
    int oy = int(gl_GlobalInvocationID.y);
    int oz = int(gl_GlobalInvocationID.z);

    bool inside = ox < p.outw && oy < p.outh && oz < p.outc;
    if (!inside)
        return;

    // source column / row looked up per output column / row
    int src_x = xofs_blob_data[ox];
    int src_y = yofs_blob_data[oy];

    afpvec4 wx = buffer_ld4(alpha_blob_data, ox);

    // rows src_y - 1 .. src_y + 2, each addressed at column src_x
    afpvec8 r0 = interp_bicubic_row_pack8(oz * p.cstep + (src_y - 1) * p.w + src_x, wx);
    afpvec8 r1 = interp_bicubic_row_pack8(oz * p.cstep + (src_y + 0) * p.w + src_x, wx);
    afpvec8 r2 = interp_bicubic_row_pack8(oz * p.cstep + (src_y + 1) * p.w + src_x, wx);
    afpvec8 r3 = interp_bicubic_row_pack8(oz * p.cstep + (src_y + 2) * p.w + src_x, wx);

    afpvec4 wy = buffer_ld4(beta_blob_data, oy);

    afpvec8 blended;
    blended[0] = r0[0] * wy.r + r1[0] * wy.g + r2[0] * wy.b + r3[0] * wy.a;
    blended[1] = r0[1] * wy.r + r1[1] * wy.g + r2[1] * wy.b + r3[1] * wy.a;

    const int dst = oz * p.outcstep + oy * p.outw + ox;

    buffer_st8(top_blob_data, dst, blended);
 }
--- a/src/layer/vulkan/shader/interp_pack8.comp
+++ b/src/layer/vulkan/shader/interp_pack8.comp
@@ -0,0 +1,120 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Interpolation mode selected at pipeline creation; main() handles 1 (nearest)
 // and 2 (bilinear) and does nothing for any other value.
 layout (constant_id = 0) const int resize_type = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: source elements, binding 1: destination elements.
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w / h to clamp source coordinates and w / cstep to address
    // bottom_blob_data.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address top_blob_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;

    // Factors that main() multiplies output coordinates by to get source coordinates.
    float scale_x;
    float scale_y;
 } p;

 // Resizes a pack8 blob; one invocation produces one 8-lane output element.
 //   resize_type == 1: copies the nearest source element.
 //   resize_type == 2: bilinear blend of the 2x2 source neighbourhood.
 // Any other resize_type writes nothing.
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    if (resize_type == 1) // nearest
    {
        // scale the output coordinate down to a source coordinate and clamp it to
        // the last valid column / row
        afpvec2 gxy = afpvec2(gx, gy);
        ivec2 sxy_max = ivec2(p.w - 1, p.h - 1);
        ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset = gz * p.cstep + sy * p.w + sx;

        buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset);
    }
    else if (resize_type == 2) // bilinear
    {
        // pixel-centre aligned source coordinate
        afpvec2 gxy = afpvec2(gx, gy);
        afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f);

        ivec2 sxy = ivec2(floor(fxy));

        // keep only the fractional part as the blend weight
        fxy -= afpvec2(sxy);

        // the 2x2 window starts at sxy, so the top-left corner may be at most w-2 / h-2
        ivec2 sxy_max = ivec2(p.w - 2, p.h - 2);

        bvec2 underflow = lessThan(sxy, ivec2(0));
        bvec2 overflow = greaterThan(sxy, sxy_max);

        sxy = clamp(sxy, ivec2(0), sxy_max);

        // when the window was clamped, pin the weight to the nearest edge sample
        fxy = mix(fxy, afpvec2(0.f), underflow);
        fxy = mix(fxy, afpvec2(1.f), overflow);

        int sx = sxy.r;
        int sy = sxy.g;

        int v_offset_0 = gz * p.cstep + sy * p.w + sx;
        int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;

        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0);
        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1);
        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);

        afp fx = fxy.r;
        afp fy = fxy.g;

        // horizontal blend of the upper row (a) and lower row (b); both 4-lane halves
        // of each row must be written because the vertical blend reads all four
        afpvec8 a;
        afpvec8 b;
        a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx;
        a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx;
        b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx;
        b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx;

        // vertical blend of the two row results
        afpvec8 res;
        res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy;
        res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy;

        buffer_st8(top_blob_data, gi, res);
    }
 }
--- a/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
+++ b/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
@@ -0,0 +1,98 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constants. main() reads local_size (number of workspace planes
 // summed), alpha, beta and bias_constant; region_type is not read by this shader.
 layout (constant_id = 0) const int region_type = 0;
 layout (constant_id = 1) const int local_size = 0;
 layout (constant_id = 2) const float alpha = 0;
 layout (constant_id = 3) const float beta = 0;
 layout (constant_id = 4) const float bias_constant = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: scalar float workspace that main() sums over.
 // binding 1: blob that main() reads, scales and writes back in place.
 layout (binding = 0) readonly buffer square_workspace { float square_workspace_data[]; };
 layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w and cstep to address square_workspace_data.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address bottom_top_blob_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Across-channel LRN normalisation for one pack8 element.
 // For each of the eight lanes, sums local_size consecutive planes of the scalar
 // workspace starting at that lane's plane, then scales the blob element in place by
 // pow(bias_constant + alpha / local_size * sum, -beta).
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));

    // workspace plane index of lanes 0-3 (z4) and lanes 4-7 (zz4)
    ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3);
    ivec4 zz4 = z4 + 4;
    ivec4 v_offset = z4 * p.cstep + gy * p.w + gx;
    ivec4 vv_offset = zz4 * p.cstep + gy * p.w + gx;

    for (int z = 0; z < local_size; z++)
    {
        sum[0].r += afp(square_workspace_data[v_offset.r]);
        sum[0].g += afp(square_workspace_data[v_offset.g]);
        sum[0].b += afp(square_workspace_data[v_offset.b]);
        sum[0].a += afp(square_workspace_data[v_offset.a]);
        sum[1].r += afp(square_workspace_data[vv_offset.r]);
        sum[1].g += afp(square_workspace_data[vv_offset.g]);
        sum[1].b += afp(square_workspace_data[vv_offset.b]);
        sum[1].a += afp(square_workspace_data[vv_offset.a]);

        // step BOTH halves to the next workspace plane; advancing only v_offset
        // would make lanes 4-7 accumulate the same value local_size times
        v_offset += p.cstep;
        vv_offset += p.cstep;
    }

    const afp alpha_div_size = afp(alpha / local_size);
    afpvec8 scale;
    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));

    int gi = gz * p.outcstep + gy * p.outw + gx;

    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);

    v[0] *= scale[0];
    v[1] *= scale[1];

    buffer_st8(bottom_top_blob_data, gi, v);
 }
--- a/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
+++ b/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
@@ -0,0 +1,91 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constants. main() reads local_size (side length of the square
 // window summed), alpha, beta and bias_constant; region_type is not read here.
 layout (constant_id = 0) const int region_type = 0;
 layout (constant_id = 1) const int local_size = 0;
 layout (constant_id = 2) const float alpha = 0;
 layout (constant_id = 3) const float beta = 0;
 layout (constant_id = 4) const float bias_constant = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: workspace of 8-lane float elements (mat2x4) that main() sums over.
 // binding 1: blob that main() reads, scales and writes back in place.
 layout (binding = 0) readonly buffer square_workspace { mat2x4 square_workspace_data[]; };
 layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w and cstep to address square_workspace_data.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address bottom_top_blob_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Within-channel LRN normalisation for one pack8 element.
 // Sums a local_size x local_size window of the workspace whose top-left corner is
 // this invocation's coordinate, then scales the blob element in place by
 // pow(bias_constant + alpha / local_size^2 * sum, -beta).
 void main()
 {
    int px = int(gl_GlobalInvocationID.x);
    int py = int(gl_GlobalInvocationID.y);
    int pz = int(gl_GlobalInvocationID.z);

    bool inside = px < p.outw && py < p.outh && pz < p.outc;
    if (!inside)
        return;

    // support region_type == 1 only

    afpvec8 window_sum = afpvec8(afpvec4(0.f), afpvec4(0.f));

    // walk the window row by row; row_start moves down one workspace row per pass
    int row_start = pz * p.cstep + py * p.w + px;

    for (int ky = 0; ky < local_size; ky++, row_start += p.w)
    {
        for (int kx = 0; kx < local_size; kx++)
        {
            window_sum += afpvec8(square_workspace_data[row_start + kx]);
        }
    }

    const afp alpha_div_area = afp(alpha / (local_size * local_size));

    int dst = pz * p.outcstep + py * p.outw + px;

    afpvec8 elem = buffer_ld8(bottom_top_blob_data, dst);

    // scale each 4-lane half by its own normalisation factor
    elem[0] *= pow(afp(bias_constant) + alpha_div_area * window_sum[0], afpvec4(-beta));
    elem[1] *= pow(afp(bias_constant) + alpha_div_area * window_sum[1], afpvec4(-beta));

    buffer_st8(bottom_top_blob_data, dst, elem);
 }
--- a/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
+++ b/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
@@ -0,0 +1,85 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constants. main() reads only pad_head (offset subtracted from the
 // output plane index); region_type and pad_tail are not read by this shader.
 layout (constant_id = 0) const int region_type = 0;
 layout (constant_id = 1) const int pad_head = 0;
 layout (constant_id = 2) const int pad_tail = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: pack8 source elements.
 // binding 1: scalar float workspace receiving one squared value per invocation.
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer square_workspace { float square_workspace_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses c to bound the source index and w / cstep to address
    // bottom_blob_data.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address square_workspace_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Writes one scalar of the across-channel LRN workspace.
 // Output plane gz maps to scalar source index gz - pad_head. When that index lies
 // inside the p.c * 8 scalars held by the pack8 source, the matching lane is squared;
 // otherwise (head or tail padding) the workspace entry is 0.
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    // support region_type == 0 only

    afp res;

    // Range-check the scalar index BEFORE splitting it into element / lane.
    // Dividing first is unsafe: for -7..-1 the quotient would be expected to truncate
    // to 0 and be treated as valid, and % on a negative operand is undefined in GLSL.
    int z = gz - pad_head;

    if (z >= 0 && z < p.c * 8)
    {
        int v_offset = (z / 8) * p.cstep + gy * p.w + gx;
        afpvec8 v8 = buffer_ld8(bottom_blob_data, v_offset);

        // lane / 4 selects the 4-lane half, lane % 4 the component inside it
        int lane = z % 8;

        afp v = v8[lane / 4][lane % 4];

        res = v * v;
    }
    else
    {
        res = afp(0.f);
    }

    const int gi = gz * p.outcstep + gy * p.outw + gx;

    square_workspace_data[gi] = float(res);
 }
--- a/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
+++ b/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
@@ -0,0 +1,82 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constants. main() reads only pad_head (offset subtracted from the
 // output x and y coordinates); region_type and pad_tail are not read by this shader.
 layout (constant_id = 0) const int region_type = 0;
 layout (constant_id = 1) const int pad_head = 0;
 layout (constant_id = 2) const int pad_tail = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: pack8 source elements.
 // binding 1: workspace of 8-lane float elements (mat2x4), one per invocation.
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer square_workspace { mat2x4 square_workspace_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w / h to bound the source coordinate and w / cstep to address
    // bottom_blob_data.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outcstep to
    // address square_workspace_data.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Writes one 8-lane element of the within-channel LRN workspace.
 // The output coordinate is shifted by pad_head in x and y; positions that land
 // inside the source get the lane-wise square of the source element, positions in
 // the padding border get zeros.
 void main()
 {
    int ox = int(gl_GlobalInvocationID.x);
    int oy = int(gl_GlobalInvocationID.y);
    int oz = int(gl_GlobalInvocationID.z);

    bool inside_output = ox < p.outw && oy < p.outh && oz < p.outc;
    if (!inside_output)
        return;

    // support region_type == 1 only

    // start from the padding value and overwrite it when a source element exists
    afpvec8 squared = afpvec8(afpvec4(0.f), afpvec4(0.f));

    int src_x = ox - pad_head;
    int src_y = oy - pad_head;

    bool inside_source = src_x >= 0 && src_x < p.w && src_y >= 0 && src_y < p.h;

    if (inside_source)
    {
        afpvec8 elem = buffer_ld8(bottom_blob_data, oz * p.cstep + src_y * p.w + src_x);
        squared[0] = elem[0] * elem[0];
        squared[1] = elem[1] * elem[1];
    }

    const int dst = oz * p.outcstep + oy * p.outw + ox;

    square_workspace_data[dst] = mat2x4(squared);
 }
--- a/src/layer/vulkan/shader/reshape_pack1to8.comp
+++ b/src/layer/vulkan/shader/reshape_pack1to8.comp
@@ -0,0 +1,92 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // main() compares ndim against 1, 2 and 3 to choose how an output coordinate maps
 // to eight flat element indices.
 layout (constant_id = 0) const int ndim = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: scalar source elements, binding 1: 8-lane destination elements.
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() uses w / h / cstep to turn a flat index into a bottom_blob_data offset.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outh /
    // outcstep to build flat indices and the top_blob_data offset.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Gathers eight scalars from the unpacked source into one pack8 output element.
 // The eight flat element indices start at a base derived from the output coordinate
 // and are spaced by a stride that depends on ndim; each flat index is then split
 // into (z, y, x) of the source to form its buffer offset.
 void main()
 {
    int ox = int(gl_GlobalInvocationID.x);
    int oy = int(gl_GlobalInvocationID.y);
    int oz = int(gl_GlobalInvocationID.z);

    bool inside = ox < p.outw && oy < p.outh && oz < p.outc;
    if (!inside)
        return;

    // flat index of lane 0 and flat distance between consecutive lanes
    int first;
    int stride;

    if (ndim == 1)
    {
        first = ox * 8;
        stride = 1;
    }
    if (ndim == 2)
    {
        first = (oy * 8) * p.outw + ox;
        stride = p.outw;
    }
    if (ndim == 3)
    {
        first = (oz * 8) * p.outh * p.outw + oy * p.outw + ox;
        stride = p.outh * p.outw;
    }

    // lanes 0-3 and lanes 4-7
    ivec4 lo4 = first + ivec4(0, 1, 2, 3) * stride;
    ivec4 hi4 = lo4 + 4 * stride;

    int plane = p.w * p.h;

    // position of each flat index inside its source plane
    ivec4 lo_rem = lo4 % plane;
    ivec4 hi_rem = hi4 % plane;

    ivec4 lo_offset = (lo4 / plane) * p.cstep + (lo_rem / p.w) * p.w + lo_rem % p.w;
    ivec4 hi_offset = (hi4 / plane) * p.cstep + (hi_rem / p.w) * p.w + hi_rem % p.w;

    int dst = oz * p.outcstep + oy * p.outw + ox;

    buffer_cp1to8(top_blob_data, dst, bottom_blob_data, lo_offset, hi_offset);
 }
--- a/src/layer/vulkan/shader/reshape_pack4.comp
+++ b/src/layer/vulkan/shader/reshape_pack4.comp
@@ -27,7 +27,11 @@ layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
 #if NCNN_fp16_packed
 layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
 #else
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 #endif
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

 layout (push_constant) uniform parameter
@@ -60,21 +64,22 @@ void main()
    if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
    if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;

 #if NCNN_fp16_packed
    ivec4 v_offset;
    ivec4 lane4;
    ivec4 lane2;

    if (p.dims == 1)
    {
        v_offset = i4 / 4;
        lane4 = i4 % 4;
        v_offset = i4 / 2;
        lane2 = i4 % 2;
    }
    else if (p.dims == 2)
    {
        ivec4 y4 = i4 / p.w;
        ivec4 x4 = i4 % p.w;

        v_offset = (y4 / 4) * p.w + x4;
        lane4 = y4 % 4;
        v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2;
        lane2 = y4 % 2;
    }
    else // if (p.dims == 3)
    {
@@ -84,30 +89,55 @@ void main()
        ivec4 y4 = i4 % size / p.w;
        ivec4 x4 = i4 % size % p.w;

        v_offset = (z4 / 4) * p.cstep + y4 * p.w + x4;
        lane4 = z4 % 4;
        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2;
        lane2 = z4 % 2;
    }

    int gi;
    if (ndim == 1)

    if (ndim == 1) gi = gx;
    if (ndim == 2) gi = gy * p.outw + gx;
    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);

    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);

    buffer_st4(top_blob_data, gi, v);
 #else
    ivec4 v_offset;

    if (p.dims == 1)
    {
        gi = gx;
        v_offset = i4;
    }
    if (ndim == 2)
    else if (p.dims == 2)
    {
        gi = gy * p.outw + gx;
        ivec4 y4 = i4 / p.w;
        ivec4 x4 = i4 % p.w;

        v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4;
    }
    if (ndim == 3)
    else // if (p.dims == 3)
    {
        gi = gz * p.outcstep + gy * p.outw + gx;
        int size = p.w * p.h;

        ivec4 z4 = i4 / size;
        ivec4 y4 = i4 % size / p.w;
        ivec4 x4 = i4 % size % p.w;

        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4;
    }

    afpvec4 vr = buffer_ld4(bottom_blob_data, v_offset.r);
    afpvec4 vg = buffer_ld4(bottom_blob_data, v_offset.g);
    afpvec4 vb = buffer_ld4(bottom_blob_data, v_offset.b);
    afpvec4 va = buffer_ld4(bottom_blob_data, v_offset.a);
    int gi;

    afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]);
    if (ndim == 1) gi = gx;
    if (ndim == 2) gi = gy * p.outw + gx;
    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;

    buffer_st4(top_blob_data, gi, v);
    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
 #endif
 }
--- a/src/layer/vulkan/shader/reshape_pack4to1.comp
+++ b/src/layer/vulkan/shader/reshape_pack4to1.comp
@@ -54,11 +54,15 @@ void main()
    if (gx >= p.w || gy >= p.h || gz >= p.c)
        return;

    ivec4 i4;
    ivec3 gxyz = ivec3(gx, gy, gz);

    if (p.dims == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3);
    if (p.dims == 2) i4 = (gy * 4) * p.w + gx + ivec4(0, 1, 2, 3) * p.w;
    if (p.dims == 3) i4 = (gz * 4) * p.h * p.w + gy * p.w + gx + ivec4(0, 1, 2, 3) * p.h * p.w;
    gxyz[p.dims - 1] *= 4;

    int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x;

    ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w);

    ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1];

    ivec4 v_offset;

@@ -66,14 +70,14 @@ void main()
    {
        v_offset = i4;
    }
    else if (ndim == 2)
    if (ndim == 2)
    {
        ivec4 y4 = i4 / p.outw;
        ivec4 x4 = i4 % p.outw;

        v_offset = y4 * p.outw + x4;
    }
    else // if (ndim == 3)
    if (ndim == 3)
    {
        int size = p.outw * p.outh;

--- a/src/layer/vulkan/shader/reshape_pack4to8.comp
+++ b/src/layer/vulkan/shader/reshape_pack4to8.comp
@@ -0,0 +1,184 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // When fp16 storage is enabled, the 8-lane storage element is declared here as two
 // f16vec4 halves. Other sfp*/afp* types and the buffer_* helpers are not declared in
 // this file (presumably injected at shader build time -- TODO confirm).
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // main() compares ndim against 1, 2 and 3 to choose how an output coordinate maps
 // to eight flat element indices.
 layout (constant_id = 0) const int ndim = 0;

 // Workgroup size comes from specialization constants 233 / 234 / 235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // binding 0: the source is addressed per scalar; in the packed-fp16 build it is
 // viewed as 2-lane elements instead, and main() picks one lane of each pair.
 #if NCNN_fp16_packed
 layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
 #else
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 #endif
 // binding 1: 8-lane destination elements.
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

 layout (push_constant) uniform parameter
 {
    // main() branches on dims (1, 2, otherwise 3) and uses w / h / cstep to turn a
    // flat index into a bottom_blob_data offset.
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    // main() uses outw / outh / outc as the dispatch bounds and outw / outh /
    // outcstep to build flat indices and the top_blob_data offset.
    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Gathers eight scalars from a pack4 source into one pack8 output element.
 // ndim decides which eight flat element indices belong to this output element;
 // p.dims decides how a flat index is located inside the pack4 source. In the
 // packed-fp16 build the source is read as 2-lane pairs and one lane is selected,
 // otherwise scalars are copied directly.
 void main()
 {
    int ox = int(gl_GlobalInvocationID.x);
    int oy = int(gl_GlobalInvocationID.y);
    int oz = int(gl_GlobalInvocationID.z);

    bool inside = ox < p.outw && oy < p.outh && oz < p.outc;
    if (!inside)
        return;

    // flat index of lane 0 and flat distance between consecutive lanes
    int first;
    int stride;

    if (ndim == 1)
    {
        first = ox * 8;
        stride = 1;
    }
    if (ndim == 2)
    {
        first = (oy * 8) * p.outw + ox;
        stride = p.outw;
    }
    if (ndim == 3)
    {
        first = (oz * 8) * p.outh * p.outw + oy * p.outw + ox;
        stride = p.outh * p.outw;
    }

    // lanes 0-3 and lanes 4-7
    ivec4 lo4 = first + ivec4(0, 1, 2, 3) * stride;
    ivec4 hi4 = lo4 + 4 * stride;

    // destination element index
    int dst;

    if (ndim == 1) dst = ox;
    if (ndim == 2) dst = oy * p.outw + ox;
    if (ndim == 3) dst = oz * p.outcstep + oy * p.outw + ox;

 #if NCNN_fp16_packed
    // index of the 2-lane pair holding each scalar, and which lane of the pair
    ivec4 lo_pair;
    ivec4 hi_pair;
    ivec4 lo_lane;
    ivec4 hi_lane;

    if (p.dims == 1)
    {
        lo_pair = lo4 / 2;
        hi_pair = hi4 / 2;
        lo_lane = lo4 % 2;
        hi_lane = hi4 % 2;
    }
    else if (p.dims == 2)
    {
        ivec4 lo_y = lo4 / p.w;
        ivec4 lo_x = lo4 % p.w;
        ivec4 hi_y = hi4 / p.w;
        ivec4 hi_x = hi4 % p.w;

        // a pack4 element spans two pairs; (y % 4) / 2 picks the pair
        lo_pair = ((lo_y / 4) * p.w + lo_x) * 2 + (lo_y % 4) / 2;
        hi_pair = ((hi_y / 4) * p.w + hi_x) * 2 + (hi_y % 4) / 2;
        lo_lane = lo_y % 2;
        hi_lane = hi_y % 2;
    }
    else // if (p.dims == 3)
    {
        int plane = p.w * p.h;

        ivec4 lo_rem = lo4 % plane;
        ivec4 hi_rem = hi4 % plane;

        ivec4 lo_z = lo4 / plane;
        ivec4 lo_y = lo_rem / p.w;
        ivec4 lo_x = lo_rem % p.w;
        ivec4 hi_z = hi4 / plane;
        ivec4 hi_y = hi_rem / p.w;
        ivec4 hi_x = hi_rem % p.w;

        lo_pair = ((lo_z / 4) * p.cstep + lo_y * p.w + lo_x) * 2 + (lo_z % 4) / 2;
        hi_pair = ((hi_z / 4) * p.cstep + hi_y * p.w + hi_x) * 2 + (hi_z % 4) / 2;
        lo_lane = lo_z % 2;
        hi_lane = hi_z % 2;
    }

    afpvec2 lo_r = buffer_ld2(bottom_blob_data, lo_pair.r);
    afpvec2 lo_g = buffer_ld2(bottom_blob_data, lo_pair.g);
    afpvec2 lo_b = buffer_ld2(bottom_blob_data, lo_pair.b);
    afpvec2 lo_a = buffer_ld2(bottom_blob_data, lo_pair.a);

    afpvec2 hi_r = buffer_ld2(bottom_blob_data, hi_pair.r);
    afpvec2 hi_g = buffer_ld2(bottom_blob_data, hi_pair.g);
    afpvec2 hi_b = buffer_ld2(bottom_blob_data, hi_pair.b);
    afpvec2 hi_a = buffer_ld2(bottom_blob_data, hi_pair.a);

    afpvec4 lo_half = afpvec4(lo_r[lo_lane.r], lo_g[lo_lane.g], lo_b[lo_lane.b], lo_a[lo_lane.a]);
    afpvec4 hi_half = afpvec4(hi_r[hi_lane.r], hi_g[hi_lane.g], hi_b[hi_lane.b], hi_a[hi_lane.a]);

    buffer_st8(top_blob_data, dst, afpvec8(lo_half, hi_half));
 #else
    // scalar offset of each source value inside the pack4 buffer
    ivec4 lo_offset;
    ivec4 hi_offset;

    if (p.dims == 1)
    {
        lo_offset = lo4;
        hi_offset = hi4;
    }
    else if (p.dims == 2)
    {
        ivec4 lo_y = lo4 / p.w;
        ivec4 lo_x = lo4 % p.w;
        ivec4 hi_y = hi4 / p.w;
        ivec4 hi_x = hi4 % p.w;

        lo_offset = ((lo_y / 4) * p.w + lo_x) * 4 + lo_y % 4;
        hi_offset = ((hi_y / 4) * p.w + hi_x) * 4 + hi_y % 4;
    }
    else // if (p.dims == 3)
    {
        int plane = p.w * p.h;

        ivec4 lo_rem = lo4 % plane;
        ivec4 hi_rem = hi4 % plane;

        ivec4 lo_z = lo4 / plane;
        ivec4 lo_y = lo_rem / p.w;
        ivec4 lo_x = lo_rem % p.w;
        ivec4 hi_z = hi4 / plane;
        ivec4 hi_y = hi_rem / p.w;
        ivec4 hi_x = hi_rem % p.w;

        lo_offset = ((lo_z / 4) * p.cstep + lo_y * p.w + lo_x) * 4 + lo_z % 4;
        hi_offset = ((hi_z / 4) * p.cstep + hi_y * p.w + hi_x) * 4 + hi_z % 4;
    }

    buffer_cp1to8(top_blob_data, dst, bottom_blob_data, lo_offset, hi_offset);
 #endif
 }
--- a/src/layer/vulkan/shader/reshape_pack8.comp
+++ b/src/layer/vulkan/shader/reshape_pack8.comp
@@ -0,0 +1,184 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // With fp16 storage enabled, require the 16-bit storage extension and declare
 // sfpvec8 as a struct of two f16vec4 members (abcd, efgh).
 // NOTE(review): sfp, sfpvec2 and (without fp16 storage) sfpvec8 are not defined in
 // this file - presumably supplied by a preamble prepended at compile time; confirm.
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 // With fp16 arithmetic enabled, require explicit float16 arithmetic types.
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constant 0 (default 0). main() branches on the values 1, 2 and 3
 // to pick the indexing formula used for the output.
 layout (constant_id = 0) const int ndim = 0;

 // Workgroup size along x/y/z is supplied through specialization constants 233/234/235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Binding 0: read-only input. It is viewed as an array of sfpvec2 when
 // NCNN_fp16_packed is set and as an array of sfp otherwise; main() picks
 // individual values out of it in both cases.
 #if NCNN_fp16_packed
 layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
 #else
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 #endif
 // Binding 1: write-only output, one sfpvec8 per invocation.
 layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };

 // Push constants. main() uses dims, w, h and cstep to address bottom_blob_data, and
 // outw, outh, outc and outcstep for the bounds check and to address top_blob_data;
 // c and outdims are not read by this shader.
 // NOTE(review): presumably these describe the input and output blob shapes, with
 // cstep / outcstep the element stride between channels - confirm against the host code.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 #if NCNN_fp16_packed
 // Addressing of bottom_blob_data when it is declared as an array of sfpvec2.
 // idx:  four linear element indices into the input tensor
 // word: receives the bottom_blob_data index that holds each element
 // comp: receives which of the two components of that word to pick (0 or 1)
 void bottom_word_index(ivec4 idx, out ivec4 word, out ivec4 comp)
 {
    if (p.dims == 1)
    {
        word = idx / 2;
        comp = idx % 2;
        return;
    }

    if (p.dims == 2)
    {
        ivec4 row = idx / p.w;
        ivec4 col = idx % p.w;

        // rows are grouped by 8; every (group, column) pair spans 4 words
        word = ((row / 8) * p.w + col) * 4 + (row % 8) / 2;
        comp = row % 2;
        return;
    }

    // every other dims value is addressed with the 3-D formula
    int plane = p.w * p.h;

    ivec4 ch = idx / plane;
    ivec4 rem = idx % plane;
    ivec4 row = rem / p.w;
    ivec4 col = rem % p.w;

    // channels are grouped by 8, with p.cstep positions between groups
    word = ((ch / 8) * p.cstep + row * p.w + col) * 4 + (ch % 8) / 2;
    comp = ch % 2;
 }
 #else
 // Addressing of bottom_blob_data when it is declared as an array of sfp.
 // Returns, for four linear element indices into the input tensor, the
 // bottom_blob_data index that holds each element.
 ivec4 bottom_scalar_index(ivec4 idx)
 {
    if (p.dims == 1)
        return idx;

    if (p.dims == 2)
    {
        ivec4 row = idx / p.w;
        ivec4 col = idx % p.w;

        // rows are grouped by 8; every (group, column) pair spans 8 scalars
        return ((row / 8) * p.w + col) * 8 + row % 8;
    }

    // every other dims value is addressed with the 3-D formula
    int plane = p.w * p.h;

    ivec4 ch = idx / plane;
    ivec4 rem = idx % plane;
    ivec4 row = rem / p.w;
    ivec4 col = rem % p.w;

    // channels are grouped by 8, with p.cstep positions between groups
    return ((ch / 8) * p.cstep + row * p.w + col) * 8 + ch % 8;
 }
 #endif

 // Each invocation inside the output extent fills top_blob_data[gi] with eight
 // values gathered from bottom_blob_data. The eight values are consecutive along
 // the outermost output axis selected by the ndim specialization constant.
 void main()
 {
    ivec3 g = ivec3(gl_GlobalInvocationID);

    // invocations outside the output extent have nothing to write
    if (any(greaterThanEqual(g, ivec3(p.outw, p.outh, p.outc))))
        return;

    // first:  linear index of the first of the eight gathered values
    // stride: linear distance between consecutive gathered values
    // gi:     index of the top_blob_data element written by this invocation
    int first;
    int stride;
    int gi;

    if (ndim == 1)
    {
        first = g.x * 8;
        stride = 1;
        gi = g.x;
    }
    else if (ndim == 2)
    {
        first = g.y * 8 * p.outw + g.x;
        stride = p.outw;
        gi = g.y * p.outw + g.x;
    }
    else if (ndim == 3)
    {
        stride = p.outh * p.outw;
        first = g.z * 8 * stride + g.y * p.outw + g.x;
        gi = g.z * p.outcstep + g.y * p.outw + g.x;
    }

    // values 0..3 and values 4..7 of the output element
    ivec4 lo_idx = first + ivec4(0, 1, 2, 3) * stride;
    ivec4 hi_idx = lo_idx + 4 * stride;

 #if NCNN_fp16_packed
    ivec4 lo_word;
    ivec4 lo_comp;
    ivec4 hi_word;
    ivec4 hi_comp;

    bottom_word_index(lo_idx, lo_word, lo_comp);
    bottom_word_index(hi_idx, hi_word, hi_comp);

    // load the eight two-component words, then keep one component of each
    afpvec2 lo0 = buffer_ld2(bottom_blob_data, lo_word.x);
    afpvec2 lo1 = buffer_ld2(bottom_blob_data, lo_word.y);
    afpvec2 lo2 = buffer_ld2(bottom_blob_data, lo_word.z);
    afpvec2 lo3 = buffer_ld2(bottom_blob_data, lo_word.w);

    afpvec2 hi0 = buffer_ld2(bottom_blob_data, hi_word.x);
    afpvec2 hi1 = buffer_ld2(bottom_blob_data, hi_word.y);
    afpvec2 hi2 = buffer_ld2(bottom_blob_data, hi_word.z);
    afpvec2 hi3 = buffer_ld2(bottom_blob_data, hi_word.w);

    afpvec8 v = afpvec8(lo0[lo_comp.x], lo1[lo_comp.y], lo2[lo_comp.z], lo3[lo_comp.w], hi0[hi_comp.x], hi1[hi_comp.y], hi2[hi_comp.z], hi3[hi_comp.w]);

    buffer_st8(top_blob_data, gi, v);
 #else
    ivec4 lo_offset = bottom_scalar_index(lo_idx);
    ivec4 hi_offset = bottom_scalar_index(hi_idx);

    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, lo_offset, hi_offset);
 #endif
 }
--- a/src/layer/vulkan/shader/reshape_pack8to1.comp
+++ b/src/layer/vulkan/shader/reshape_pack8to1.comp
@@ -0,0 +1,105 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // With fp16 storage enabled, require the 16-bit storage extension and declare
 // sfpvec8 as a struct of two f16vec4 members (abcd, efgh).
 // NOTE(review): sfp and (without fp16 storage) sfpvec8 are not defined in this
 // file - presumably supplied by a preamble prepended at compile time; confirm.
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 // With fp16 arithmetic enabled, require explicit float16 arithmetic types.
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constant 0 (default 0). main() branches on the values 1, 2 and 3
 // to pick the indexing formula used for the output.
 layout (constant_id = 0) const int ndim = 0;

 // Workgroup size along x/y/z is supplied through specialization constants 233/234/235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Binding 0: read-only input, one sfpvec8 consumed per invocation.
 layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
 // Binding 1: write-only output, addressed one sfp value at a time.
 layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; };

 // Push constants. main() uses w, h and c for the bounds check, dims, w, h and cstep
 // to locate the input element, and outw, outh and outcstep to address
 // top_blob_data; outdims and outc are not read by this shader.
 // NOTE(review): presumably these describe the input and output blob shapes, with
 // cstep / outcstep the element stride between channels - confirm against the host code.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Maps four linear element indices of the output tensor to indices into
 // top_blob_data. The formula is chosen by the ndim specialization constant;
 // for ndim == 3 consecutive planes are p.outcstep elements apart.
 ivec4 top_scalar_index(ivec4 idx)
 {
    ivec4 pos;

    if (ndim == 1)
    {
        pos = idx;
    }
    else if (ndim == 2)
    {
        pos = (idx / p.outw) * p.outw + idx % p.outw;
    }
    else if (ndim == 3)
    {
        int plane = p.outw * p.outh;
        ivec4 rem = idx % plane;

        pos = (idx / plane) * p.outcstep + (rem / p.outw) * p.outw + rem % p.outw;
    }

    return pos;
 }

 // Each invocation inside the input extent takes the input element at (gx, gy, gz)
 // and scatters its eight values to eight separate positions of top_blob_data.
 void main()
 {
    ivec3 g = ivec3(gl_GlobalInvocationID);

    // invocations outside the input extent have nothing to read
    if (any(greaterThanEqual(g, ivec3(p.w, p.h, p.c))))
        return;

    // the eight values lie along axis p.dims - 1 of the input
    int axis = p.dims - 1;

    // linear distance between neighbours along x, y and z
    ivec3 strides = ivec3(1, p.w, p.h * p.w);
    int stride = strides[axis];

    // coordinate of the first of the eight values: scale the chosen axis by 8
    ivec3 origin = g;
    origin[axis] *= 8;

    int first = origin.z * p.h * p.w + origin.y * p.w + origin.x;

    // linear indices of values 0..3 and values 4..7
    ivec4 lo_idx = first + ivec4(0, 1, 2, 3) * stride;
    ivec4 hi_idx = lo_idx + 4 * stride;

    ivec4 lo_dst = top_scalar_index(lo_idx);
    ivec4 hi_dst = top_scalar_index(hi_idx);

    // index of the input element read by this invocation
    int gi = g.z * p.cstep + g.y * p.w + g.x;

    buffer_cp8to1(top_blob_data, lo_dst, hi_dst, bottom_blob_data, gi);
 }
--- a/src/layer/vulkan/shader/reshape_pack8to4.comp
+++ b/src/layer/vulkan/shader/reshape_pack8to4.comp
@@ -0,0 +1,144 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // With fp16 storage enabled, require the 16-bit storage extension and declare
 // sfpvec8 as a struct of two f16vec4 members (abcd, efgh). The sfpvec8 name is not
 // referenced by any other declaration or by main() in this shader.
 // NOTE(review): sfp, sfpvec2 and sfpvec4 are not defined in this file - presumably
 // supplied by a preamble prepended at compile time; confirm.
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_16bit_storage: require
 struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #endif
 // With fp16 arithmetic enabled, require explicit float16 arithmetic types.
 #if NCNN_fp16_arithmetic
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 // Specialization constant 0 (default 0). main() branches on the values 1, 2 and 3
 // to pick the indexing formula used for the output.
 layout (constant_id = 0) const int ndim = 0;

 // Workgroup size along x/y/z is supplied through specialization constants 233/234/235.
 layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;

 // Binding 0: read-only input. It is viewed as an array of sfpvec2 when
 // NCNN_fp16_packed is set and as an array of sfp otherwise; main() picks
 // individual values out of it in both cases.
 #if NCNN_fp16_packed
 layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
 #else
 layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
 #endif
 // Binding 1: write-only output, one sfpvec4 per invocation.
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };

 // Push constants. main() uses dims, w, h and cstep to address bottom_blob_data, and
 // outw, outh, outc and outcstep for the bounds check and to address top_blob_data;
 // c and outdims are not read by this shader.
 // NOTE(review): presumably these describe the input and output blob shapes, with
 // cstep / outcstep the element stride between channels - confirm against the host code.
 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Each invocation inside the output extent fills top_blob_data[gi] with four
 // values gathered from bottom_blob_data. The four values are consecutive along
 // the outermost output axis selected by the ndim specialization constant.
 void main()
 {
    ivec3 g = ivec3(gl_GlobalInvocationID);

    // invocations outside the output extent have nothing to write
    if (any(greaterThanEqual(g, ivec3(p.outw, p.outh, p.outc))))
        return;

    // first:  linear index of the first of the four gathered values
    // stride: linear distance between consecutive gathered values
    // gi:     index of the top_blob_data element written by this invocation
    int first;
    int stride;
    int gi;

    if (ndim == 1)
    {
        first = g.x * 4;
        stride = 1;
        gi = g.x;
    }
    else if (ndim == 2)
    {
        first = g.y * 4 * p.outw + g.x;
        stride = p.outw;
        gi = g.y * p.outw + g.x;
    }
    else if (ndim == 3)
    {
        stride = p.outh * p.outw;
        first = g.z * 4 * stride + g.y * p.outw + g.x;
        gi = g.z * p.outcstep + g.y * p.outw + g.x;
    }

    ivec4 idx = first + ivec4(0, 1, 2, 3) * stride;

    // A 1-D input is addressed by idx directly. Otherwise idx is split into the
    // coordinate along the outermost input axis (lead) and the offset below it
    // (inner); pitch is the factor applied to lead / 8 in the address formula.
    bool is_1d = (p.dims == 1);

    ivec4 lead;
    ivec4 inner;
    int pitch;

    if (p.dims == 2)
    {
        lead = idx / p.w;
        inner = idx % p.w;
        pitch = p.w;
    }
    else if (!is_1d)
    {
        // every dims value other than 1 and 2 is addressed with the 3-D formula
        int plane = p.w * p.h;
        ivec4 rem = idx % plane;

        lead = idx / plane;
        inner = (rem / p.w) * p.w + rem % p.w;
        pitch = p.cstep;
    }

 #if NCNN_fp16_packed
    // word: bottom_blob_data index of the two-component word holding each value
    // comp: which component of that word to keep (0 or 1)
    ivec4 word;
    ivec4 comp;

    if (is_1d)
    {
        word = idx / 2;
        comp = idx % 2;
    }
    else
    {
        word = ((lead / 8) * pitch + inner) * 4 + (lead % 8) / 2;
        comp = lead % 2;
    }

    afpvec2 w0 = buffer_ld2(bottom_blob_data, word.x);
    afpvec2 w1 = buffer_ld2(bottom_blob_data, word.y);
    afpvec2 w2 = buffer_ld2(bottom_blob_data, word.z);
    afpvec2 w3 = buffer_ld2(bottom_blob_data, word.w);

    afpvec4 v = afpvec4(w0[comp.x], w1[comp.y], w2[comp.z], w3[comp.w]);

    buffer_st4(top_blob_data, gi, v);
 #else
    // offset: bottom_blob_data index of each scalar value
    ivec4 offset;

    if (is_1d)
    {
        offset = idx;
    }
    else
    {
        offset = ((lead / 8) * pitch + inner) * 8 + lead % 8;
    }

    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, offset);
 #endif
 }