From 4c381bee47cb99806ffb2c19b032a65351344e00 Mon Sep 17 00:00:00 2001
From: nihui <shuizhuyuanluo@126.com>
Date: Tue, 21 Jan 2020 14:22:54 +0800
Subject: [PATCH] reduce reshape pack4 bandwidth, add some pack8 shaders

---
 .../vulkan/shader/interp_bicubic_pack8.comp   | 115 +++++++++++
 src/layer/vulkan/shader/interp_pack8.comp     | 120 ++++++++++++
 .../shader/lrn_norm_across_channel_pack8.comp |  98 ++++++++++
 .../shader/lrn_norm_within_channel_pack8.comp |  91 +++++++++
 .../lrn_square_pad_across_channel_pack8.comp  |  85 ++++++++
 .../lrn_square_pad_within_channel_pack8.comp  |  82 ++++++++
 src/layer/vulkan/shader/reshape_pack1to8.comp |  92 +++++++++
 src/layer/vulkan/shader/reshape_pack4.comp    |  70 +++++--
 src/layer/vulkan/shader/reshape_pack4to1.comp |  16 +-
 src/layer/vulkan/shader/reshape_pack4to8.comp | 184 ++++++++++++++++++
 src/layer/vulkan/shader/reshape_pack8.comp    | 184 ++++++++++++++++++
 src/layer/vulkan/shader/reshape_pack8to1.comp | 105 ++++++++++
 src/layer/vulkan/shader/reshape_pack8to4.comp | 144 ++++++++++++++
 13 files changed, 1360 insertions(+), 26 deletions(-)
 create mode 100644 src/layer/vulkan/shader/interp_bicubic_pack8.comp
 create mode 100644 src/layer/vulkan/shader/interp_pack8.comp
 create mode 100644 src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
 create mode 100644 src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
 create mode 100644 src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
 create mode 100644 src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
 create mode 100644 src/layer/vulkan/shader/reshape_pack1to8.comp
 create mode 100644 src/layer/vulkan/shader/reshape_pack4to8.comp
 create mode 100644 src/layer/vulkan/shader/reshape_pack8.comp
 create mode 100644 src/layer/vulkan/shader/reshape_pack8to1.comp
 create mode 100644 src/layer/vulkan/shader/reshape_pack8to4.comp

diff --git a/src/layer/vulkan/shader/interp_bicubic_pack8.comp b/src/layer/vulkan/shader/interp_bicubic_pack8.comp
new file mode 100644
index 000000000..6d7794f3c
--- /dev/null
+++ b/src/layer/vulkan/shader/interp_bicubic_pack8.comp
@@ -0,0 +1,115 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Bicubic resize for blobs packed 8 lanes per element; main() blends a 4x4 tap window per output.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// main() indexes alpha/xofs by output column and beta/yofs by output row
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+layout (binding = 2) readonly buffer alpha_blob { sfpvec4 alpha_blob_data[]; };
+layout (binding = 3) readonly buffer xofs_blob { int xofs_blob_data[]; };
+layout (binding = 4) readonly buffer beta_blob { sfpvec4 beta_blob_data[]; };
+layout (binding = 5) readonly buffer yofs_blob { int yofs_blob_data[]; };
+// w and cstep address bottom_blob; outw, outh, outc and outcstep bound and address top_blob
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    // One invocation per output element; invocations outside the output blob do nothing.
+    int ox = int(gl_GlobalInvocationID.x);
+    int oy = int(gl_GlobalInvocationID.y);
+    int oz = int(gl_GlobalInvocationID.z);
+    if (ox >= p.outw || oy >= p.outh || oz >= p.outc)
+        return;
+
+    // Source column/row looked up for this output; the 4x4 taps span -1 .. +2 around it.
+    int src_x = xofs_blob_data[ox];
+    int src_y = yofs_blob_data[oy];
+
+    // Element offsets of the four source rows at column src_x.
+    int plane = oz * p.cstep + src_x;
+    int row0 = plane + (src_y - 1) * p.w;
+    int row1 = plane + (src_y + 0) * p.w;
+    int row2 = plane + (src_y + 1) * p.w;
+    int row3 = plane + (src_y + 2) * p.w;
+
+    // Horizontal pass: weight the four taps of every row with the per-column coefficients.
+    afpvec4 wx = buffer_ld4(alpha_blob_data, ox);
+
+    afpvec8 t00 = buffer_ld8(bottom_blob_data, row0 - 1);
+    afpvec8 t01 = buffer_ld8(bottom_blob_data, row0 + 0);
+    afpvec8 t02 = buffer_ld8(bottom_blob_data, row0 + 1);
+    afpvec8 t03 = buffer_ld8(bottom_blob_data, row0 + 2);
+    afpvec8 h0;
+    h0[0] = t00[0] * wx.x + t01[0] * wx.y + t02[0] * wx.z + t03[0] * wx.w;
+    h0[1] = t00[1] * wx.x + t01[1] * wx.y + t02[1] * wx.z + t03[1] * wx.w;
+
+    afpvec8 t10 = buffer_ld8(bottom_blob_data, row1 - 1);
+    afpvec8 t11 = buffer_ld8(bottom_blob_data, row1 + 0);
+    afpvec8 t12 = buffer_ld8(bottom_blob_data, row1 + 1);
+    afpvec8 t13 = buffer_ld8(bottom_blob_data, row1 + 2);
+    afpvec8 h1;
+    h1[0] = t10[0] * wx.x + t11[0] * wx.y + t12[0] * wx.z + t13[0] * wx.w;
+    h1[1] = t10[1] * wx.x + t11[1] * wx.y + t12[1] * wx.z + t13[1] * wx.w;
+
+    afpvec8 t20 = buffer_ld8(bottom_blob_data, row2 - 1);
+    afpvec8 t21 = buffer_ld8(bottom_blob_data, row2 + 0);
+    afpvec8 t22 = buffer_ld8(bottom_blob_data, row2 + 1);
+    afpvec8 t23 = buffer_ld8(bottom_blob_data, row2 + 2);
+    afpvec8 h2;
+    h2[0] = t20[0] * wx.x + t21[0] * wx.y + t22[0] * wx.z + t23[0] * wx.w;
+    h2[1] = t20[1] * wx.x + t21[1] * wx.y + t22[1] * wx.z + t23[1] * wx.w;
+
+    afpvec8 t30 = buffer_ld8(bottom_blob_data, row3 - 1);
+    afpvec8 t31 = buffer_ld8(bottom_blob_data, row3 + 0);
+    afpvec8 t32 = buffer_ld8(bottom_blob_data, row3 + 1);
+    afpvec8 t33 = buffer_ld8(bottom_blob_data, row3 + 2);
+    afpvec8 h3;
+    h3[0] = t30[0] * wx.x + t31[0] * wx.y + t32[0] * wx.z + t33[0] * wx.w;
+    h3[1] = t30[1] * wx.x + t31[1] * wx.y + t32[1] * wx.z + t33[1] * wx.w;
+
+    // Vertical pass: combine the four row results with the per-row coefficients.
+    afpvec4 wy = buffer_ld4(beta_blob_data, oy);
+
+    afpvec8 res;
+    res[0] = h0[0] * wy.x + h1[0] * wy.y + h2[0] * wy.z + h3[0] * wy.w;
+    res[1] = h0[1] * wy.x + h1[1] * wy.y + h2[1] * wy.z + h3[1] * wy.w;
+
+    const int dst = oz * p.outcstep + oy * p.outw + ox;
+    buffer_st8(top_blob_data, dst, res);
+}
diff --git a/src/layer/vulkan/shader/interp_pack8.comp b/src/layer/vulkan/shader/interp_pack8.comp
new file mode 100644
index 000000000..8ed2f1ddd
--- /dev/null
+++ b/src/layer/vulkan/shader/interp_pack8.comp
@@ -0,0 +1,120 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Nearest / bilinear resize for blobs packed 8 lanes per element.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() handles resize_type 1 (nearest) and 2 (bilinear); any other value writes nothing
+layout (constant_id = 0) const int resize_type = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// source blob (read only) and resized result (write only)
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+// w, h and cstep address bottom_blob; the out* members bound and address top_blob
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+
+    float scale_x; // output x is multiplied by this to obtain the source x
+    float scale_y; // output y is multiplied by this to obtain the source y
+} p;
+
+void main()
+{
+    // Resizes one 8-lane element per invocation: resize_type 1 copies the nearest source
+    // element, resize_type 2 blends a 2x2 neighbourhood; other values write nothing.
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx;
+
+    if (resize_type == 1) // nearest
+    {
+        afpvec2 gxy = afpvec2(gx, gy);
+        ivec2 sxy_max = ivec2(p.w - 1, p.h - 1);
+        ivec2 sxy = min(ivec2(floor(gxy * afpvec2(p.scale_x, p.scale_y))), sxy_max);
+
+        int sx = sxy.r;
+        int sy = sxy.g;
+        int v_offset = gz * p.cstep + sy * p.w + sx;
+
+        buffer_cp8(top_blob_data, gi, bottom_blob_data, v_offset);
+    }
+    else if (resize_type == 2) // bilinear
+    {
+        afpvec2 gxy = afpvec2(gx, gy);
+        afpvec2 fxy = (gxy + afp(0.5f)) * afpvec2(p.scale_x, p.scale_y) - afp(0.5f);
+        ivec2 sxy = ivec2(floor(fxy));
+        // keep only the fractional part as the blend weight
+        fxy -= afpvec2(sxy);
+
+        ivec2 sxy_max = ivec2(p.w - 2, p.h - 2);
+        bvec2 underflow = lessThan(sxy, ivec2(0));
+        bvec2 overflow = greaterThan(sxy, sxy_max);
+        sxy = clamp(sxy, ivec2(0), sxy_max);
+        // outside the sampling range pin the weight so only the outermost sample contributes
+        fxy = mix(fxy, afpvec2(0.f), underflow);
+        fxy = mix(fxy, afpvec2(1.f), overflow);
+
+        int sx = sxy.r;
+        int sy = sxy.g;
+
+        int v_offset_0 = gz * p.cstep + sy * p.w + sx;
+        int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;
+
+        afpvec8 a0 = buffer_ld8(bottom_blob_data, v_offset_0);
+        afpvec8 a1 = buffer_ld8(bottom_blob_data, v_offset_0 + 1);
+        afpvec8 b0 = buffer_ld8(bottom_blob_data, v_offset_1);
+        afpvec8 b1 = buffer_ld8(bottom_blob_data, v_offset_1 + 1);
+
+        afp fx = fxy.r;
+        afp fy = fxy.g;
+        // horizontal blend; both 4-lane halves of each row must be written because
+        // the vertical blend below reads a[0], a[1], b[0] and b[1]
+        afpvec8 a;
+        afpvec8 b;
+        a[0] = a0[0] * (afp(1.f) - fx) + a1[0] * fx;
+        a[1] = a0[1] * (afp(1.f) - fx) + a1[1] * fx;
+        b[0] = b0[0] * (afp(1.f) - fx) + b1[0] * fx;
+        b[1] = b0[1] * (afp(1.f) - fx) + b1[1] * fx;
+        afpvec8 res;
+        res[0] = a[0] * (afp(1.f) - fy) + b[0] * fy;
+        res[1] = a[1] * (afp(1.f) - fy) + b[1] * fy;
+
+        buffer_st8(top_blob_data, gi, res);
+    }
+}
diff --git a/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp b/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
new file mode 100644
index 000000000..8e6fe072f
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_norm_across_channel_pack8.comp
@@ -0,0 +1,98 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// LRN across channels for blobs packed 8 lanes per element; the blob is rescaled in place.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() computes pow(bias_constant + alpha / local_size * sum, -beta); region_type is not read
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int local_size = 0;
+layout (constant_id = 2) const float alpha = 0;
+layout (constant_id = 3) const float beta = 0;
+layout (constant_id = 4) const float bias_constant = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// the workspace holds one float per unpacked channel value; main() reads it at gz * 8 + lane
+layout (binding = 0) readonly buffer square_workspace { float square_workspace_data[]; };
+layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };
+// w and cstep address square_workspace; the out* members bound and address bottom_top_blob
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // support region_type == 0 only
+    // accumulate local_size consecutive workspace channels per lane, then scale the blob in place
+    afpvec8 sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
+    // workspace offsets of the 8 unpacked channels of this element: lanes 0-3 and lanes 4-7
+    ivec4 z4 = gz * 8 + ivec4(0, 1, 2, 3);
+    ivec4 zz4 = z4 + 4;
+    ivec4 v_offset = z4 * p.cstep + gy * p.w + gx;
+    ivec4 vv_offset = zz4 * p.cstep + gy * p.w + gx;
+
+    for (int z = 0; z < local_size; z++)
+    {
+        sum[0].r += afp(square_workspace_data[v_offset.r]);
+        sum[0].g += afp(square_workspace_data[v_offset.g]);
+        sum[0].b += afp(square_workspace_data[v_offset.b]);
+        sum[0].a += afp(square_workspace_data[v_offset.a]);
+        sum[1].r += afp(square_workspace_data[vv_offset.r]);
+        sum[1].g += afp(square_workspace_data[vv_offset.g]);
+        sum[1].b += afp(square_workspace_data[vv_offset.b]);
+        sum[1].a += afp(square_workspace_data[vv_offset.a]);
+        // step BOTH lane groups to the next workspace channel of the window
+        v_offset += p.cstep;
+        vv_offset += p.cstep;
+    }
+
+    const afp alpha_div_size = afp(alpha / local_size);
+    afpvec8 scale;
+    scale[0] = pow(afp(bias_constant) + alpha_div_size * sum[0], afpvec4(-beta));
+    scale[1] = pow(afp(bias_constant) + alpha_div_size * sum[1], afpvec4(-beta));
+
+    int gi = gz * p.outcstep + gy * p.outw + gx;
+    afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);
+
+    v[0] *= scale[0];
+    v[1] *= scale[1];
+
+    buffer_st8(bottom_top_blob_data, gi, v);
+}
diff --git a/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp b/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
new file mode 100644
index 000000000..9f3621444
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_norm_within_channel_pack8.comp
@@ -0,0 +1,91 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// LRN within a channel for blobs packed 8 lanes per element; the blob is rescaled in place.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() computes pow(bias_constant + alpha / local_size^2 * sum, -beta); region_type is not read
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int local_size = 0;
+layout (constant_id = 2) const float alpha = 0;
+layout (constant_id = 3) const float beta = 0;
+layout (constant_id = 4) const float bias_constant = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// the workspace holds one mat2x4 (8 floats) per packed element
+layout (binding = 0) readonly buffer square_workspace { mat2x4 square_workspace_data[]; };
+layout (binding = 1) buffer bottom_top_blob { sfpvec8 bottom_top_blob_data[]; };
+// w and cstep address square_workspace; the out* members bound and address bottom_top_blob
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    // One invocation per packed element of the blob that is normalised in place.
+    int ex = int(gl_GlobalInvocationID.x);
+    int ey = int(gl_GlobalInvocationID.y);
+    int ez = int(gl_GlobalInvocationID.z);
+    if (ex >= p.outw || ey >= p.outh || ez >= p.outc)
+        return;
+
+    // support region_type == 1 only
+
+    // Sum the local_size x local_size workspace window whose first element sits at (ex, ey).
+    afpvec8 acc = afpvec8(afpvec4(0.f), afpvec4(0.f));
+    int row_start = ez * p.cstep + ey * p.w + ex;
+    for (int wy = 0; wy < local_size; wy++)
+    {
+        for (int wx = 0; wx < local_size; wx++)
+        {
+            acc += afpvec8(square_workspace_data[row_start + wx]);
+        }
+        row_start += p.w;
+    }
+
+    // factor = pow(bias_constant + alpha / local_size^2 * acc, -beta), evaluated per lane
+    const afp alpha_div_size = afp(alpha / (local_size * local_size));
+    afpvec4 neg_beta = afpvec4(-beta);
+
+    afpvec8 factor;
+    factor[0] = pow(afp(bias_constant) + alpha_div_size * acc[0], neg_beta);
+    factor[1] = pow(afp(bias_constant) + alpha_div_size * acc[1], neg_beta);
+
+    // Read-modify-write of the blob element.
+    int idx = ez * p.outcstep + ey * p.outw + ex;
+    afpvec8 elem = buffer_ld8(bottom_top_blob_data, idx);
+    elem[0] = elem[0] * factor[0];
+    elem[1] = elem[1] * factor[1];
+
+    buffer_st8(bottom_top_blob_data, idx, elem);
+}
diff --git a/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp b/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
new file mode 100644
index 000000000..fcfdf9c8d
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_square_pad_across_channel_pack8.comp
@@ -0,0 +1,85 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Writes squared input values into a float workspace that is zero-padded along the channel axis.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() subtracts pad_head from the workspace channel; region_type and pad_tail are not read
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int pad_head = 0;
+layout (constant_id = 2) const int pad_tail = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// packed source blob (read only) and one float per unpacked channel value (write only)
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer square_workspace { float square_workspace_data[]; };
+// w, c and cstep describe bottom_blob; the out* members bound and address square_workspace
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    // support region_type == 0 only
+    // writes the square of one unpacked channel value, or zero inside the channel padding
+    afp res;
+    // unpacked source channel of this workspace channel; negative inside the head padding
+    int z = gz - pad_head;
+    // range-check BEFORE dividing: a truncating division would map -7..-1 to packed channel 0
+    if (z >= 0 && z < p.c * 8)
+    {
+        int v_offset = (z / 8) * p.cstep + gy * p.w + gx;
+        afpvec8 v8 = buffer_ld8(bottom_blob_data, v_offset);
+
+        int lane = z % 8;
+
+        afp v = v8[lane / 4][lane % 4];
+
+        res = v * v;
+    }
+    else
+    {
+        res = afp(0.f);
+    }
+
+    const int gi = gz * p.outcstep + gy * p.outw + gx;
+
+    square_workspace_data[gi] = float(res);
+}
diff --git a/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp b/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
new file mode 100644
index 000000000..49118dbd3
--- /dev/null
+++ b/src/layer/vulkan/shader/lrn_square_pad_within_channel_pack8.comp
@@ -0,0 +1,82 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Writes lane-wise squared input elements into a workspace that is zero-padded in x and y.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() subtracts pad_head from both x and y; region_type and pad_tail are not read
+layout (constant_id = 0) const int region_type = 0;
+layout (constant_id = 1) const int pad_head = 0;
+layout (constant_id = 2) const int pad_tail = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// packed source blob (read only) and one mat2x4 (8 floats) per packed element (write only)
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer square_workspace { mat2x4 square_workspace_data[]; };
+// w, h and cstep describe bottom_blob; the out* members bound and address square_workspace
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    // One invocation per packed element of the padded workspace.
+    int wx = int(gl_GlobalInvocationID.x);
+    int wy = int(gl_GlobalInvocationID.y);
+    int wz = int(gl_GlobalInvocationID.z);
+
+    if (wx >= p.outw || wy >= p.outh || wz >= p.outc)
+        return;
+
+    // support region_type == 1 only
+
+    // Matching position in the source blob; pad_head is removed on both axes.
+    int src_x = wx - pad_head;
+    int src_y = wy - pad_head;
+    bool inside = src_x >= 0 && src_x < p.w && src_y >= 0 && src_y < p.h;
+
+    // Border elements stay zero, interior elements become the lane-wise square.
+    afpvec8 sq = afpvec8(afpvec4(0.f), afpvec4(0.f));
+    if (inside)
+    {
+        int src = wz * p.cstep + src_y * p.w + src_x;
+        afpvec8 val = buffer_ld8(bottom_blob_data, src);
+        sq[0] = val[0] * val[0];
+        sq[1] = val[1] * val[1];
+    }
+
+    // The workspace element type is mat2x4, hence the explicit conversion.
+    const int dst = wz * p.outcstep + wy * p.outw + wx;
+
+    square_workspace_data[dst] = mat2x4(sq);
+}
diff --git a/src/layer/vulkan/shader/reshape_pack1to8.comp b/src/layer/vulkan/shader/reshape_pack1to8.comp
new file mode 100644
index 000000000..d0d381885
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack1to8.comp
@@ -0,0 +1,92 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// Reshape from an unpacked (scalar) blob into a blob packed 8 lanes per element.
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // eight half floats held as two f16vec4
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+// main() packs along x when ndim == 1, along y when ndim == 2 and along z when ndim == 3
+layout (constant_id = 0) const int ndim = 0;
+// workgroup size comes from specialization constants 233, 234 and 235
+layout (local_size_x_id = 233) in;
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+// scalar source blob (read only) and packed result (write only)
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; };
+// w, h and cstep address bottom_blob; the out* members bound and address top_blob
+layout (push_constant) uniform parameter
+{
+    int dims;
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outdims;
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
+} p;
+
+void main()
+{
+    // One invocation per packed output element.
+    int ox = int(gl_GlobalInvocationID.x);
+    int oy = int(gl_GlobalInvocationID.y);
+    int oz = int(gl_GlobalInvocationID.z);
+
+    if (ox >= p.outw || oy >= p.outh || oz >= p.outc)
+        return;
+
+    // Flat index of lane 0 in the unpacked output and the distance between two lanes;
+    // ndim selects the axis along which 8 consecutive entries are packed together.
+    int lane0;
+    int lane_step;
+    if (ndim == 1)
+    {
+        lane0 = ox * 8;
+        lane_step = 1;
+    }
+    if (ndim == 2)
+    {
+        lane0 = (oy * 8) * p.outw + ox;
+        lane_step = p.outw;
+    }
+    if (ndim == 3)
+    {
+        lane0 = (oz * 8) * p.outh * p.outw + oy * p.outw + ox;
+        lane_step = p.outh * p.outw;
+    }
+
+    ivec4 lo = lane0 + ivec4(0, 1, 2, 3) * lane_step; // lanes 0..3
+    ivec4 hi = lo + 4 * lane_step;                    // lanes 4..7
+
+    // Turn every flat index into a source offset: channel * cstep + row * w + column.
+    int plane = p.w * p.h;
+    ivec4 lo_rem = lo % plane;
+    ivec4 hi_rem = hi % plane;
+    ivec4 lo_src = (lo / plane) * p.cstep + (lo_rem / p.w) * p.w + lo_rem % p.w;
+    ivec4 hi_src = (hi / plane) * p.cstep + (hi_rem / p.w) * p.w + hi_rem % p.w;
+
+    // buffer_cp1to8 is defined outside this file; presumably it gathers the 8 source scalars.
+    int dst = oz * p.outcstep + oy * p.outw + ox;
+    buffer_cp1to8(top_blob_data, dst, bottom_blob_data, lo_src, hi_src);
+}
diff --git a/src/layer/vulkan/shader/reshape_pack4.comp b/src/layer/vulkan/shader/reshape_pack4.comp
index ff294d2ac..daee2ed61 100644
--- a/src/layer/vulkan/shader/reshape_pack4.comp
+++ b/src/layer/vulkan/shader/reshape_pack4.comp
@@ -27,7 +27,11 @@ layout (local_size_x_id = 233) in;
 layout (local_size_y_id = 234) in;
 layout (local_size_z_id = 235) in;
 
-layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; };
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; };
+#endif
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 
 layout (push_constant) uniform parameter
@@ -60,21 +64,22 @@ void main()
     if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw;
     if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw;
 
+#if NCNN_fp16_packed
     ivec4 v_offset;
-    ivec4 lane4;
+    ivec4 lane2;
 
     if (p.dims == 1)
     {
-        v_offset = i4 / 4;
-        lane4 = i4 % 4;
+        v_offset = i4 / 2;
+        lane2 = i4 % 2;
     }
     else if (p.dims == 2)
     {
         ivec4 y4 = i4 / p.w;
         ivec4 x4 = i4 % p.w;
 
-        v_offset = (y4 / 4) * p.w + x4;
-        lane4 = y4 % 4;
+        v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2;
+        lane2 = y4 % 2;
     }
     else // if (p.dims == 3)
     {
@@ -84,30 +89,55 @@ void main()
         ivec4 y4 = i4 % size / p.w;
         ivec4 x4 = i4 % size % p.w;
 
-        v_offset = (z4 / 4) * p.cstep + y4 * p.w + x4;
-        lane4 = z4 % 4;
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2;
+        lane2 = z4 % 2;
     }
 
     int gi;
-    if (ndim == 1)
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r);
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]);
+
+    buffer_st4(top_blob_data, gi, v);
+#else
+    ivec4 v_offset;
+
+    if (p.dims == 1)
     {
-        gi = gx;
+        v_offset = i4;
     }
-    if (ndim == 2)
+    else if (p.dims == 2)
     {
-        gi = gy * p.outw + gx;
+        ivec4 y4 = i4 / p.w;
+        ivec4 x4 = i4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4;
     }
-    if (ndim == 3)
+    else // if (p.dims == 3)
     {
-        gi = gz * p.outcstep + gy * p.outw + gx;
+        int size = p.w * p.h;
+
+        ivec4 z4 = i4 / size;
+        ivec4 y4 = i4 % size / p.w;
+        ivec4 x4 = i4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4;
     }
 
-    afpvec4 vr = buffer_ld4(bottom_blob_data, v_offset.r);
-    afpvec4 vg = buffer_ld4(bottom_blob_data, v_offset.g);
-    afpvec4 vb = buffer_ld4(bottom_blob_data, v_offset.b);
-    afpvec4 va = buffer_ld4(bottom_blob_data, v_offset.a);
+    int gi;
 
-    afpvec4 v = afpvec4(vr[lane4.r], vg[lane4.g], vb[lane4.b], va[lane4.a]);
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
 
-    buffer_st4(top_blob_data, gi, v);
+    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset);
+#endif
 }
diff --git a/src/layer/vulkan/shader/reshape_pack4to1.comp b/src/layer/vulkan/shader/reshape_pack4to1.comp
index 3644e91cb..8e5efd381 100644
--- a/src/layer/vulkan/shader/reshape_pack4to1.comp
+++ b/src/layer/vulkan/shader/reshape_pack4to1.comp
@@ -54,11 +54,15 @@ void main()
     if (gx >= p.w || gy >= p.h || gz >= p.c)
         return;
 
-    ivec4 i4;
+    ivec3 gxyz = ivec3(gx, gy, gz);
 
-    if (p.dims == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3);
-    if (p.dims == 2) i4 = (gy * 4) * p.w + gx + ivec4(0, 1, 2, 3) * p.w;
-    if (p.dims == 3) i4 = (gz * 4) * p.h * p.w + gy * p.w + gx + ivec4(0, 1, 2, 3) * p.h * p.w;
+    gxyz[p.dims - 1] *= 4;
+
+    int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x;
+
+    ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w);
+
+    ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1];
 
     ivec4 v_offset;
 
@@ -66,14 +70,14 @@ void main()
     {
         v_offset = i4;
     }
-    else if (ndim == 2)
+    if (ndim == 2)
     {
         ivec4 y4 = i4 / p.outw;
         ivec4 x4 = i4 % p.outw;
 
         v_offset = y4 * p.outw + x4;
     }
-    else // if (ndim == 3)
+    if (ndim == 3)
     {
         int size = p.outw * p.outh;
 
diff --git a/src/layer/vulkan/shader/reshape_pack4to8.comp b/src/layer/vulkan/shader/reshape_pack4to8.comp
new file mode 100644
index 000000000..4f3ff5fba
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack4to8.comp
@@ -0,0 +1,184 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// reshape shader interface: main() reads a source blob laid out in packs of 4 and writes packs of 8
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // 8 half values stored as two f16vec4 members
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // specialization constant; main() branches on 1/2/3 for the output-side index math
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization constants 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; // source indexed in sfpvec2 units (sfp* types are not declared in this file)
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // source indexed in scalar sfp units
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; // destination indexed in pack8 elements
+
+layout (push_constant) uniform parameter
+{
+    int dims; // source dimensionality; main() branches on it for the source-side index math
+    int w; // source width
+    int h; // source height
+    int c; // not read by main() in this shader
+    int cstep; // source stride between channel groups, in packed elements (main() multiplies by the pack size)
+
+    int outdims; // not read by main(); the ndim constant is used for output branching instead
+    int outw; // output extent along x, also the output row stride
+    int outh; // output extent along y
+    int outc; // output extent along z (upper bound for gz)
+    int outcstep; // output stride between gz slices, in pack8 elements
+} p;
+
+void main() // each invocation gathers 8 scalars from the pack4 source and writes one pack8 output element
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocations outside the output extent do nothing
+        return;
+
+    ivec4 i4; // flat unpacked indices of output lanes 0..3
+    ivec4 ii4; // flat unpacked indices of output lanes 4..7
+
+    if (ndim == 1)
+    {
+        i4 = gx * 8 + ivec4(0, 1, 2, 3); // the 8 lanes are 8 consecutive elements
+        ii4 = i4 + 4;
+    }
+    if (ndim == 2)
+    {
+        i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; // the 8 lanes are 8 consecutive rows (stride outw)
+        ii4 = i4 + 4 * p.outw;
+    }
+    if (ndim == 3)
+    {
+        i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; // the 8 lanes are 8 consecutive channels (stride outh * outw)
+        ii4 = i4 + 4 * p.outh * p.outw;
+    }
+
+#if NCNN_fp16_packed
+    ivec4 v_offset; // sfpvec2 indices that contain lanes 0..3
+    ivec4 vv_offset; // sfpvec2 indices that contain lanes 4..7
+    ivec4 lane2; // component (0 or 1) to pick from each loaded pair, lanes 0..3
+    ivec4 lane4; // same 0/1 selector for lanes 4..7 (despite the name it also holds % 2 values)
+
+    if (p.dims == 1)
+    {
+        v_offset = i4 / 2; // flat index -> sfpvec2 slot
+        lane2 = i4 % 2; // flat index -> component inside the slot
+        vv_offset = ii4 / 2;
+        lane4 = ii4 % 2;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 2 + (y4 % 4) / 2; // pack4 element index * 2 sfpvec2 per element + which pair inside it
+        lane2 = y4 % 2;
+        vv_offset = ((yy4 / 4) * p.w + xx4) * 2 + (yy4 % 4) / 2;
+        lane4 = yy4 % 2;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 2 + (z4 % 4) / 2; // channel group uses cstep as stride; 2 sfpvec2 per pack4 element
+        lane2 = z4 % 2;
+        vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 2 + (zz4 % 4) / 2;
+        lane4 = zz4 % 2;
+    }
+
+    int gi; // output index in pack8 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); // buffer_ld2 is defined outside this file; presumably loads one sfpvec2 entry as afpvec2
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
+    afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
+    afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
+    afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
+
+    afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); // keep only the selected component of each loaded pair
+
+    buffer_st8(top_blob_data, gi, v);
+#else
+    ivec4 v_offset; // scalar indices of lanes 0..3 in the source buffer
+    ivec4 vv_offset; // scalar indices of lanes 4..7 in the source buffer
+
+    if (p.dims == 1)
+    {
+        v_offset = i4; // flat index is used directly as the scalar offset
+        vv_offset = ii4;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 4) * p.w + x4) * 4 + y4 % 4; // pack4 element index * 4 scalars + lane inside the element
+        vv_offset = ((yy4 / 4) * p.w + xx4) * 4 + yy4 % 4;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 4) * p.cstep + y4 * p.w + x4) * 4 + z4 % 4; // channel group uses cstep as stride; lane is z4 % 4
+        vv_offset = ((zz4 / 4) * p.cstep + yy4 * p.w + xx4) * 4 + zz4 % 4;
+    }
+
+    int gi; // output index in pack8 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); // helper defined outside this file; presumably copies the 8 scalars at v_offset/vv_offset into element gi
+#endif
+}
diff --git a/src/layer/vulkan/shader/reshape_pack8.comp b/src/layer/vulkan/shader/reshape_pack8.comp
new file mode 100644
index 000000000..9002f38e0
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack8.comp
@@ -0,0 +1,184 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// reshape shader interface: main() reads a source blob laid out in packs of 8 and writes packs of 8
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // 8 half values stored as two f16vec4 members
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // specialization constant; main() branches on 1/2/3 for the output-side index math
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization constants 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; // source indexed in sfpvec2 units (sfp* types are not declared in this file)
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // source indexed in scalar sfp units
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec8 top_blob_data[]; }; // destination indexed in pack8 elements
+
+layout (push_constant) uniform parameter
+{
+    int dims; // source dimensionality; main() branches on it for the source-side index math
+    int w; // source width
+    int h; // source height
+    int c; // not read by main() in this shader
+    int cstep; // source stride between channel groups, in packed elements (main() multiplies by the pack size)
+
+    int outdims; // not read by main(); the ndim constant is used for output branching instead
+    int outw; // output extent along x, also the output row stride
+    int outh; // output extent along y
+    int outc; // output extent along z (upper bound for gz)
+    int outcstep; // output stride between gz slices, in pack8 elements
+} p;
+
+void main() // each invocation gathers 8 scalars from the pack8 source and writes one pack8 output element
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocations outside the output extent do nothing
+        return;
+
+    ivec4 i4; // flat unpacked indices of output lanes 0..3
+    ivec4 ii4; // flat unpacked indices of output lanes 4..7
+
+    if (ndim == 1)
+    {
+        i4 = gx * 8 + ivec4(0, 1, 2, 3); // the 8 lanes are 8 consecutive elements
+        ii4 = i4 + 4;
+    }
+    if (ndim == 2)
+    {
+        i4 = (gy * 8) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; // the 8 lanes are 8 consecutive rows (stride outw)
+        ii4 = i4 + 4 * p.outw;
+    }
+    if (ndim == 3)
+    {
+        i4 = (gz * 8) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; // the 8 lanes are 8 consecutive channels (stride outh * outw)
+        ii4 = i4 + 4 * p.outh * p.outw;
+    }
+
+#if NCNN_fp16_packed
+    ivec4 v_offset; // sfpvec2 indices that contain lanes 0..3
+    ivec4 vv_offset; // sfpvec2 indices that contain lanes 4..7
+    ivec4 lane2; // component (0 or 1) to pick from each loaded pair, lanes 0..3
+    ivec4 lane4; // same 0/1 selector for lanes 4..7 (despite the name it also holds % 2 values)
+
+    if (p.dims == 1)
+    {
+        v_offset = i4 / 2; // flat index -> sfpvec2 slot
+        lane2 = i4 % 2; // flat index -> component inside the slot
+        vv_offset = ii4 / 2;
+        lane4 = ii4 % 2;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2; // pack8 element index * 4 sfpvec2 per element + which pair inside it
+        lane2 = y4 % 2;
+        vv_offset = ((yy4 / 8) * p.w + xx4) * 4 + (yy4 % 8) / 2;
+        lane4 = yy4 % 2;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 4 + (z4 % 8) / 2; // channel group uses cstep as stride; 4 sfpvec2 per pack8 element
+        lane2 = z4 % 2;
+        vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 4 + (zz4 % 8) / 2;
+        lane4 = zz4 % 2;
+    }
+
+    int gi; // output index in pack8 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); // buffer_ld2 is defined outside this file; presumably loads one sfpvec2 entry as afpvec2
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec2 vvr = buffer_ld2(bottom_blob_data, vv_offset.r);
+    afpvec2 vvg = buffer_ld2(bottom_blob_data, vv_offset.g);
+    afpvec2 vvb = buffer_ld2(bottom_blob_data, vv_offset.b);
+    afpvec2 vva = buffer_ld2(bottom_blob_data, vv_offset.a);
+
+    afpvec8 v = afpvec8(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a], vvr[lane4.r], vvg[lane4.g], vvb[lane4.b], vva[lane4.a]); // keep only the selected component of each loaded pair
+
+    buffer_st8(top_blob_data, gi, v);
+#else
+    ivec4 v_offset; // scalar indices of lanes 0..3 in the source buffer
+    ivec4 vv_offset; // scalar indices of lanes 4..7 in the source buffer
+
+    if (p.dims == 1)
+    {
+        v_offset = i4; // flat index is used directly as the scalar offset
+        vv_offset = ii4;
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+        ivec4 yy4 = ii4 / p.w;
+        ivec4 xx4 = ii4 % p.w;
+
+        v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8; // pack8 element index * 8 scalars + lane inside the element
+        vv_offset = ((yy4 / 8) * p.w + xx4) * 8 + yy4 % 8;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.w;
+        ivec4 xx4 = ii4 % size % p.w;
+
+        v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 8 + z4 % 8; // channel group uses cstep as stride; lane is z4 % 8
+        vv_offset = ((zz4 / 8) * p.cstep + yy4 * p.w + xx4) * 8 + zz4 % 8;
+    }
+
+    int gi; // output index in pack8 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    buffer_cp1to8(top_blob_data, gi, bottom_blob_data, v_offset, vv_offset); // helper defined outside this file; presumably copies the 8 scalars at v_offset/vv_offset into element gi
+#endif
+}
diff --git a/src/layer/vulkan/shader/reshape_pack8to1.comp b/src/layer/vulkan/shader/reshape_pack8to1.comp
new file mode 100644
index 000000000..0307721a0
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack8to1.comp
@@ -0,0 +1,105 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// reshape shader interface: main() reads pack8 source elements and scatters their lanes as unpacked scalars
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // 8 half values stored as two f16vec4 members
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // specialization constant; main() branches on 1/2/3 for the destination offset math
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization constants 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+layout (binding = 0) readonly buffer bottom_blob { sfpvec8 bottom_blob_data[]; }; // source indexed in pack8 elements
+layout (binding = 1) writeonly buffer top_blob { sfp top_blob_data[]; }; // destination indexed in scalar sfp units (sfp is not declared in this file)
+
+layout (push_constant) uniform parameter
+{
+    int dims; // source dimensionality; selects which axis main() treats as packed
+    int w; // source extent along x
+    int h; // source extent along y
+    int c; // source extent along z (upper bound for gz)
+    int cstep; // source stride between gz slices, in pack8 elements
+
+    int outdims; // not read by main(); the ndim constant is used for destination branching instead
+    int outw; // destination width
+    int outh; // destination height
+    int outc; // not read by main() in this shader
+    int outcstep; // destination stride between channels, in scalars
+} p;
+
+void main() // each invocation takes one pack8 source element and scatters its 8 lanes to scalar destinations
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.w || gy >= p.h || gz >= p.c) // bounds are the source extent here, not the output extent
+        return;
+
+    ivec3 gxyz = ivec3(gx, gy, gz);
+
+    gxyz[p.dims - 1] *= 8; // the last source axis (per p.dims) is the packed one; convert it to unpacked units
+
+    int i4_0 = gxyz.z * p.h * p.w + gxyz.y * p.w + gxyz.x; // flat unpacked index of lane 0
+
+    ivec3 gxyz4 = ivec3(1, p.w, p.h * p.w); // flat stride of each axis; entry p.dims - 1 is the stride between lanes
+
+    ivec4 i4 = i4_0 + ivec4(0, 1, 2, 3) * gxyz4[p.dims - 1]; // flat indices of lanes 0..3
+    ivec4 ii4 = i4 + 4 * gxyz4[p.dims - 1]; // flat indices of lanes 4..7
+
+    ivec4 v_offset; // destination scalar offsets for lanes 0..3
+    ivec4 vv_offset; // destination scalar offsets for lanes 4..7
+
+    if (ndim == 1)
+    {
+        v_offset = i4; // flat index is used directly as the destination offset
+        vv_offset = ii4;
+    }
+    if (ndim == 2)
+    {
+        ivec4 y4 = i4 / p.outw; // destination row
+        ivec4 x4 = i4 % p.outw; // destination column
+        ivec4 yy4 = ii4 / p.outw;
+        ivec4 xx4 = ii4 % p.outw;
+
+        v_offset = y4 * p.outw + x4; // recomposed with row stride outw
+        vv_offset = yy4 * p.outw + xx4;
+    }
+    if (ndim == 3)
+    {
+        int size = p.outw * p.outh; // elements per destination channel
+
+        ivec4 z4 = i4 / size; // destination channel
+        ivec4 y4 = i4 % size / p.outw; // destination row
+        ivec4 x4 = i4 % size % p.outw; // destination column
+        ivec4 zz4 = ii4 / size;
+        ivec4 yy4 = ii4 % size / p.outw;
+        ivec4 xx4 = ii4 % size % p.outw;
+
+        v_offset = z4 * p.outcstep + y4 * p.outw + x4; // channel stride is outcstep rather than outw * outh
+        vv_offset = zz4 * p.outcstep + yy4 * p.outw + xx4;
+    }
+
+    int gi = gz * p.cstep + gy * p.w + gx; // source index in pack8 elements
+
+    buffer_cp8to1(top_blob_data, v_offset, vv_offset, bottom_blob_data, gi); // helper defined outside this file; presumably writes the 8 lanes of element gi to v_offset/vv_offset
+}
diff --git a/src/layer/vulkan/shader/reshape_pack8to4.comp b/src/layer/vulkan/shader/reshape_pack8to4.comp
new file mode 100644
index 000000000..e98feb988
--- /dev/null
+++ b/src/layer/vulkan/shader/reshape_pack8to4.comp
@@ -0,0 +1,144 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#version 450
+// reshape shader interface: main() reads a source blob laid out in packs of 8 and writes packs of 4
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_16bit_storage: require
+struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; // 8 half values as two f16vec4 members; not referenced elsewhere in this file
+#endif
+#if NCNN_fp16_arithmetic
+#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#endif
+
+layout (constant_id = 0) const int ndim = 0; // specialization constant; main() branches on 1/2/3 for the output-side index math
+
+layout (local_size_x_id = 233) in; // workgroup size is supplied through specialization constants 233..235
+layout (local_size_y_id = 234) in;
+layout (local_size_z_id = 235) in;
+
+#if NCNN_fp16_packed
+layout (binding = 0) readonly buffer bottom_blob { sfpvec2 bottom_blob_data[]; }; // source indexed in sfpvec2 units (sfp* types are not declared in this file)
+#else
+layout (binding = 0) readonly buffer bottom_blob { sfp bottom_blob_data[]; }; // source indexed in scalar sfp units
+#endif
+layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; // destination indexed in pack4 elements
+
+layout (push_constant) uniform parameter
+{
+    int dims; // source dimensionality; main() branches on it for the source-side index math
+    int w; // source width
+    int h; // source height
+    int c; // not read by main() in this shader
+    int cstep; // source stride between channel groups, in packed elements (main() multiplies by the pack size)
+
+    int outdims; // not read by main(); the ndim constant is used for output branching instead
+    int outw; // output extent along x, also the output row stride
+    int outh; // output extent along y
+    int outc; // output extent along z (upper bound for gz)
+    int outcstep; // output stride between gz slices, in pack4 elements
+} p;
+
+void main() // each invocation gathers 4 scalars from the pack8 source and writes one pack4 output element
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    int gz = int(gl_GlobalInvocationID.z);
+
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) // invocations outside the output extent do nothing
+        return;
+
+    ivec4 i4; // flat unpacked indices of the 4 output lanes
+
+    if (ndim == 1) i4 = gx * 4 + ivec4(0, 1, 2, 3); // lanes are 4 consecutive elements
+    if (ndim == 2) i4 = (gy * 4) * p.outw + gx + ivec4(0, 1, 2, 3) * p.outw; // lanes are 4 consecutive rows (stride outw)
+    if (ndim == 3) i4 = (gz * 4) * p.outh * p.outw + gy * p.outw + gx + ivec4(0, 1, 2, 3) * p.outh * p.outw; // lanes are 4 consecutive channels (stride outh * outw)
+
+#if NCNN_fp16_packed
+    ivec4 v_offset; // sfpvec2 indices that contain the 4 lanes
+    ivec4 lane2; // component (0 or 1) to pick from each loaded pair
+
+    if (p.dims == 1)
+    {
+        v_offset = i4 / 2; // flat index -> sfpvec2 slot
+        lane2 = i4 % 2; // flat index -> component inside the slot
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+
+        v_offset = ((y4 / 8) * p.w + x4) * 4 + (y4 % 8) / 2; // pack8 element index * 4 sfpvec2 per element + which pair inside it
+        lane2 = y4 % 2;
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+
+        v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 4 + (z4 % 8) / 2; // channel group uses cstep as stride; 4 sfpvec2 per pack8 element
+        lane2 = z4 % 2;
+    }
+
+    int gi; // output index in pack4 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    afpvec2 vr = buffer_ld2(bottom_blob_data, v_offset.r); // buffer_ld2 is defined outside this file; presumably loads one sfpvec2 entry as afpvec2
+    afpvec2 vg = buffer_ld2(bottom_blob_data, v_offset.g);
+    afpvec2 vb = buffer_ld2(bottom_blob_data, v_offset.b);
+    afpvec2 va = buffer_ld2(bottom_blob_data, v_offset.a);
+
+    afpvec4 v = afpvec4(vr[lane2.r], vg[lane2.g], vb[lane2.b], va[lane2.a]); // keep only the selected component of each loaded pair
+
+    buffer_st4(top_blob_data, gi, v);
+#else
+    ivec4 v_offset; // scalar indices of the 4 lanes in the source buffer
+
+    if (p.dims == 1)
+    {
+        v_offset = i4; // flat index is used directly as the scalar offset
+    }
+    else if (p.dims == 2)
+    {
+        ivec4 y4 = i4 / p.w; // source row
+        ivec4 x4 = i4 % p.w; // source column
+
+        v_offset = ((y4 / 8) * p.w + x4) * 8 + y4 % 8; // pack8 element index * 8 scalars + lane inside the element
+    }
+    else // if (p.dims == 3)
+    {
+        int size = p.w * p.h; // unpacked elements per source channel
+
+        ivec4 z4 = i4 / size; // source channel
+        ivec4 y4 = i4 % size / p.w; // source row
+        ivec4 x4 = i4 % size % p.w; // source column
+
+        v_offset = ((z4 / 8) * p.cstep + y4 * p.w + x4) * 8 + z4 % 8; // channel group uses cstep as stride; lane is z4 % 8
+    }
+
+    int gi; // output index in pack4 elements
+
+    if (ndim == 1) gi = gx;
+    if (ndim == 2) gi = gy * p.outw + gx;
+    if (ndim == 3) gi = gz * p.outcstep + gy * p.outw + gx;
+
+    buffer_cp1to4(top_blob_data, gi, bottom_blob_data, v_offset); // helper defined outside this file; presumably copies the 4 scalars at v_offset into element gi
+#endif
+}