From 4302f78f55167bfb02fa50bbe421f99660313cea Mon Sep 17 00:00:00 2001
From: nihui <nihuini@tencent.com>
Date: Sun, 27 Mar 2022 17:47:31 +0800
Subject: [PATCH] less specialization constant for vulkan conv1x1s1d1 shaders
 (#3657)

---
 src/layer/vulkan/convolution_vulkan.cpp       | 414 ++++++++++--------
 .../vulkan/shader/convolution_1x1s1d1.comp    |  64 ++-
 .../shader/convolution_pack1to4_1x1s1d1.comp  |  83 ++--
 .../shader/convolution_pack1to8_1x1s1d1.comp  |  69 ++-
 .../shader/convolution_pack4_1x1s1d1.comp     | 101 ++---
 .../shader/convolution_pack4to1_1x1s1d1.comp  |  83 ++--
 .../shader/convolution_pack4to8_1x1s1d1.comp  |  79 ++--
 .../shader/convolution_pack8_1x1s1d1.comp     |  79 ++--
 .../shader/convolution_pack8to1_1x1s1d1.comp  |  69 ++-
 .../shader/convolution_pack8to4_1x1s1d1.comp  |  75 ++--
 10 files changed, 520 insertions(+), 596 deletions(-)

diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index 1305abec4..e7facb37a 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -172,52 +172,6 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
         padding->create_pipeline(opt);
     }
 
-    std::vector<vk_specialization_type> specializations(10 + 10);
-    specializations[0].i = kernel_w;
-    specializations[1].i = kernel_h;
-    specializations[2].i = dilation_w;
-    specializations[3].i = dilation_h;
-    specializations[4].i = stride_w;
-    specializations[5].i = stride_h;
-    specializations[6].i = bias_term;
-    specializations[7].i = activation_type;
-    specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
-    specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
-    specializations[10 + 0].i = shape_bordered_packed.dims;
-    specializations[10 + 1].i = shape_bordered_packed.w;
-    specializations[10 + 2].i = shape_bordered_packed.h;
-    specializations[10 + 3].i = shape_bordered_packed.c;
-    specializations[10 + 4].i = shape_bordered_packed.cstep;
-    specializations[10 + 5].i = out_shape_packed.dims;
-    specializations[10 + 6].i = out_shape_packed.w;
-    specializations[10 + 7].i = out_shape_packed.h;
-    specializations[10 + 8].i = out_shape_packed.c;
-    specializations[10 + 9].i = out_shape_packed.cstep;
-
-    if (is_conv1x1s1d1)
-    {
-        int shader_type_index = -1;
-        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
-        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
-        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
-        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
-        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
-        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
-        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
-        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
-        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;
-
-        pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
-        if (opt.use_shader_local_memory)
-        {
-            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8);
-        }
-        else
-        {
-            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack));
-        }
-        pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
-    }
     if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
     {
         // winograd43
@@ -477,59 +431,117 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
             opt.use_image_storage = false;
         }
 
+        std::vector<vk_specialization_type> specializations(10 + 8);
+        specializations[0].i = kernel_w;
+        specializations[1].i = kernel_h;
+        specializations[2].i = dilation_w;
+        specializations[3].i = dilation_h;
+        specializations[4].i = stride_w;
+        specializations[5].i = stride_h;
+        specializations[6].i = bias_term;
+        specializations[7].i = activation_type;
+        specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[10 + 0].i = shape_bordered_packed.w;
+        specializations[10 + 1].i = shape_bordered_packed.h;
+        specializations[10 + 2].i = shape_bordered_packed.c;
+        specializations[10 + 3].i = shape_bordered_packed.cstep;
+        specializations[10 + 4].i = out_shape_packed.w;
+        specializations[10 + 5].i = out_shape_packed.h;
+        specializations[10 + 6].i = out_shape_packed.c;
+        specializations[10 + 7].i = out_shape_packed.cstep;
+
+        Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
+        if (out_shape_packed.dims != 0)
         {
-            std::vector<vk_specialization_type> specializations(10 + 8);
-            specializations[0].i = kernel_w;
-            specializations[1].i = kernel_h;
-            specializations[2].i = dilation_w;
-            specializations[3].i = dilation_h;
-            specializations[4].i = stride_w;
-            specializations[5].i = stride_h;
-            specializations[6].i = bias_term;
-            specializations[7].i = activation_type;
-            specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
-            specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
-            specializations[10 + 0].i = shape_bordered_packed.w;
-            specializations[10 + 1].i = shape_bordered_packed.h;
-            specializations[10 + 2].i = shape_bordered_packed.c;
-            specializations[10 + 3].i = shape_bordered_packed.cstep;
-            specializations[10 + 4].i = out_shape_packed.w;
-            specializations[10 + 5].i = out_shape_packed.h;
-            specializations[10 + 6].i = out_shape_packed.c;
-            specializations[10 + 7].i = out_shape_packed.cstep;
-
-            Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
-            if (out_shape_packed.dims != 0)
-            {
-                local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
-                local_size_xyz.h = std::min(4, out_shape_packed.c);
-            }
+            local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
+            local_size_xyz.h = std::min(4, out_shape_packed.c);
+        }
 
-            int shader_type_index = -1;
-            if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
-            if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
-            if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
-            if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
-            if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
-            if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
-            if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
-            if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
-            if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;
-
-            pipeline_convolution_gemm = new Pipeline(vkdev);
-            if (opt.use_shader_local_memory)
-            {
-                pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
-            }
-            else
-            {
-                pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
-            }
-            pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
+        int shader_type_index = -1;
+        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
+        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
+        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
+        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
+        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
+        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
+        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
+        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
+        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;
+
+        pipeline_convolution_gemm = new Pipeline(vkdev);
+        if (opt.use_shader_local_memory)
+        {
+            pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
+        }
+        else
+        {
+            pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
+        }
+        pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
+    }
+    if (is_conv1x1s1d1)
+    {
+        std::vector<vk_specialization_type> specializations(4 + 8);
+        specializations[0].i = bias_term;
+        specializations[1].i = activation_type;
+        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[4 + 0].i = shape_bordered_packed.w;
+        specializations[4 + 1].i = shape_bordered_packed.h;
+        specializations[4 + 2].i = shape_bordered_packed.c;
+        specializations[4 + 3].i = shape_bordered_packed.cstep;
+        specializations[4 + 4].i = out_shape_packed.w;
+        specializations[4 + 5].i = out_shape_packed.h;
+        specializations[4 + 6].i = out_shape_packed.c;
+        specializations[4 + 7].i = out_shape_packed.cstep;
+
+        int shader_type_index = -1;
+        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
+        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
+        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
+        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
+        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
+        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
+        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
+        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
+        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;
+
+        pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
+        if (opt.use_shader_local_memory)
+        {
+            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1);
+        }
+        else
+        {
+            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, std::min(8, num_output / out_elempack), 1);
         }
+        pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
     }
     else
     {
+        std::vector<vk_specialization_type> specializations(10 + 10);
+        specializations[0].i = kernel_w;
+        specializations[1].i = kernel_h;
+        specializations[2].i = dilation_w;
+        specializations[3].i = dilation_h;
+        specializations[4].i = stride_w;
+        specializations[5].i = stride_h;
+        specializations[6].i = bias_term;
+        specializations[7].i = activation_type;
+        specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
+        specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
+        specializations[10 + 0].i = shape_bordered_packed.dims;
+        specializations[10 + 1].i = shape_bordered_packed.w;
+        specializations[10 + 2].i = shape_bordered_packed.h;
+        specializations[10 + 3].i = shape_bordered_packed.c;
+        specializations[10 + 4].i = shape_bordered_packed.cstep;
+        specializations[10 + 5].i = out_shape_packed.dims;
+        specializations[10 + 6].i = out_shape_packed.w;
+        specializations[10 + 7].i = out_shape_packed.h;
+        specializations[10 + 8].i = out_shape_packed.c;
+        specializations[10 + 9].i = out_shape_packed.cstep;
+
         Mat local_size_xyz(8, 8, std::min(4, (num_output / out_elempack + 1) / 2), (void*)0);
         if (out_shape_packed.dims != 0)
         {
@@ -1194,34 +1206,63 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
     if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
     {
         // gemm
-        {
-            top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
-            if (top_blob.empty())
-                return -100;
-
-            std::vector<VkMat> bindings(4);
-            bindings[0] = bottom_blob_bordered;
-            bindings[1] = top_blob;
-            bindings[2] = weight_data_gpu;
-            bindings[3] = bias_data_gpu;
-
-            std::vector<vk_constant_type> constants(8);
-            constants[0].i = bottom_blob_bordered.w;
-            constants[1].i = bottom_blob_bordered.h;
-            constants[2].i = bottom_blob_bordered.c;
-            constants[3].i = bottom_blob_bordered.cstep;
-            constants[4].i = top_blob.w;
-            constants[5].i = top_blob.h;
-            constants[6].i = top_blob.c;
-            constants[7].i = top_blob.cstep;
-
-            VkMat dispatcher;
-            dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
-            dispatcher.h = top_blob.c;
-            dispatcher.c = 1;
-
-            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
-        }
+        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(4);
+        bindings[0] = bottom_blob_bordered;
+        bindings[1] = top_blob;
+        bindings[2] = weight_data_gpu;
+        bindings[3] = bias_data_gpu;
+
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = bottom_blob_bordered.w;
+        constants[1].i = bottom_blob_bordered.h;
+        constants[2].i = bottom_blob_bordered.c;
+        constants[3].i = bottom_blob_bordered.cstep;
+        constants[4].i = top_blob.w;
+        constants[5].i = top_blob.h;
+        constants[6].i = top_blob.c;
+        constants[7].i = top_blob.cstep;
+
+        VkMat dispatcher;
+        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
+        dispatcher.h = top_blob.c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
+
+        return 0;
+    }
+    if (is_conv1x1s1d1)
+    {
+        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkMat> bindings(4);
+        bindings[0] = bottom_blob_bordered;
+        bindings[1] = top_blob;
+        bindings[2] = weight_data_gpu;
+        bindings[3] = bias_data_gpu;
+
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = bottom_blob_bordered.w;
+        constants[1].i = bottom_blob_bordered.h;
+        constants[2].i = bottom_blob_bordered.c;
+        constants[3].i = bottom_blob_bordered.cstep;
+        constants[4].i = top_blob.w;
+        constants[5].i = top_blob.h;
+        constants[6].i = top_blob.c;
+        constants[7].i = top_blob.cstep;
+
+        VkMat dispatcher;
+        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
+        dispatcher.h = top_blob.c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
 
         return 0;
     }
@@ -1248,25 +1289,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
     constants[8].i = top_blob.c;
     constants[9].i = top_blob.cstep;
 
-    // record
-    if (is_conv1x1s1d1)
-    {
-        VkMat dispatcher;
-        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
-        dispatcher.h = 1;
-        dispatcher.c = top_blob.c;
-
-        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
-    }
-    else
-    {
-        VkMat dispatcher;
-        dispatcher.w = (top_blob.w + 1) / 2;
-        dispatcher.h = (top_blob.h + 1) / 2;
-        dispatcher.c = (top_blob.c + 1) / 2;
+    VkMat dispatcher;
+    dispatcher.w = (top_blob.w + 1) / 2;
+    dispatcher.h = (top_blob.h + 1) / 2;
+    dispatcher.c = (top_blob.c + 1) / 2;
 
-        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
-    }
+    cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
 
     return 0;
 }
@@ -1567,34 +1595,63 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
     if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
     {
         // gemm
-        {
-            top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
-            if (top_blob.empty())
-                return -100;
-
-            std::vector<VkImageMat> bindings(4);
-            bindings[0] = bottom_blob_bordered;
-            bindings[1] = top_blob;
-            bindings[2] = weight_data_gpu_image;
-            bindings[3] = bias_data_gpu_image;
-
-            std::vector<vk_constant_type> constants(8);
-            constants[0].i = bottom_blob_bordered.w;
-            constants[1].i = bottom_blob_bordered.h;
-            constants[2].i = bottom_blob_bordered.c;
-            constants[3].i = 0; // bottom_blob_bordered.cstep;
-            constants[4].i = top_blob.w;
-            constants[5].i = top_blob.h;
-            constants[6].i = top_blob.c;
-            constants[7].i = 0; // top_blob.cstep;
-
-            VkImageMat dispatcher;
-            dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
-            dispatcher.h = top_blob.c;
-            dispatcher.c = 1;
-
-            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
-        }
+        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkImageMat> bindings(4);
+        bindings[0] = bottom_blob_bordered;
+        bindings[1] = top_blob;
+        bindings[2] = weight_data_gpu_image;
+        bindings[3] = bias_data_gpu_image;
+
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = bottom_blob_bordered.w;
+        constants[1].i = bottom_blob_bordered.h;
+        constants[2].i = bottom_blob_bordered.c;
+        constants[3].i = 0; // bottom_blob_bordered.cstep;
+        constants[4].i = top_blob.w;
+        constants[5].i = top_blob.h;
+        constants[6].i = top_blob.c;
+        constants[7].i = 0; // top_blob.cstep;
+
+        VkImageMat dispatcher;
+        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
+        dispatcher.h = top_blob.c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
+
+        return 0;
+    }
+    if (is_conv1x1s1d1)
+    {
+        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
+        if (top_blob.empty())
+            return -100;
+
+        std::vector<VkImageMat> bindings(4);
+        bindings[0] = bottom_blob_bordered;
+        bindings[1] = top_blob;
+        bindings[2] = weight_data_gpu_image;
+        bindings[3] = bias_data_gpu_image;
+
+        std::vector<vk_constant_type> constants(8);
+        constants[0].i = bottom_blob_bordered.w;
+        constants[1].i = bottom_blob_bordered.h;
+        constants[2].i = bottom_blob_bordered.c;
+        constants[3].i = 0; // bottom_blob_bordered.cstep;
+        constants[4].i = top_blob.w;
+        constants[5].i = top_blob.h;
+        constants[6].i = top_blob.c;
+        constants[7].i = 0; // top_blob.cstep;
+
+        VkImageMat dispatcher;
+        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
+        dispatcher.h = top_blob.c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
 
         return 0;
     }
@@ -1621,25 +1678,12 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
     constants[8].i = top_blob.c;
     constants[9].i = 0; //top_blob.cstep;
 
-    // record
-    if (is_conv1x1s1d1)
-    {
-        VkImageMat dispatcher;
-        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
-        dispatcher.h = 1;
-        dispatcher.c = top_blob.c;
-
-        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
-    }
-    else
-    {
-        VkImageMat dispatcher;
-        dispatcher.w = (top_blob.w + 1) / 2;
-        dispatcher.h = (top_blob.h + 1) / 2;
-        dispatcher.c = (top_blob.c + 1) / 2;
+    VkImageMat dispatcher;
+    dispatcher.w = (top_blob.w + 1) / 2;
+    dispatcher.h = (top_blob.h + 1) / 2;
+    dispatcher.c = (top_blob.c + 1) / 2;
 
-        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
-    }
+    cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
 
     return 0;
 }
diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_1x1s1d1.comp
index f34474331..8b287de98 100644
--- a/src/layer/vulkan/shader/convolution_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_1x1s1d1.comp
@@ -21,29 +21,21 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0;
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -64,13 +56,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -82,16 +72,14 @@ void main()
 #if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
     int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx * 4 >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -100,9 +88,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        sum = afpvec4(image3d_ld1(bias_blob, ivec3(gz, 0, 0)));
+        sum = afpvec4(image3d_ld1(bias_blob, ivec3(gy, 0, 0)));
 #else
-        sum = afpvec4(buffer_ld1(bias_data, gz));
+        sum = afpvec4(buffer_ld1(bias_data, gy));
 #endif
     }
     else
@@ -118,7 +106,7 @@ void main()
 
     for (int z = 0; z < psc(c); z++)
     {
-        afp k = image3d_ld1(weight_blob, ivec3(0, z, gz));
+        afp k = image3d_ld1(weight_blob, ivec3(0, z, gy));
 
         sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z));
         sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z));
@@ -126,7 +114,7 @@ void main()
         sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));
     }
 #else
-    int w_offset = gz * psc(c);
+    int w_offset = gy * psc(c);
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -174,12 +162,12 @@ void main()
     
 
 #if NCNN_image_shader
-    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r);
-    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g);
-    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b);
-    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a);
+    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r);
+    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum.g);
+    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum.b);
+    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum.a);
 #else
-    const int gi = gz * psc(outcstep) / 4 + gx;
+    const int gi = gy * psc(outcstep) / 4 + gx;
 
 #if NCNN_fp16_packed
     top_blob_data[gi] = sum;
diff --git a/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
index 3acc66311..f9028f7e2 100644
--- a/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
@@ -23,29 +23,21 @@
 
 #define LOCAL_MEMORY_UNROLL_INCH 8
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0;
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -83,14 +73,13 @@ void main()
 {
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
 #if NCNN_image_shader
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
 #if !NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
+        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec4 b = buffer_ld4(bias_data, gz);
+        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -133,7 +122,7 @@ void main()
         afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
+        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));
 
         sum0 += v0 * k;
         sum1 += v1 * k;
@@ -141,21 +130,21 @@ void main()
         sum3 += v3 * k;
     }
 #else
-    int w_offset = gz * psc(c);
+    int w_offset = gy * psc(c);
     int v_offset = gx;
 
 #if NCNN_shader_local_memory
     const int lx = int(gl_LocalInvocationID.x);
-    const int lz = int(gl_LocalInvocationID.z);
+    const int ly = int(gl_LocalInvocationID.y);
 
     int z = 0;
     for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
     {
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -163,7 +152,7 @@ void main()
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
+                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
             }
         }
 
@@ -176,7 +165,7 @@ void main()
             afp v2 = lfp2afp(tmp_v[lx][z4][2]);
             afp v3 = lfp2afp(tmp_v[lx][z4][3]);
 
-            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
+            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);
 
             sum0 += v0 * k;
             sum1 += v1 * k;
@@ -194,11 +183,11 @@ void main()
     {
         const int remain = psc(c) - z;
 
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -206,7 +195,7 @@ void main()
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
+                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
             }
         }
 
@@ -219,7 +208,7 @@ void main()
             afp v2 = lfp2afp(tmp_v[lx][z4][2]);
             afp v3 = lfp2afp(tmp_v[lx][z4][3]);
 
-            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
+            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);
 
             sum0 += v0 * k;
             sum1 += v1 * k;
@@ -297,17 +286,17 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st4(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
index 50e743f2f..847d37091 100644
--- a/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0; // main() preloads the sums from the bias only when this is 1
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+// ids 0..3 are the option constants above, so the shape constants below are numbered from 4.
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0; // NOTE(review): psc() is defined outside this shader; presumably it chooses between these constants and the same-named push-constant fields - confirm
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0; // psc(c) bounds the z loop and scales w_offset (gy * psc(c)) in main()
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+// ids 8..11: main() returns early when gx is past outw * outh (image path) or outcstep (buffer path), or gy is past outc.
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; // stride in the buffer store index: gi = gy * psc(outcstep) + gx
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter
 
 void main()
 {
-#if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+#if NCNN_image_shader
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
-    int gx = int(gl_GlobalInvocationID.x) * 4;
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -99,9 +84,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
+        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec8 b = buffer_ld8(bias_data, gz);
+        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -129,7 +114,7 @@ void main()
         afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
+        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));
 
         // sum += v * k;
         sum0[0] += v0 * k[0];
@@ -145,7 +130,7 @@ void main()
         sum3[1] += v3 * k[1];
     }
 #else
-    int w_offset = gz * psc(c);
+    int w_offset = gy * psc(c);
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -248,12 +233,12 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st8(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
index df5e2e4b0..e377191f4 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
@@ -23,29 +23,21 @@
 
 #define LOCAL_MEMORY_UNROLL_INCH 8
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0; // main() preloads the sums from the bias only when this is 1
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+// ids 0..3 are the option constants above, so the shape constants below are numbered from 4.
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0; // NOTE(review): psc() is defined outside this shader; presumably it chooses between these constants and the same-named push-constant fields - confirm
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0; // psc(c) bounds the z loops and scales w_offset (gy * psc(c) * 4) in main()
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; // per-z4 stride when the local-memory path reads bottom_blob_data
+// ids 8..11: main() returns when gx is past outw * outh (image path) or outcstep (buffer path), or gy is past outc.
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; // stride in the buffer store index: gi = gy * psc(outcstep) + gx
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -83,14 +73,13 @@ void main()
 {
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
 #if NCNN_image_shader
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
 #if !NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
+        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec4 b = buffer_ld4(bias_data, gz);
+        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -134,10 +123,10 @@ void main()
         afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
         afpmat4 k = afpmat4(
-            image3d_ld4(weight_blob, ivec3(0, z, gz)),
-            image3d_ld4(weight_blob, ivec3(1, z, gz)),
-            image3d_ld4(weight_blob, ivec3(2, z, gz)),
-            image3d_ld4(weight_blob, ivec3(3, z, gz))
+            image3d_ld4(weight_blob, ivec3(0, z, gy)),
+            image3d_ld4(weight_blob, ivec3(1, z, gy)),
+            image3d_ld4(weight_blob, ivec3(2, z, gy)),
+            image3d_ld4(weight_blob, ivec3(3, z, gy))
         );
 
         sum0 += v0 * k;
@@ -146,21 +135,21 @@ void main()
         sum3 += v3 * k;
     }
 #else
-    int w_offset = gz * psc(c) * 4;
+    int w_offset = gy * psc(c) * 4;
     int v_offset = gx;
 
 #if NCNN_shader_local_memory
     const int lx = int(gl_LocalInvocationID.x);
-    const int lz = int(gl_LocalInvocationID.z);
+    const int ly = int(gl_LocalInvocationID.y);
 
     int z = 0;
     for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
     {
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -168,7 +157,7 @@ void main()
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
+                tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
             }
         }
 
@@ -181,10 +170,10 @@ void main()
             afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
             afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);
 
-            afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
-            afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
-            afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
-            afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
+            afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
+            afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
+            afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
+            afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);
 
             afpmat4 k = afpmat4(k0, k1, k2, k3);
 
@@ -204,11 +193,11 @@ void main()
     {
         const int remain = psc(c) - z;
 
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -216,7 +205,7 @@ void main()
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
+                tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
             }
         }
 
@@ -229,10 +218,10 @@ void main()
             afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
             afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);
 
-            afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
-            afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
-            afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
-            afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
+            afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
+            afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
+            afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
+            afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);
 
             afpmat4 k = afpmat4(k0, k1, k2, k3);
 
@@ -317,17 +306,17 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st4(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
index 636a3c258..b040af642 100644
--- a/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
@@ -23,29 +23,21 @@
 
 #define LOCAL_MEMORY_UNROLL_INCH 8
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0; // main() preloads the sums from the bias only when this is 1
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+// ids 0..3 are the option constants above, so the shape constants below are numbered from 4.
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0; // NOTE(review): psc() is defined outside this shader; presumably it chooses between these constants and the same-named push-constant fields - confirm
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0; // psc(c) bounds the z loops and scales w_offset (gy * psc(c)) in main()
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; // per-z4 stride when the local-memory path reads bottom_blob_data
+// ids 8..11: main() returns when gx is past outw * outh (image path) or outcstep (buffer path), or gy is past outc.
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; // stride in the buffer store index: gi = gy * psc(outcstep) + gx
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -83,14 +73,13 @@ void main()
 {
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
 #if NCNN_image_shader
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
 #if !NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
+        afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
 #else
-        afp b = buffer_ld1(bias_data, gz);
+        afp b = buffer_ld1(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -133,7 +122,7 @@ void main()
         afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
+        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));
 
         sum0 += dot(v0, k);
         sum1 += dot(v1, k);
@@ -141,21 +130,21 @@ void main()
         sum3 += dot(v3, k);
     }
 #else
-    int w_offset = gz * psc(c);
+    int w_offset = gy * psc(c);
     int v_offset = gx;
 
 #if NCNN_shader_local_memory
     const int lx = int(gl_LocalInvocationID.x);
-    const int lz = int(gl_LocalInvocationID.z);
+    const int ly = int(gl_LocalInvocationID.y);
 
     int z = 0;
     for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
     {
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -163,7 +152,7 @@ void main()
         {
             for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
             {
-                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
+                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
             }
         }
 
@@ -176,7 +165,7 @@ void main()
             afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
             afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);
 
-            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
+            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);
 
             sum0 += dot(v0, k);
             sum1 += dot(v1, k);
@@ -194,11 +183,11 @@ void main()
     {
         const int remain = psc(c) - z;
 
-        if (lz < 4)
+        if (ly < 4)
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
+                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
             }
         }
 
@@ -206,7 +195,7 @@ void main()
         {
             for (int z4 = 0; z4 < remain; z4++)
             {
-                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
+                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
             }
         }
 
@@ -219,7 +208,7 @@ void main()
             afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
             afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);
 
-            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
+            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);
 
             sum0 += dot(v0, k);
             sum1 += dot(v1, k);
@@ -297,17 +286,17 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st1(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
index e8937d531..8c2375866 100644
--- a/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
+layout (constant_id = 0) const int bias_term = 0; // main() preloads the sums from the bias only when this is 1
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0; // last option constant; shape ids start at 4 below
 
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0; // NOTE(review): psc() is defined outside this shader; presumably it chooses between these constants and the same-named push-constant fields - confirm
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0; // psc(c) bounds the z loop and scales w_offset (gy * psc(c) * 8) in main()
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
 
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; // with outh: image-path bound, gx >= psc(outw) * psc(outh) returns
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; // gy >= psc(outc) returns early in main()
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; // stride in the buffer store index: gi = gy * psc(outcstep) + gx
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter
 
 void main()
 {
-#if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+#if NCNN_image_shader
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
-    int gx = int(gl_GlobalInvocationID.x) * 4;
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -99,9 +84,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
+        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec8 b = buffer_ld8(bias_data, gz);
+        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -129,14 +114,14 @@ void main()
         afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gz));
-        afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gz));
-        afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gz));
-        afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gz));
-        afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gz));
-        afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gz));
-        afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gz));
-        afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gz));
+        afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gy));
+        afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gy));
+        afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gy));
+        afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gy));
+        afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gy));
+        afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gy));
+        afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gy));
+        afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gy));
 
         // sum += v * k;
         sum0[0].r += dot(v0, k0);
@@ -176,7 +161,7 @@ void main()
         sum3[1].a += dot(v3, k7);
     }
 #else
-    int w_offset = gz * psc(c) * 8;
+    int w_offset = gy * psc(c) * 8;
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st8(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
index 755bd4014..e74bc8424 100644
--- a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
+layout (constant_id = 0) const int bias_term = 0;
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
 
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
 
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter
 
 void main()
 {
-#if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+#if NCNN_image_shader
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
-    int gx = int(gl_GlobalInvocationID.x) * 4;
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -99,9 +84,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
+        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec8 b = buffer_ld8(bias_data, gz);
+        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -129,14 +114,14 @@ void main()
         afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
-        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
-        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
-        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
-        afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz));
-        afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz));
-        afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz));
-        afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz));
+        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
+        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
+        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
+        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));
+        afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gy));
+        afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gy));
+        afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gy));
+        afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gy));
 
         // sum += v * k
         sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -176,7 +161,7 @@ void main()
         sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
     }
 #else
-    int w_offset = gz * psc(c) * 8;
+    int w_offset = gy * psc(c) * 8;
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st8(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
index 0c6bf958b..f26c4d8ca 100644
--- a/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0;
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter
 
 void main()
 {
-#if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+#if NCNN_image_shader
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
-    int gx = int(gl_GlobalInvocationID.x) * 4;
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -99,9 +84,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
+        afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
 #else
-        afp b = buffer_ld1(bias_data, gz);
+        afp b = buffer_ld1(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -129,7 +114,7 @@ void main()
         afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
+        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));
 
         // sum += dot(v, k);
         sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
@@ -138,7 +123,7 @@ void main()
         sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);
     }
 #else
-    int w_offset = gz * psc(c);
+    int w_offset = gy * psc(c);
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -210,12 +195,12 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st1(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);
diff --git a/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
index 5573c36bd..0803a9fd5 100644
--- a/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif
 
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
+layout (constant_id = 0) const int bias_term = 0;
+layout (constant_id = 1) const int activation_type = 0;
+layout (constant_id = 2) const float activation_param_0 = 0;
+layout (constant_id = 3) const float activation_param_1 = 0;
+
+#define shape_constant_id_offset 4
+layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
+layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
+layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
+layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
+
+layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
+layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
+layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
+layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
 
 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
 
 layout (push_constant) uniform parameter
 {
-    int dims;
     int w;
     int h;
     int c;
     int cstep;
 
-    int outdims;
     int outw;
     int outh;
     int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter
 
 void main()
 {
-#if NCNN_image_shader
     int gx = int(gl_GlobalInvocationID.x) * 4;
     int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
+#if NCNN_image_shader
+    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
         return;
 #else
-    int gx = int(gl_GlobalInvocationID.x) * 4;
-    int gy = int(gl_GlobalInvocationID.y);
-    int gz = int(gl_GlobalInvocationID.z);
-
-    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
+    if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 #endif
 
@@ -99,9 +84,9 @@ void main()
     if (bias_term == 1)
     {
 #if NCNN_image_shader
-        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
+        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
-        afpvec4 b = buffer_ld4(bias_data, gz);
+        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
         sum0 = b;
         sum1 = b;
@@ -129,10 +114,10 @@ void main()
         afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
         afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));
 
-        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
-        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
-        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
-        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
+        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
+        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
+        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
+        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));
 
         // sum += v * k;
         sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -156,7 +141,7 @@ void main()
         sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
     }
 #else
-    int w_offset = gz * psc(c) * 4;
+    int w_offset = gy * psc(c) * 4;
     int v_offset = gx;
 
     for (int z = 0; z < psc(c); z++)
@@ -246,12 +231,12 @@ void main()
     }
 
 #if NCNN_image_shader
-    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
-    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
-    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
-    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
+    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
+    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
+    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
+    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
-    int gi = gz * psc(outcstep) + gx;
+    int gi = gy * psc(outcstep) + gx;
 
     buffer_st4(top_blob_data, gi + 0, sum0);
     if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);