Use fewer specialization constants for Vulkan conv1x1s1d1 shaders (#3657)

4 years ago · 4302f78f55
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -172,52 +172,6 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
        padding->create_pipeline(opt);
    }

    std::vector<vk_specialization_type> specializations(10 + 10);
    specializations[0].i = kernel_w;
    specializations[1].i = kernel_h;
    specializations[2].i = dilation_w;
    specializations[3].i = dilation_h;
    specializations[4].i = stride_w;
    specializations[5].i = stride_h;
    specializations[6].i = bias_term;
    specializations[7].i = activation_type;
    specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
    specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
    specializations[10 + 0].i = shape_bordered_packed.dims;
    specializations[10 + 1].i = shape_bordered_packed.w;
    specializations[10 + 2].i = shape_bordered_packed.h;
    specializations[10 + 3].i = shape_bordered_packed.c;
    specializations[10 + 4].i = shape_bordered_packed.cstep;
    specializations[10 + 5].i = out_shape_packed.dims;
    specializations[10 + 6].i = out_shape_packed.w;
    specializations[10 + 7].i = out_shape_packed.h;
    specializations[10 + 8].i = out_shape_packed.c;
    specializations[10 + 9].i = out_shape_packed.cstep;

    if (is_conv1x1s1d1)
    {
        int shader_type_index = -1;
        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;

        pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
        if (opt.use_shader_local_memory)
        {
            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8);
        }
        else
        {
            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack));
        }
        pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
    }
    if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
    {
        // winograd43
@@ -477,59 +431,117 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
            opt.use_image_storage = false;
        }

        std::vector<vk_specialization_type> specializations(10 + 8);
        specializations[0].i = kernel_w;
        specializations[1].i = kernel_h;
        specializations[2].i = dilation_w;
        specializations[3].i = dilation_h;
        specializations[4].i = stride_w;
        specializations[5].i = stride_h;
        specializations[6].i = bias_term;
        specializations[7].i = activation_type;
        specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
        specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
        specializations[10 + 0].i = shape_bordered_packed.w;
        specializations[10 + 1].i = shape_bordered_packed.h;
        specializations[10 + 2].i = shape_bordered_packed.c;
        specializations[10 + 3].i = shape_bordered_packed.cstep;
        specializations[10 + 4].i = out_shape_packed.w;
        specializations[10 + 5].i = out_shape_packed.h;
        specializations[10 + 6].i = out_shape_packed.c;
        specializations[10 + 7].i = out_shape_packed.cstep;

        Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
        if (out_shape_packed.dims != 0)
        {
            std::vector<vk_specialization_type> specializations(10 + 8);
            specializations[0].i = kernel_w;
            specializations[1].i = kernel_h;
            specializations[2].i = dilation_w;
            specializations[3].i = dilation_h;
            specializations[4].i = stride_w;
            specializations[5].i = stride_h;
            specializations[6].i = bias_term;
            specializations[7].i = activation_type;
            specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
            specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
            specializations[10 + 0].i = shape_bordered_packed.w;
            specializations[10 + 1].i = shape_bordered_packed.h;
            specializations[10 + 2].i = shape_bordered_packed.c;
            specializations[10 + 3].i = shape_bordered_packed.cstep;
            specializations[10 + 4].i = out_shape_packed.w;
            specializations[10 + 5].i = out_shape_packed.h;
            specializations[10 + 6].i = out_shape_packed.c;
            specializations[10 + 7].i = out_shape_packed.cstep;

            Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
            if (out_shape_packed.dims != 0)
            {
                local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
                local_size_xyz.h = std::min(4, out_shape_packed.c);
            }
            local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
            local_size_xyz.h = std::min(4, out_shape_packed.c);
        }

            int shader_type_index = -1;
            if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
            if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
            if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
            if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
            if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
            if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
            if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
            if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
            if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;

            pipeline_convolution_gemm = new Pipeline(vkdev);
            if (opt.use_shader_local_memory)
            {
                pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
            }
            else
            {
                pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
            }
            pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
        int shader_type_index = -1;
        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;

        pipeline_convolution_gemm = new Pipeline(vkdev);
        if (opt.use_shader_local_memory)
        {
            pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
        }
        else
        {
            pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
        }
        pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
    }
    // Dedicated pipeline for the 1x1 / stride 1 / dilation 1 case.
    // Only bias/activation settings and the input/output shapes are specialized
    // here; no kernel, dilation or stride values are passed to this shader.
    if (is_conv1x1s1d1)
    {
        // Slots 0..3: bias flag, activation type and two activation parameters.
        // Slots 4..11: input w/h/c/cstep followed by output w/h/c/cstep.
        std::vector<vk_specialization_type> specializations(4 + 8);
        specializations[0].i = bias_term;
        specializations[1].i = activation_type;
        // Parameter 0 is used when activation_params.w >= 1, parameter 1 only
        // when activation_params.w == 2; otherwise 0.f is passed.
        specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
        specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
        specializations[4 + 0].i = shape_bordered_packed.w;
        specializations[4 + 1].i = shape_bordered_packed.h;
        specializations[4 + 2].i = shape_bordered_packed.c;
        specializations[4 + 3].i = shape_bordered_packed.cstep;
        specializations[4 + 4].i = out_shape_packed.w;
        specializations[4 + 5].i = out_shape_packed.h;
        specializations[4 + 6].i = out_shape_packed.c;
        specializations[4 + 7].i = out_shape_packed.cstep;

        // Pick the shader variant from the (input, output) element packing pair.
        // The index stays -1 if none of the listed combinations matches.
        int shader_type_index = -1;
        if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
        if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
        if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
        if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
        if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
        if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
        if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
        if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
        if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;

        pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
        if (opt.use_shader_local_memory)
        {
            // Fixed 8x8x1 workgroup when the local-memory shader path is enabled.
            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1);
        }
        else
        {
            // Otherwise cap the y size at the number of packed output channels
            // (num_output / out_elempack), up to 8.
            pipeline_convolution_1x1s1d1->set_local_size_xyz(8, std::min(8, num_output / out_elempack), 1);
        }
        pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
    }
    else
    {
        std::vector<vk_specialization_type> specializations(10 + 10);
        specializations[0].i = kernel_w;
        specializations[1].i = kernel_h;
        specializations[2].i = dilation_w;
        specializations[3].i = dilation_h;
        specializations[4].i = stride_w;
        specializations[5].i = stride_h;
        specializations[6].i = bias_term;
        specializations[7].i = activation_type;
        specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
        specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
        specializations[10 + 0].i = shape_bordered_packed.dims;
        specializations[10 + 1].i = shape_bordered_packed.w;
        specializations[10 + 2].i = shape_bordered_packed.h;
        specializations[10 + 3].i = shape_bordered_packed.c;
        specializations[10 + 4].i = shape_bordered_packed.cstep;
        specializations[10 + 5].i = out_shape_packed.dims;
        specializations[10 + 6].i = out_shape_packed.w;
        specializations[10 + 7].i = out_shape_packed.h;
        specializations[10 + 8].i = out_shape_packed.c;
        specializations[10 + 9].i = out_shape_packed.cstep;

        Mat local_size_xyz(8, 8, std::min(4, (num_output / out_elempack + 1) / 2), (void*)0);
        if (out_shape_packed.dims != 0)
        {
@@ -1194,34 +1206,63 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
    if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
    {
        // gemm
        {
            top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

            std::vector<VkMat> bindings(4);
            bindings[0] = bottom_blob_bordered;
            bindings[1] = top_blob;
            bindings[2] = weight_data_gpu;
            bindings[3] = bias_data_gpu;

            std::vector<vk_constant_type> constants(8);
            constants[0].i = bottom_blob_bordered.w;
            constants[1].i = bottom_blob_bordered.h;
            constants[2].i = bottom_blob_bordered.c;
            constants[3].i = bottom_blob_bordered.cstep;
            constants[4].i = top_blob.w;
            constants[5].i = top_blob.h;
            constants[6].i = top_blob.c;
            constants[7].i = top_blob.cstep;

            VkMat dispatcher;
            dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
            dispatcher.h = top_blob.c;
            dispatcher.c = 1;

            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
        }
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        std::vector<VkMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu;
        bindings[3] = bias_data_gpu;

        std::vector<vk_constant_type> constants(8);
        constants[0].i = bottom_blob_bordered.w;
        constants[1].i = bottom_blob_bordered.h;
        constants[2].i = bottom_blob_bordered.c;
        constants[3].i = bottom_blob_bordered.cstep;
        constants[4].i = top_blob.w;
        constants[5].i = top_blob.h;
        constants[6].i = top_blob.c;
        constants[7].i = top_blob.cstep;

        VkMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = top_blob.c;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);

        return 0;
    }
    // 1x1 / stride 1 / dilation 1 path (buffer storage): records the dedicated
    // pipeline and returns early.
    if (is_conv1x1s1d1)
    {
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        // -100 is returned when the output blob is empty after create()
        // (presumably an allocation failure -- confirm against VkMat::create).
        if (top_blob.empty())
            return -100;

        // Binding order: input, output, weights, bias.
        std::vector<VkMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu;
        bindings[3] = bias_data_gpu;

        // Runtime constants: input w/h/c/cstep followed by output w/h/c/cstep.
        std::vector<vk_constant_type> constants(8);
        constants[0].i = bottom_blob_bordered.w;
        constants[1].i = bottom_blob_bordered.h;
        constants[2].i = bottom_blob_bordered.c;
        constants[3].i = bottom_blob_bordered.cstep;
        constants[4].i = top_blob.w;
        constants[5].i = top_blob.h;
        constants[6].i = top_blob.c;
        constants[7].i = top_blob.cstep;

        // Dispatch shape: x = ceil(w * h / 4), y = output channel count, z = 1.
        VkMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = top_blob.c;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);

        return 0;
    }
@@ -1248,25 +1289,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
    constants[8].i = top_blob.c;
    constants[9].i = top_blob.cstep;

    // record
    if (is_conv1x1s1d1)
    {
        VkMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = 1;
        dispatcher.c = top_blob.c;

        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
    }
    else
    {
        VkMat dispatcher;
        dispatcher.w = (top_blob.w + 1) / 2;
        dispatcher.h = (top_blob.h + 1) / 2;
        dispatcher.c = (top_blob.c + 1) / 2;
    VkMat dispatcher;
    dispatcher.w = (top_blob.w + 1) / 2;
    dispatcher.h = (top_blob.h + 1) / 2;
    dispatcher.c = (top_blob.c + 1) / 2;

        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
    }
    cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);

    return 0;
 }
@@ -1567,34 +1595,63 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
    if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
    {
        // gemm
        {
            top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
            if (top_blob.empty())
                return -100;

            std::vector<VkImageMat> bindings(4);
            bindings[0] = bottom_blob_bordered;
            bindings[1] = top_blob;
            bindings[2] = weight_data_gpu_image;
            bindings[3] = bias_data_gpu_image;

            std::vector<vk_constant_type> constants(8);
            constants[0].i = bottom_blob_bordered.w;
            constants[1].i = bottom_blob_bordered.h;
            constants[2].i = bottom_blob_bordered.c;
            constants[3].i = 0; // bottom_blob_bordered.cstep;
            constants[4].i = top_blob.w;
            constants[5].i = top_blob.h;
            constants[6].i = top_blob.c;
            constants[7].i = 0; // top_blob.cstep;

            VkImageMat dispatcher;
            dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
            dispatcher.h = top_blob.c;
            dispatcher.c = 1;

            cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
        }
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        if (top_blob.empty())
            return -100;

        std::vector<VkImageMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu_image;
        bindings[3] = bias_data_gpu_image;

        std::vector<vk_constant_type> constants(8);
        constants[0].i = bottom_blob_bordered.w;
        constants[1].i = bottom_blob_bordered.h;
        constants[2].i = bottom_blob_bordered.c;
        constants[3].i = 0; // bottom_blob_bordered.cstep;
        constants[4].i = top_blob.w;
        constants[5].i = top_blob.h;
        constants[6].i = top_blob.c;
        constants[7].i = 0; // top_blob.cstep;

        VkImageMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = top_blob.c;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);

        return 0;
    }
    // 1x1 / stride 1 / dilation 1 path (image storage): records the dedicated
    // pipeline and returns early.
    if (is_conv1x1s1d1)
    {
        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
        // -100 is returned when the output blob is empty after create()
        // (presumably an allocation failure -- confirm against VkImageMat::create).
        if (top_blob.empty())
            return -100;

        // Binding order: input, output, weights, bias.
        std::vector<VkImageMat> bindings(4);
        bindings[0] = bottom_blob_bordered;
        bindings[1] = top_blob;
        bindings[2] = weight_data_gpu_image;
        bindings[3] = bias_data_gpu_image;

        // Runtime constants: input w/h/c/cstep followed by output w/h/c/cstep.
        // Both cstep slots are filled with 0 on this image path.
        std::vector<vk_constant_type> constants(8);
        constants[0].i = bottom_blob_bordered.w;
        constants[1].i = bottom_blob_bordered.h;
        constants[2].i = bottom_blob_bordered.c;
        constants[3].i = 0; // bottom_blob_bordered.cstep;
        constants[4].i = top_blob.w;
        constants[5].i = top_blob.h;
        constants[6].i = top_blob.c;
        constants[7].i = 0; // top_blob.cstep;

        // Dispatch shape: x = ceil(w * h / 4), y = output channel count, z = 1.
        VkImageMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = top_blob.c;
        dispatcher.c = 1;

        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);

        return 0;
    }
@@ -1621,25 +1678,12 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
    constants[8].i = top_blob.c;
    constants[9].i = 0; //top_blob.cstep;

    // record
    if (is_conv1x1s1d1)
    {
        VkImageMat dispatcher;
        dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
        dispatcher.h = 1;
        dispatcher.c = top_blob.c;

        cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
    }
    else
    {
        VkImageMat dispatcher;
        dispatcher.w = (top_blob.w + 1) / 2;
        dispatcher.h = (top_blob.h + 1) / 2;
        dispatcher.c = (top_blob.c + 1) / 2;
    VkImageMat dispatcher;
    dispatcher.w = (top_blob.w + 1) / 2;
    dispatcher.h = (top_blob.h + 1) / 2;
    dispatcher.c = (top_blob.c + 1) / 2;

        cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
    }
    cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);

    return 0;
 }
--- a/src/layer/vulkan/shader/convolution_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_1x1s1d1.comp
@@ -21,29 +21,21 @@
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -64,13 +56,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -82,16 +72,14 @@ void main()
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx * 4 >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -100,9 +88,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        sum = afpvec4(image3d_ld1(bias_blob, ivec3(gz, 0, 0)));
        sum = afpvec4(image3d_ld1(bias_blob, ivec3(gy, 0, 0)));
 #else
        sum = afpvec4(buffer_ld1(bias_data, gz));
        sum = afpvec4(buffer_ld1(bias_data, gy));
 #endif
    }
    else
@@ -118,7 +106,7 @@ void main()

    for (int z = 0; z < psc(c); z++)
    {
        afp k = image3d_ld1(weight_blob, ivec3(0, z, gz));
        afp k = image3d_ld1(weight_blob, ivec3(0, z, gy));

        sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z));
        sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z));
@@ -126,7 +114,7 @@ void main()
        sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));
    }
 #else
    int w_offset = gz * psc(c);
    int w_offset = gy * psc(c);
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -174,12 +162,12 @@ void main()
    

 #if NCNN_image_shader
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a);
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum.g);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum.b);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum.a);
 #else
    const int gi = gz * psc(outcstep) / 4 + gx;
    const int gi = gy * psc(outcstep) / 4 + gx;

 #if NCNN_fp16_packed
    top_blob_data[gi] = sum;
--- a/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
@@ -23,29 +23,21 @@

 #define LOCAL_MEMORY_UNROLL_INCH 8

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -83,14 +73,13 @@ void main()
 {
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
 #if !NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec4 b = buffer_ld4(bias_data, gz);
        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -133,7 +122,7 @@ void main()
        afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));

        sum0 += v0 * k;
        sum1 += v1 * k;
@@ -141,21 +130,21 @@ void main()
        sum3 += v3 * k;
    }
 #else
    int w_offset = gz * psc(c);
    int w_offset = gy * psc(c);
    int v_offset = gx;

 #if NCNN_shader_local_memory
    const int lx = int(gl_LocalInvocationID.x);
    const int lz = int(gl_LocalInvocationID.z);
    const int ly = int(gl_LocalInvocationID.y);

    int z = 0;
    for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
    {
        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -163,7 +152,7 @@ void main()
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
            }
        }

@@ -176,7 +165,7 @@ void main()
            afp v2 = lfp2afp(tmp_v[lx][z4][2]);
            afp v3 = lfp2afp(tmp_v[lx][z4][3]);

            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

            sum0 += v0 * k;
            sum1 += v1 * k;
@@ -194,11 +183,11 @@ void main()
    {
        const int remain = psc(c) - z;

        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -206,7 +195,7 @@ void main()
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
            }
        }

@@ -219,7 +208,7 @@ void main()
            afp v2 = lfp2afp(tmp_v[lx][z4][2]);
            afp v3 = lfp2afp(tmp_v[lx][z4][3]);

            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

            sum0 += v0 * k;
            sum1 += v1 * k;
@@ -297,17 +286,17 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st4(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

 void main()
 {
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -99,9 +84,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec8 b = buffer_ld8(bias_data, gz);
        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -129,7 +114,7 @@ void main()
        afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));

        // sum += v * k;
        sum0[0] += v0 * k[0];
@@ -145,7 +130,7 @@ void main()
        sum3[1] += v3 * k[1];
    }
 #else
    int w_offset = gz * psc(c);
    int w_offset = gy * psc(c);
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -248,12 +233,12 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st8(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
@@ -23,29 +23,21 @@

 #define LOCAL_MEMORY_UNROLL_INCH 8

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -83,14 +73,13 @@ void main()
 {
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
 #if !NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec4 b = buffer_ld4(bias_data, gz);
        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -134,10 +123,10 @@ void main()
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpmat4 k = afpmat4(
            image3d_ld4(weight_blob, ivec3(0, z, gz)),
            image3d_ld4(weight_blob, ivec3(1, z, gz)),
            image3d_ld4(weight_blob, ivec3(2, z, gz)),
            image3d_ld4(weight_blob, ivec3(3, z, gz))
            image3d_ld4(weight_blob, ivec3(0, z, gy)),
            image3d_ld4(weight_blob, ivec3(1, z, gy)),
            image3d_ld4(weight_blob, ivec3(2, z, gy)),
            image3d_ld4(weight_blob, ivec3(3, z, gy))
        );

        sum0 += v0 * k;
@@ -146,21 +135,21 @@ void main()
        sum3 += v3 * k;
    }
 #else
    int w_offset = gz * psc(c) * 4;
    int w_offset = gy * psc(c) * 4;
    int v_offset = gx;

 #if NCNN_shader_local_memory
    const int lx = int(gl_LocalInvocationID.x);
    const int lz = int(gl_LocalInvocationID.z);
    const int ly = int(gl_LocalInvocationID.y);

    int z = 0;
    for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
    {
        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -168,7 +157,7 @@ void main()
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
                tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
            }
        }

@@ -181,10 +170,10 @@ void main()
            afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
            afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

            afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
            afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
            afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
            afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
            afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
            afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
            afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
            afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

            afpmat4 k = afpmat4(k0, k1, k2, k3);

@@ -204,11 +193,11 @@ void main()
    {
        const int remain = psc(c) - z;

        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -216,7 +205,7 @@ void main()
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
                tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
            }
        }

@@ -229,10 +218,10 @@ void main()
            afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
            afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

            afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
            afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
            afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
            afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
            afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
            afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
            afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
            afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

            afpmat4 k = afpmat4(k0, k1, k2, k3);

@@ -317,17 +306,17 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st4(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
@@ -23,29 +23,21 @@

 #define LOCAL_MEMORY_UNROLL_INCH 8

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -83,14 +73,13 @@ void main()
 {
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
 #if !NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif
 #endif
@@ -103,9 +92,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
        afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
 #else
        afp b = buffer_ld1(bias_data, gz);
        afp b = buffer_ld1(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -133,7 +122,7 @@ void main()
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
        afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));

        sum0 += dot(v0, k);
        sum1 += dot(v1, k);
@@ -141,21 +130,21 @@ void main()
        sum3 += dot(v3, k);
    }
 #else
    int w_offset = gz * psc(c);
    int w_offset = gy * psc(c);
    int v_offset = gx;

 #if NCNN_shader_local_memory
    const int lx = int(gl_LocalInvocationID.x);
    const int lz = int(gl_LocalInvocationID.z);
    const int ly = int(gl_LocalInvocationID.y);

    int z = 0;
    for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
    {
        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -163,7 +152,7 @@ void main()
        {
            for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
            {
                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
            }
        }

@@ -176,7 +165,7 @@ void main()
            afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
            afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

            sum0 += dot(v0, k);
            sum1 += dot(v1, k);
@@ -194,11 +183,11 @@ void main()
    {
        const int remain = psc(c) - z;

        if (lz < 4)
        if (ly < 4)
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
                tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
            }
        }

@@ -206,7 +195,7 @@ void main()
        {
            for (int z4 = 0; z4 < remain; z4++)
            {
                tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
                tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
            }
        }

@@ -219,7 +208,7 @@ void main()
            afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
            afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

            afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
            afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

            sum0 += dot(v0, k);
            sum1 += dot(v1, k);
@@ -297,17 +286,17 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
 #if NCNN_shader_local_memory
    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st1(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

 void main()
 {
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -99,9 +84,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec8 b = buffer_ld8(bias_data, gz);
        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -129,14 +114,14 @@ void main()
        afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gz));
        afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gz));
        afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gz));
        afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gz));
        afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gz));
        afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gz));
        afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gz));
        afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gz));
        afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gy));
        afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gy));
        afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gy));
        afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gy));
        afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gy));
        afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gy));
        afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gy));
        afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gy));

        // sum += v * k;
        sum0[0].r += dot(v0, k0);
@@ -176,7 +161,7 @@ void main()
        sum3[1].a += dot(v3, k7);
    }
 #else
    int w_offset = gz * psc(c) * 8;
    int w_offset = gy * psc(c) * 8;
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st8(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

 void main()
 {
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -99,9 +84,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
        afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec8 b = buffer_ld8(bias_data, gz);
        afpvec8 b = buffer_ld8(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -129,14 +114,14 @@ void main()
        afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
        afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz));
        afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz));
        afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz));
        afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz));
        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));
        afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gy));
        afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gy));
        afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gy));
        afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gy));

        // sum += v * k
        sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -176,7 +161,7 @@ void main()
        sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
    }
 #else
    int w_offset = gz * psc(c) * 8;
    int w_offset = gy * psc(c) * 8;
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st8(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

 void main()
 {
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -99,9 +84,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
        afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
 #else
        afp b = buffer_ld1(bias_data, gz);
        afp b = buffer_ld1(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -129,7 +114,7 @@ void main()
        afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
        afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));

        // sum += dot(v, k);
        sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
@@ -138,7 +123,7 @@ void main()
        sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);
    }
 #else
    int w_offset = gz * psc(c);
    int w_offset = gy * psc(c);
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -210,12 +195,12 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st1(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);
--- a/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
+++ b/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 #define shape_constant_id_offset 10
 layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 #if NCNN_image_shader
 layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

 layout (push_constant) uniform parameter
 {
    int dims;
    int w;
    int h;
    int c;
    int cstep;

    int outdims;
    int outw;
    int outh;
    int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

 void main()
 {
 #if NCNN_image_shader
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
 #if NCNN_image_shader
    if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
        return;
 #else
    int gx = int(gl_GlobalInvocationID.x) * 4;
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;
 #endif

@@ -99,9 +84,9 @@ void main()
    if (bias_term == 1)
    {
 #if NCNN_image_shader
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
        afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
 #else
        afpvec4 b = buffer_ld4(bias_data, gz);
        afpvec4 b = buffer_ld4(bias_data, gy);
 #endif
        sum0 = b;
        sum1 = b;
@@ -129,10 +114,10 @@ void main()
        afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
        afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
        afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
        afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
        afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
        afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));

        // sum += v * k;
        sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -156,7 +141,7 @@ void main()
        sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
    }
 #else
    int w_offset = gz * psc(c) * 4;
    int w_offset = gy * psc(c) * 4;
    int v_offset = gx;

    for (int z = 0; z < psc(c); z++)
@@ -246,12 +231,12 @@ void main()
    }

 #if NCNN_image_shader
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
    image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
    image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
    image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
    image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
 #else
    int gi = gz * psc(outcstep) + gx;
    int gi = gy * psc(outcstep) + gx;

    buffer_st4(top_blob_data, gi + 0, sum0);
    if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);