unified vulkan khr/nv cooperative matrix shader (#6116)

1 year ago · 9cdc02bb7a
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -376,14 +376,16 @@ public:
    VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT queryShaderAtomicFloat2Features;

    // extension properties
    void* queryDeviceProperties;
    void* queryExtensionProperties;
    VkPhysicalDeviceFloatControlsPropertiesKHR queryFloatControlsProperties;
    VkPhysicalDeviceShaderIntegerDotProductProperties queryShaderIntegerDotProductProperties;
    VkPhysicalDeviceSubgroupProperties querySubgroupProperties;
    VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties;
    VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties;
    std::vector<VkCooperativeMatrixPropertiesKHR> queryCooperativeMatrixProperties;
    std::vector<VkCooperativeMatrixPropertiesNV> queryCooperativeMatrixPropertiesNV;

    // extension sub properties
    std::vector<VkCooperativeMatrixPropertiesKHR> queryCooperativeMatrixSubProperties;
    std::vector<VkCooperativeMatrixPropertiesNV> queryCooperativeMatrixSubPropertiesNV;
 };

 void GpuInfoPrivate::query_features()
@@ -855,17 +857,19 @@ void GpuInfoPrivate::query_extension_features()

    // query cooperative_matrix
    memset(&queryCooperativeMatrixFeatures, 0, sizeof(queryCooperativeMatrixFeatures));
    memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV));
    queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
    queryCooperativeMatrixFeatures.pNext = 0;
    queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV;
    queryCooperativeMatrixFeaturesNV.pNext = 0;
    if (support_VK_KHR_cooperative_matrix)
    {
        queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeMatrixFeatures;
    }
    else if (support_VK_NV_cooperative_matrix)

    // query nv cooperative matrix
    memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV));
    queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV;
    queryCooperativeMatrixFeaturesNV.pNext = 0;
    if (support_VK_NV_cooperative_matrix)
    {
        queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures;
        queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV;
@@ -978,7 +982,7 @@ void GpuInfoPrivate::query_extension_features()

 void GpuInfoPrivate::query_extension_properties()
 {
    queryDeviceProperties = 0;
    queryExtensionProperties = 0;

    // query float controls
    memset(&queryFloatControlsProperties, 0, sizeof(queryFloatControlsProperties));
@@ -986,8 +990,8 @@ void GpuInfoPrivate::query_extension_properties()
    queryFloatControlsProperties.pNext = 0;
    if (support_VK_KHR_shader_float_controls)
    {
        queryFloatControlsProperties.pNext = queryDeviceProperties;
        queryDeviceProperties = &queryFloatControlsProperties;
        queryFloatControlsProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryFloatControlsProperties;
    }

    // query integer dot product
@@ -996,8 +1000,8 @@ void GpuInfoPrivate::query_extension_properties()
    queryShaderIntegerDotProductProperties.pNext = 0;
    if (support_VK_KHR_driver_properties)
    {
        queryShaderIntegerDotProductProperties.pNext = queryDeviceProperties;
        queryDeviceProperties = &queryShaderIntegerDotProductProperties;
        queryShaderIntegerDotProductProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryShaderIntegerDotProductProperties;
    }

    // query subgroup
@@ -1006,8 +1010,8 @@ void GpuInfoPrivate::query_extension_properties()
    querySubgroupProperties.pNext = 0;
    if (VK_VERSION_MAJOR(g_instance.instance_api_version) >= 1 && VK_VERSION_MINOR(g_instance.instance_api_version) >= 1)
    {
        querySubgroupProperties.pNext = queryDeviceProperties;
        queryDeviceProperties = &querySubgroupProperties;
        querySubgroupProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &querySubgroupProperties;
    }
    else
    {
@@ -1032,8 +1036,8 @@ void GpuInfoPrivate::query_extension_properties()
    queryDriverProperties.pNext = 0;
    if (support_VK_KHR_driver_properties)
    {
        queryDriverProperties.pNext = queryDeviceProperties;
        queryDeviceProperties = &queryDriverProperties;
        queryDriverProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &queryDriverProperties;
    }

    // query subgroup size control
@@ -1042,15 +1046,15 @@ void GpuInfoPrivate::query_extension_properties()
    querySubgroupSizeControlProperties.pNext = 0;
    if (support_VK_EXT_subgroup_size_control)
    {
        querySubgroupSizeControlProperties.pNext = queryDeviceProperties;
        queryDeviceProperties = &querySubgroupSizeControlProperties;
        querySubgroupSizeControlProperties.pNext = queryExtensionProperties;
        queryExtensionProperties = &querySubgroupSizeControlProperties;
    }

    if (support_VK_KHR_get_physical_device_properties2)
    {
        VkPhysicalDeviceProperties2KHR queryProperties;
        queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
        queryProperties.pNext = queryDeviceProperties;
        queryProperties.pNext = queryExtensionProperties;

        vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);

@@ -1072,8 +1076,8 @@ void GpuInfoPrivate::query_extension_properties()
    }

    // query supported cooperative matrix types and operations
    queryCooperativeMatrixProperties.clear();
    queryCooperativeMatrixPropertiesNV.clear();
    queryCooperativeMatrixSubProperties.clear();
    queryCooperativeMatrixSubPropertiesNV.clear();
    support_cooperative_matrix_8_8_16 = false;
    support_cooperative_matrix_16_8_8 = false;
    support_cooperative_matrix_16_8_16 = false;
@@ -1087,14 +1091,14 @@ void GpuInfoPrivate::query_extension_properties()
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
        }

        queryCooperativeMatrixProperties.resize(propertyCount);
        queryCooperativeMatrixSubProperties.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeMatrixProperties[j], 0, sizeof(queryCooperativeMatrixProperties[j]));
            queryCooperativeMatrixProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
            queryCooperativeMatrixProperties[j].pNext = 0;
            memset(&queryCooperativeMatrixSubProperties[j], 0, sizeof(queryCooperativeMatrixSubProperties[j]));
            queryCooperativeMatrixSubProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
            queryCooperativeMatrixSubProperties[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixProperties.data());
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixSubProperties.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
@@ -1102,7 +1106,7 @@ void GpuInfoPrivate::query_extension_properties()

        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixProperties[j];
            const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixSubProperties[j];
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
@@ -1144,14 +1148,14 @@ void GpuInfoPrivate::query_extension_properties()
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
        }

        queryCooperativeMatrixPropertiesNV.resize(propertyCount);
        queryCooperativeMatrixSubPropertiesNV.resize(propertyCount);
        for (uint32_t j = 0; j < propertyCount; j++)
        {
            memset(&queryCooperativeMatrixPropertiesNV[j], 0, sizeof(queryCooperativeMatrixPropertiesNV[j]));
            queryCooperativeMatrixPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
            queryCooperativeMatrixPropertiesNV[j].pNext = 0;
            memset(&queryCooperativeMatrixSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixSubPropertiesNV[j]));
            queryCooperativeMatrixSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
            queryCooperativeMatrixSubPropertiesNV[j].pNext = 0;
        }
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixPropertiesNV.data());
        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixSubPropertiesNV.data());
        if (ret != VK_SUCCESS)
        {
            NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
@@ -1159,7 +1163,7 @@ void GpuInfoPrivate::query_extension_properties()

        for (uint32_t j = 0; j < propertyCount; j++)
        {
            const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixPropertiesNV[j];
            const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixSubPropertiesNV[j];
            // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);

            if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
@@ -1837,9 +1841,9 @@ const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& GpuInfo::queryShaderAtomicF
    return d->queryShaderAtomicFloat2Features;
 }

 const void* GpuInfo::queryDeviceProperties() const
 const void* GpuInfo::queryExtensionProperties() const
 {
    return d->queryDeviceProperties;
    return d->queryExtensionProperties;
 }

 const VkPhysicalDeviceShaderIntegerDotProductProperties& GpuInfo::queryShaderIntegerDotProductProperties() const
@@ -1862,14 +1866,14 @@ const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSi
    return d->querySubgroupSizeControlProperties;
 }

 const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixProperties() const
 const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixSubProperties() const
 {
    return d->queryCooperativeMatrixProperties;
    return d->queryCooperativeMatrixSubProperties;
 }

 const std::vector<VkCooperativeMatrixPropertiesNV>& GpuInfo::queryCooperativeMatrixPropertiesNV() const
 const std::vector<VkCooperativeMatrixPropertiesNV>& GpuInfo::queryCooperativeMatrixSubPropertiesNV() const
 {
    return d->queryCooperativeMatrixPropertiesNV;
    return d->queryCooperativeMatrixSubPropertiesNV;
 }

 static int init_instance_core()
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -365,14 +365,16 @@ public:
    const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& queryShaderAtomicFloat2Features() const;

    // extension properties
    const void* queryDeviceProperties() const;
    const void* queryExtensionProperties() const;
    const VkPhysicalDeviceFloatControlsPropertiesKHR& queryFloatControlsProperties() const;
    const VkPhysicalDeviceShaderIntegerDotProductProperties& queryShaderIntegerDotProductProperties() const;
    const VkPhysicalDeviceSubgroupProperties& querySubgroupProperties() const;
    const VkPhysicalDeviceDriverPropertiesKHR& queryDriverProperties() const;
    const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& querySubgroupSizeControlProperties() const;
    const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixProperties() const;
    const std::vector<VkCooperativeMatrixPropertiesNV>& queryCooperativeMatrixPropertiesNV() const;

    // extension sub properties
    const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixSubProperties() const;
    const std::vector<VkCooperativeMatrixPropertiesNV>& queryCooperativeMatrixSubPropertiesNV() const;

 private:
    GpuInfo(const GpuInfo&);
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -416,17 +416,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)

                if (use_cooperative_matrix_16_8_8)
                {
                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8;
                    else
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8;
                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8;
                }
                else if (use_cooperative_matrix_16_16_16)
                {
                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16;
                    else
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16;
                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16;
                }

                pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev);
@@ -696,17 +690,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)

                if (use_cooperative_matrix_16_8_8)
                {
                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8;
                    else
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8;
                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8;
                }
                else if (use_cooperative_matrix_16_16_16)
                {
                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16;
                    else
                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16;
                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16;
                }

                pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev);
@@ -1028,17 +1016,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)

        if (use_cooperative_matrix_16_8_8)
        {
            if (vkdev->info.support_VK_KHR_cooperative_matrix())
                shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_8_8;
            else
                shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_8_8;
            shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_8_8;
        }
        else if (use_cooperative_matrix_16_16_16)
        {
            if (vkdev->info.support_VK_KHR_cooperative_matrix())
                shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_16_16;
            else
                shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_16_16;
            shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_16_16;
        }

        pipeline_convolution_gemm = new Pipeline(vkdev);
@@ -1099,17 +1081,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)

        if (use_cooperative_matrix_16_8_8)
        {
            if (vkdev->info.support_VK_KHR_cooperative_matrix())
                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_8_8;
            else
                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_8_8;
            shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_8_8;
        }
        else if (use_cooperative_matrix_16_16_16)
        {
            if (vkdev->info.support_VK_KHR_cooperative_matrix())
                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_16_16;
            else
                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_16_16;
            shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_16_16;
        }

        pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -301,17 +301,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)

            if (use_cooperative_matrix_16_8_8)
            {
                if (vkdev->info.support_VK_KHR_cooperative_matrix())
                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_8_8;
                else
                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_8_8;
                shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_8_8;
            }
            else if (use_cooperative_matrix_16_16_16)
            {
                if (vkdev->info.support_VK_KHR_cooperative_matrix())
                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_16_16;
                else
                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_16_16;
                shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_16_16;
            }

            pipeline_deconvolution_gemm = new Pipeline(vkdev);
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
@@ -73,18 +77,40 @@ void main()
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
 #endif
 #endif

    if (bias_term == 1)
    {
 #if ncnn_VK_KHR_cooperative_matrix
        coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> bias0;
        coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> bias1;

        coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
        coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;

        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
 #endif

 #if NCNN_fp16_arithmetic
        sum0 = bias0;
@@ -92,18 +118,39 @@ void main()
        sum2 = bias1;
        sum3 = bias1;
 #else
 #if ncnn_VK_KHR_cooperative_matrix
        sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
        sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
        sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
        sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
 #elif ncnn_VK_NV_cooperative_matrix
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
 #endif
 #endif
    }
    else
    {
 #if ncnn_VK_KHR_cooperative_matrix
        sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
        sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
        sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
        sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
 #endif
 #endif
    }

    const int N = psc(c) / 4;
@@ -132,6 +179,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -147,6 +195,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -178,6 +243,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -193,6 +259,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -201,6 +284,7 @@ void main()
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -216,6 +300,24 @@ void main()
    coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
 #endif
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif
 #endif

    barrier();
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
@@ -75,6 +79,7 @@ void main()
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
@@ -83,9 +88,31 @@ void main()
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
 #endif
 #endif

    if (bias_term == 1)
    {
 #if ncnn_VK_KHR_cooperative_matrix
        coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias0;
        coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias1;
        coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias2;
@@ -95,6 +122,17 @@ void main()
        coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor);
        coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
        coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;

        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
 #endif

 #if NCNN_fp16_arithmetic
        sum0 = bias0;
@@ -106,6 +144,7 @@ void main()
        sum6 = bias3;
        sum7 = bias3;
 #else
 #if ncnn_VK_KHR_cooperative_matrix
        sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
        sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
        sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
@@ -114,10 +153,21 @@ void main()
        sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
        sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
        sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
 #elif ncnn_VK_NV_cooperative_matrix
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
 #endif
 #endif
    }
    else
    {
 #if ncnn_VK_KHR_cooperative_matrix
        sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
        sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
        sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -126,6 +176,27 @@ void main()
        sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
        sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
        sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
 #endif
 #endif
    }

    const int N = psc(c) / 2;
@@ -160,6 +231,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -183,6 +255,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -220,6 +317,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -243,6 +341,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -251,6 +374,7 @@ void main()
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -278,6 +402,36 @@ void main()
    coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
 #endif
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif
 #endif

    barrier();
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
@@ -1,260 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Enables #include; vulkan_activation.comp presumably provides the
 // activation_afpvec4() helper called in main() -- it is not defined in this file.
 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 // Required for the gl_ScopeSubgroup scope, explicit 16-bit float types and the
 // fcoopmatNV / coopMat*NV cooperative-matrix built-ins used below.
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constants describing the layer: whether a bias is added
 // (main() tests bias_term == 1) and the activation passed to activation_afpvec4().
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 // Shape specialization constants, numbered after the four layer constants above.
 // NOTE(review): main() reads shapes through psc(), which is not defined in this
 // view; presumably a value of 0 here means "use the push constant instead" -- confirm.
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 // Storage buffers. Input, weight and bias are declared as uvec2 arrays; main()
 // loads them into fp16 cooperative matrices. The output is written as sfpvec4
 // through buffer_st4() (both defined outside this view).
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };

 // Push-constant copy of the same shape fields as the specialization constants above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Number of input steps staged into shared memory per main-loop iteration.
 #define UNROLL_INCH 2

 // Shared staging arrays: for each unrolled step, one tile of 16 rows x 4 uvec2
 // (main() loads them with offset z4*16*4 and stride 4). tmp_v0/tmp_v1 are also
 // reused at the end of main() to hold the fp16 results before the final store.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k1[UNROLL_INCH * 16*4];

 // Compute shader entry point.
 // Accumulates four 16x16 cooperative-matrix tiles (sum0..sum3) as products of
 // tiles staged from bottom_blob_data (A0/A1) and weight_data (B0/B1), optionally
 // seeded from bias_data, then writes the results to shared memory as fp16,
 // applies the activation and stores them to top_blob_data.
 // NOTE(review): the workgroup size is not declared in this view; the lx / 16,
 // lx % 16 and "/ 32" indexing looks like it expects 32 invocations along x
 // (one subgroup) -- confirm against the host-side dispatch code.
 // NOTE(review): psc() is defined outside this view; presumably it selects the
 // specialization constant or the push-constant value for a shape field.
 void main()
 {
    // gx: first output position handled here (compared against outcstep below);
    // every 32 consecutive x invocations share one value, advancing by 2 * 16.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    // gy: first output channel index handled here (compared against outc below),
    // advancing by 2 * 4 per y invocation.
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int lx = int(gl_LocalInvocationID.x);

    // Split the local id into a 16-wide group index and a position inside the group.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    // Accumulators: fp16 when NCNN_fp16_arithmetic is set, fp32 otherwise.
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
 #endif

    if (bias_term == 1)
    {
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;

        // Loaded with a stride argument of 0 starting at gy and gy + 4;
        // presumably this repeats the same bias elements on every row -- confirm.
        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);

        // sum0/sum1 start from bias0 and sum2/sum3 from bias1, converted to
        // fp32 when the accumulators are fp32.
 #if NCNN_fp16_arithmetic
        sum0 = bias0;
        sum1 = bias0;
        sum2 = bias1;
        sum3 = bias1;
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
 #endif
    }
    else
    {
        // No bias: start every accumulator at zero.
 #if NCNN_fp16_arithmetic
        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
 #endif
    }

    // Number of input steps to accumulate over.
    const int N = psc(c) / 4;

    // Main loop: handle UNROLL_INCH steps per iteration.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        // Stage input and weight data into shared memory. Each invocation
        // writes 4 elements per array; lxd16 selects which unrolled step it fills.
        {
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16);

                // Positions at or beyond outcstep are filled with zero instead of being read.
                tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                // tmp_k1 reads the weight block psc(c) * 16 elements after tmp_k0's.
                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
            }
        }

        // Wait until all invocations have finished staging before tiles are loaded.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load the z4-th staged tiles: offset z4*16*4, stride 4 uvec2 per row.
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        // Keep the next iteration's staging from overwriting tiles still being read.
        barrier();
    }

    // Tail: steps left over when N is not a multiple of UNROLL_INCH.
    if (z < N)
    {
        const int remain = N - z;

        // With UNROLL_INCH == 2 at most one step remains, so only the
        // invocations with lxd16 == 0 stage data.
        if (lxd16 == 0)
        {
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16);

                tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
            }
        }

        barrier();

        // Same multiply-accumulate as the main loop, limited to the remaining steps.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        barrier();
    }

    // Nothing to write when this tile starts outside the output. gx and gy do
    // not depend on lx, so invocations that differ only in lx decide alike.
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;

    // Write the accumulators to shared memory as fp16, reusing tmp_v0/tmp_v1:
    // sum0/sum1 at offset 0 and sum2/sum3 at offset 16*4. fp32 accumulators
    // are converted to fp16 first.
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif

    // Wait for the stores above before individual invocations read the results.
    barrier();

    // Each invocation reads back 4 elements per array, applies the activation
    // and writes them to the output buffer.
    {
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;

            uvec2 sum0_u2 = tmp_v0[tmp_vi];
            uvec2 sum1_u2 = tmp_v1[tmp_vi];

            // Each uvec2 holds four packed halfs; unpack them into a 4-component vector.
            afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
            afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));

            sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
            sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);

            // Output index: channel (gy + lxd16 * 4 + j), position (gx + lxm16).
            const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16);

            // Skip channels and positions that fall outside the output.
            if (gy + lxd16 * 4 + j < psc(outc))
            {
                if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0);
                if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1);
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
@@ -1,335 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Enables #include; vulkan_activation.comp presumably provides the
 // activation_afpvec4() helper called in main() -- it is not defined in this file.
 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 // Required for the gl_ScopeSubgroup scope, explicit 16-bit float types and the
 // fcoopmatNV / coopMat*NV cooperative-matrix built-ins used below.
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constants describing the layer: whether a bias is added
 // (main() tests bias_term == 1) and the activation passed to activation_afpvec4().
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
 layout (constant_id = 2) const float activation_param_0 = 0;
 layout (constant_id = 3) const float activation_param_1 = 0;

 // Shape specialization constants, numbered after the four layer constants above.
 // NOTE(review): main() reads shapes through psc(), which is not defined in this
 // view; presumably a value of 0 here means "use the push constant instead" -- confirm.
 #define shape_constant_id_offset 4
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 // Storage buffers. Input, weight and bias are declared as uvec2 arrays; main()
 // loads them into fp16 cooperative matrices. The output is written as sfpvec4
 // through buffer_st4() (both defined outside this view).
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };

 // Push-constant copy of the same shape fields as the specialization constants above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Number of input steps staged into shared memory per main-loop iteration.
 #define UNROLL_INCH 4

 // Shared staging arrays. For each unrolled step tmp_v0/tmp_v1 hold a tile of
 // 16 rows x 2 uvec2 and tmp_k0..tmp_k3 a tile of 8 rows x 2 uvec2 (main() loads
 // them with offsets z4*16*2 / z4*8*2 and stride 2). tmp_v0/tmp_v1 are also
 // reused at the end of main() to hold the fp16 results before the final store.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
 shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k3[UNROLL_INCH * 8*2];

 // Compute shader entry point.
 // Accumulates eight 16x8 cooperative-matrix tiles (sum0..sum7) as products of
 // 16x8 tiles staged from bottom_blob_data (A0/A1) and 8x8 tiles staged from
 // weight_data (B0..B3), optionally seeded from bias_data, then writes the
 // results to shared memory as fp16, applies the activation and stores them
 // to top_blob_data.
 // NOTE(review): the workgroup size is not declared in this view; the lx / 8,
 // lx % 8 and "/ 32" indexing looks like it expects 32 invocations along x
 // (one subgroup) -- confirm against the host-side dispatch code.
 // NOTE(review): psc() is defined outside this view; presumably it selects the
 // specialization constant or the push-constant value for a shape field.
 void main()
 {
    // gx: first output position handled here (compared against outcstep below);
    // every 32 consecutive x invocations share one value, advancing by 2 * 16.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    // gy: first output channel index handled here (compared against outc below),
    // advancing by 2 * 4 per y invocation.
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int lx = int(gl_LocalInvocationID.x);

    // Split the local id into an 8-wide group index and a position inside the group.
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

    // Accumulators: fp16 when NCNN_fp16_arithmetic is set, fp32 otherwise.
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
 #endif

    if (bias_term == 1)
    {
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;

        // Loaded with a stride argument of 0 starting at gy, gy + 2, gy + 4 and
        // gy + 6; presumably this repeats the same bias elements on every row -- confirm.
        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);

        // Each bias tile seeds a pair of accumulators, converted to fp32 when
        // the accumulators are fp32.
 #if NCNN_fp16_arithmetic
        sum0 = bias0;
        sum1 = bias0;
        sum2 = bias1;
        sum3 = bias1;
        sum4 = bias2;
        sum5 = bias2;
        sum6 = bias3;
        sum7 = bias3;
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
 #endif
    }
    else
    {
        // No bias: start every accumulator at zero.
 #if NCNN_fp16_arithmetic
        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
 #else
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
 #endif
    }

    // Number of input steps to accumulate over.
    const int N = psc(c) / 2;

    // Main loop: handle UNROLL_INCH steps per iteration.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        // Stage input and weight data into shared memory; lxd8 selects which
        // unrolled step this invocation fills.
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8);

                // Each invocation fills rows lxm8 and lxm8 + 8 of both input tiles
                // (a row is 2 uvec2, hence the + 16). Positions at or beyond
                // outcstep are filled with zero instead of being read.
                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                // The four weight tiles are read psc(c) * 8 elements apart.
                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
            }
        }

        // Wait until all invocations have finished staging before tiles are loaded.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load the z4-th staged tiles: stride 2 uvec2 per row.
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        // Keep the next iteration's staging from overwriting tiles still being read.
        barrier();
    }

    // Tail: steps left over when N is not a multiple of UNROLL_INCH.
    if (z < N)
    {
        const int remain = N - z;

        // Only the invocation groups that map to a remaining step stage data.
        if (lxd8 < remain)
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8);

                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
            }
        }

        barrier();

        // Same multiply-accumulate as the main loop, limited to the remaining steps.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        barrier();
    }

    // Nothing to write when this tile starts outside the output. gx and gy do
    // not depend on lx, so invocations that differ only in lx decide alike.
    if (gx >= psc(outcstep) || gy >= psc(outc))
        return;

    // Write the accumulators to shared memory as fp16, reusing tmp_v0/tmp_v1:
    // even-numbered sums go to tmp_v0 and odd-numbered sums to tmp_v1, at
    // offsets 0, 16*2, 16*4 and 16*6. fp32 accumulators are converted to fp16 first.
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif

    // Wait for the stores above before individual invocations read the results.
    barrier();

    // The write-out uses a 16-wide split of the local id instead of the 8-wide
    // split used for staging.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    {
        for (int j = 0; j < 4; j++)
        {
            // Row lxm16, column lxd16 of the j-th stored 16-row x 2-uvec2 tile.
            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
            // Output index: channel (gy + lxd16 + j*2), position (gx + lxm16).
            const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16);

            // Skip channels and positions that fall outside the output.
            if (gy + j * 2 + lxd16 < psc(outc))
            {
                if (gx + lxm16 < psc(outcstep))
                {
                    // Each uvec2 holds four packed halfs; unpack, activate and store.
                    uvec2 sum0_u2 = tmp_v0[tmp_vi];
                    afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
                    sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
                    buffer_st4(top_blob_data, gi, sum0);
                }
                if (gx + lxm16 + 16 < psc(outcstep))
                {
                    // Same for the second tile, 16 positions further along.
                    uvec2 sum1_u2 = tmp_v1[tmp_vi];
                    afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
                    sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
                    buffer_st4(top_blob_data, gi + 16, sum1);
                }
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int batch = 1;

@@ -61,10 +65,24 @@ void main()
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
 #endif
 #endif

    const int N = psc(c) / 4;

@@ -92,6 +110,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -107,6 +126,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -138,6 +174,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -153,6 +190,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -161,6 +215,7 @@ void main()
    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
        return;

 #if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -176,6 +231,24 @@ void main()
    coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
 #endif
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif
 #endif

    barrier();
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int batch = 1;

@@ -63,6 +67,7 @@ void main()
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -71,6 +76,27 @@ void main()
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
 #endif
 #endif

    const int N = psc(c) / 2;

@@ -104,6 +130,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -127,6 +154,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -164,6 +216,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -187,6 +240,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -195,6 +273,7 @@ void main()
    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
        return;

 #if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -222,6 +301,36 @@ void main()
    coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
 #endif
 #elif ncnn_VK_NV_cooperative_matrix
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif
 #endif

    barrier();
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
@@ -1,203 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constant 0: batch count. main() returns early when gz >= batch.
 layout (constant_id = 0) const int batch = 1;

 // Shape specialization constants; their ids start at shape_constant_id_offset and all default to 0.
 // NOTE(review): main() reads these through psc(), which is not defined in this file; presumably it
 // selects between the constant and the push-constant member of the same name -- confirm.
 #define shape_constant_id_offset 1
 layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

 // Storage buffers, all addressed as 64-bit uvec2 elements: input (read), output (write), weights (read).
 // main() loads this data into 16-bit cooperative matrices, so each uvec2 presumably packs four
 // fp16 values -- TODO confirm against the host-side packing.
 layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; };
 layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; };
 layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; };

 // Push-constant block with the same member names as the shape specialization constants above.
 layout (push_constant) uniform parameter
 {
    int c;
    int cstep;

    int outw;
    int outc;
    int outcstep;
 } p;

 // Number of 16x16 slices staged in shared memory per iteration of the main loop in main().
 #define UNROLL_INCH 2

 // Shared staging arrays, UNROLL_INCH slices of 16*4 uvec2 each.
 // tmp_v0/tmp_v1 receive input data and tmp_k0/tmp_k1 receive weights; main() later reuses
 // tmp_v0/tmp_v1 to hold the accumulators before they are copied to the output buffer.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k1[UNROLL_INCH * 16*4];

 // Entry point: accumulates four 16x16 cooperative-matrix products (two input tiles x two weight
 // tiles) over the reduction dimension, then writes the results to top_tm_blob_data.
 // Data is staged through the shared arrays tmp_v0/tmp_v1 (input) and tmp_k0/tmp_k1 (weights).
 void main()
 {
    // gx: first output column of this tile; every run of 32 consecutive x invocations shares one
    // gx and covers 32 columns (gx..gx+15 via tmp_v0, gx+16..gx+31 via tmp_v1).
    // gy: first output row of this tile, 8 rows per y invocation. gz: index compared against batch.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
    int gz = int(gl_GlobalInvocationID.z);

    const int lx = int(gl_LocalInvocationID.x);

    // The value lists below imply lx ranges over 0..31 -- the local size is declared outside this file.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    // Zero-initialised accumulators: 16-bit when NCNN_fp16_arithmetic is set, 32-bit otherwise.
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
 #endif

    // Number of reduction slices; each slice spans 4 consecutive input rows (see v_offset below).
    const int N = psc(c) / 4;

    // Main loop: consume UNROLL_INCH slices per iteration.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        // Stage: invocation (lxd16, lxm16) fills 4 consecutive entries of slice lxd16 in each shared array.
        {
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16);

                // Columns at or beyond outw are zero-filled instead of being read.
                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                // Weight reads carry no bounds check; the second tile sits psc(c) * 16 elements further on.
                tmp_k0[tmp_i] = weight_tm_data[w_offset];
                tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16];
            }
        }

        // Staged data must be complete before any cooperative-matrix load reads it.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load slice z4 of each shared array; offset and stride (4) are passed in uvec2 units.
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        // All loads must finish before the next iteration overwrites the shared arrays.
        barrier();
    }

    // Tail: slices left over when N is not a multiple of UNROLL_INCH.
    if (z < N)
    {
        const int remain = N - z;

        // With UNROLL_INCH == 2 exactly one slice remains here, so only the invocations with
        // lxd16 == 0 stage data (into slice 0).
        if (lxd16 == 0)
        {
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16);

                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                tmp_k0[tmp_i] = weight_tm_data[w_offset];
                tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16];
            }
        }

        barrier();

        // Same multiply-accumulate as the main loop, limited to the remaining slices.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        barrier();
    }

    // Tiles that start outside the output produce nothing.
    // NOTE(review): this return precedes a barrier(); gx, gy and gz do not vary within a run of 32
    // consecutive x invocations, so the exit looks uniform if the workgroup is 32x1x1 -- confirm local size.
    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
        return;

    // Write the accumulators back to shared memory as 16-bit matrices (converted first when they
    // were accumulated in 32-bit). sum0/sum2 go to tmp_v0 and sum1/sum3 to tmp_v1, at offsets 0 and 16*4.
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif

    // The stored matrices must be visible before individual invocations read them back.
    barrier();

    // Copy-out: each invocation writes up to 4 entries from tmp_v0 and 4 from tmp_v1,
    // skipping rows at or beyond outc and columns at or beyond outw.
    {
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;
            const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16);

            if (gy + lxd16 * 4 + j < psc(outc))
            {
                if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi];
                if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi];
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
@@ -1,256 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constant 0: batch count. main() returns early when gz >= batch.
 layout (constant_id = 0) const int batch = 1;

 // Shape specialization constants; their ids start at shape_constant_id_offset and all default to 0.
 // NOTE(review): main() reads these through psc(), which is not defined in this file; presumably it
 // selects between the constant and the push-constant member of the same name -- confirm.
 #define shape_constant_id_offset 1
 layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

 // Storage buffers, all addressed as 64-bit uvec2 elements: input (read), output (write), weights (read).
 // main() loads this data into 16-bit cooperative matrices, so each uvec2 presumably packs four
 // fp16 values -- TODO confirm against the host-side packing.
 layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; };
 layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; };
 layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; };

 // Push-constant block with the same member names as the shape specialization constants above.
 layout (push_constant) uniform parameter
 {
    int c;
    int cstep;

    int outw;
    int outc;
    int outcstep;
 } p;

 // Number of slices staged in shared memory per iteration of the main loop in main().
 #define UNROLL_INCH 4

 // Shared staging arrays. tmp_v0/tmp_v1 hold UNROLL_INCH input slices of 16*2 uvec2 each;
 // tmp_k0..tmp_k3 hold UNROLL_INCH weight slices of 8*2 uvec2 each. main() later reuses
 // tmp_v0/tmp_v1 to hold the accumulators before they are copied to the output buffer.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
 shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k3[UNROLL_INCH * 8*2];

 // Entry point: accumulates eight 16x8 cooperative-matrix products (two 16x8 input tiles x four
 // 8x8 weight tiles) over the reduction dimension, then writes the results to top_tm_blob_data.
 // Data is staged through the shared arrays tmp_v0/tmp_v1 (input) and tmp_k0..tmp_k3 (weights).
 void main()
 {
    // gx: first output column of this tile; every run of 32 consecutive x invocations shares one
    // gx and covers 32 columns (gx..gx+15 via tmp_v0, gx+16..gx+31 via tmp_v1).
    // gy: first output row of this tile, 8 rows per y invocation. gz: index compared against batch.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
    int gz = int(gl_GlobalInvocationID.z);

    const int lx = int(gl_LocalInvocationID.x);

    // The value lists below imply lx ranges over 0..31 -- the local size is declared outside this file.
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

    // Zero-initialised accumulators: 16-bit when NCNN_fp16_arithmetic is set, 32-bit otherwise.
 #if NCNN_fp16_arithmetic
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
 #else
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
 #endif

    // Number of reduction slices; each slice spans 2 consecutive input rows (see v_offset below).
    const int N = psc(c) / 2;

    // Main loop: consume UNROLL_INCH slices per iteration.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        // Stage: invocation (lxd8, lxm8) fills entries of slice lxd8 in every shared array.
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8);

                // Four columns per invocation (gx+lxm8 +0/+8/+16/+24); columns at or beyond outw are zero-filled.
                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                // Weight reads carry no bounds check; successive tiles sit psc(c) * 8 elements apart.
                tmp_k0[tmp_ki] = weight_tm_data[w_offset];
                tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24];
            }
        }

        // Staged data must be complete before any cooperative-matrix load reads it.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load slice z4 of each shared array; offset and stride (2) are passed in uvec2 units.
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        // All loads must finish before the next iteration overwrites the shared arrays.
        barrier();
    }

    // Tail: slices left over when N is not a multiple of UNROLL_INCH.
    if (z < N)
    {
        const int remain = N - z;

        // Only the invocations whose slice index lxd8 is among the remaining slices stage data.
        if (lxd8 < remain)
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8);

                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                tmp_k0[tmp_ki] = weight_tm_data[w_offset];
                tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24];
            }
        }

        barrier();

        // Same multiply-accumulate as the main loop, limited to the remaining slices.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        barrier();
    }

    // Tiles that start outside the output produce nothing.
    // NOTE(review): this return precedes a barrier(); gx, gy and gz do not vary within a run of 32
    // consecutive x invocations, so the exit looks uniform if the workgroup is 32x1x1 -- confirm local size.
    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
        return;

    // Write the accumulators back to shared memory as 16-bit matrices (converted first when they
    // were accumulated in 32-bit). Even-numbered sums go to tmp_v0 and odd-numbered sums to tmp_v1,
    // at offsets 0, 16*2, 16*4 and 16*6.
 #if NCNN_fp16_arithmetic
    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
 #else
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif

    // The stored matrices must be visible before individual invocations read them back.
    barrier();

    // The copy-out phase re-splits the local index by 16 rather than by 8.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    // Copy-out: each invocation writes up to 4 entries from tmp_v0 and 4 from tmp_v1,
    // skipping rows at or beyond outc and columns at or beyond outw.
    {
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
            const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16);

            if (gy + j * 2 + lxd16 < psc(outc))
            {
                if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi];
                if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi];
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
@@ -81,6 +85,7 @@ void main()
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
@@ -106,6 +111,33 @@ void main()
        sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
        sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    }
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;

    if (bias_term == 1)
    {
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;

        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);

        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
    }
    else
    {
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    const int N = psc(c) / 4 * maxk;
@@ -148,6 +180,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -163,6 +196,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -208,6 +258,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -223,6 +274,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -231,6 +299,7 @@ void main()
    if (gx >= outsize || gy >= psc(outc))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -240,6 +309,17 @@ void main()
    coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif

    barrier();

--- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
@@ -83,6 +87,7 @@ void main()
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
@@ -124,6 +129,49 @@ void main()
        sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
        sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    }
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;

    if (bias_term == 1)
    {
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;

        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);

        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
    }
    else
    {
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    }
 #endif

    const int maxk = kernel_w * kernel_h;
    const int N = psc(c) / 2 * maxk;
@@ -172,6 +220,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -195,6 +244,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -246,6 +320,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -269,6 +344,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -277,6 +377,7 @@ void main()
    if (gx >= outsize || gy >= psc(outc))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -294,6 +395,25 @@ void main()
    coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif

    barrier();

--- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp
@@ -1,269 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constants 0-9: kernel size, dilation, stride, bias flag and activation settings.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 // main() loads bias_data into the accumulators only when bias_term == 1.
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 // Shape constants are numbered from id 10, directly after the ten parameters above.
 // They all default to 0 and main() reads them through psc(), which is not defined in this file.
 // NOTE(review): presumably psc() falls back to the push-constant block `p` below - confirm.
 #define shape_constant_id_offset 10
 // Input shape.
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 // Output shape.
 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 // Storage buffers. Input, weights and bias are declared as raw uvec2 words;
 // the output is written as sfpvec4 (a type declared outside this file).
 // NOTE(review): each uvec2 presumably packs four fp16 values - main() unpacks its staged
 // results that way with unpackHalf2x16; confirm for the input-side buffers.
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };

 // Push-constant block carrying the same shape fields as the shape constants above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Number of reduction steps staged in shared memory per main-loop iteration of main().
 #define UNROLL_INCH 2

 // Shared staging tiles, UNROLL_INCH slices of 16*4 uvec2 each.
 // main() fills tmp_v0/tmp_v1 from bottom_blob_data and tmp_k0/tmp_k1 from weight_data,
 // and later reuses tmp_v0/tmp_v1 to stage the fp16 results before the final store.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k1[UNROLL_INCH * 16*4];

 // Convolution evaluated as a tiled matrix multiply using GL_NV_cooperative_matrix.
 // Per pass, input values and weights are staged in the shared tmp_v*/tmp_k* arrays, loaded as
 // 16-bit 16x16 cooperative matrices and multiplied into four 32-bit accumulators (sum0..sum3).
 // The accumulators are then converted to 16-bit, staged through tmp_v0/tmp_v1, passed through
 // the activation function and written to top_blob_data.
 void main()
 {
    // gx: first output position for this group of 32 invocations (x rounded down to a multiple of 32).
    // gy: first output channel index, counted in top_blob_data elements, 8 per invocation in y.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int outsize = psc(outw) * psc(outh);

    const int lx = int(gl_LocalInvocationID.x);

    // NOTE(review): the value ranges in the two comments below assume lx is in 0..31 - the
    // local size is not declared in this file; confirm.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    // 32-bit accumulators. sum0/sum1 take the weights from tmp_k0, sum2/sum3 those from tmp_k1;
    // sum0/sum2 take the input from tmp_v0, sum1/sum3 the input from tmp_v1.
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;

    if (bias_term == 1)
    {
        // Seed the accumulators with the bias instead of zero.
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;

        // The stride argument is 0, so all rows of a bias tile are loaded from the same position.
        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);

        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
    }
    else
    {
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    }

    // N is the total number of reduction steps: one per (input channel group of 4, kernel tap).
    const int maxk = kernel_w * kernel_h;
    const int N = psc(c) / 4 * maxk;

    // Main loop: consume UNROLL_INCH reduction steps per iteration.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        {
            // Stage phase: lxd16 picks which step of this iteration the invocation fills,
            // lxm16 picks the row of 4 uvec2 inside that slice, j the element within the row.
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                // Split the step index into channel group (sz) and kernel tap (kk -> ky, kx).
                const int sz = (z + lxd16) / maxk;
                const int kk = (z + lxd16) % maxk;

                const int ky = kk / kernel_w;
                const int kx = kk % kernel_w;

                // Two output positions, 16 apart: .r feeds tmp_v0, .g feeds tmp_v1.
                const ivec2 gx16 = gx + lxm16 + ivec2(0, 16);

                // Output row/column of each position, then scaled by the stride.
                const ivec2 sy16 = gx16 / psc(outw);
                const ivec2 sx16 = gx16 % psc(outw);

                const ivec2 sxs16 = sx16 * stride_w;
                const ivec2 sys16 = sy16 * stride_h;

                // Offset of the input element for this channel, kernel tap and output position.
                const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;

                // Positions at or beyond psc(outcstep) are staged as zero instead of being read.
                tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
                tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);

                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                // tmp_k1 takes the weights psc(c) * maxk * 16 elements after those of tmp_k0.
                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16];
            }
        }

        // All staging writes must be complete before any invocation loads the tiles.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load slice z4 of the input and weight tiles (offset z4*16*4, stride 4).
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        // Keep the next iteration's staging writes from overwriting tiles still being read.
        barrier();
    }

    // Tail: reduction steps left over when N is not a multiple of UNROLL_INCH.
    if (z < N)
    {
        const int remain = N - z;

        // With UNROLL_INCH == 2 exactly one step remains here, so only the lxd16 == 0
        // invocations stage data (into slice 0).
        if (lxd16 == 0)
        {
            // Same staging computation as in the main loop above.
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int sz = (z + lxd16) / maxk;
                const int kk = (z + lxd16) % maxk;

                const int ky = kk / kernel_w;
                const int kx = kk % kernel_w;

                const ivec2 gx16 = gx + lxm16 + ivec2(0, 16);

                const ivec2 sy16 = gx16 / psc(outw);
                const ivec2 sx16 = gx16 % psc(outw);

                const ivec2 sxs16 = sx16 * stride_w;
                const ivec2 sys16 = sy16 * stride_h;

                const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;

                tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
                tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);

                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16];
            }
        }

        barrier();

        // Multiply only the slices that were actually staged.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        barrier();
    }

    // Nothing to write when the whole tile lies outside the output.
    // The test uses only gx and gy, not the per-invocation lxd16/lxm16.
    if (gx >= outsize || gy >= psc(outc))
        return;

    // Narrow the 32-bit accumulators to 16-bit for the store.
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    // Reuse the input staging arrays for the results: sum0/sum1 go to slice 0 of
    // tmp_v0/tmp_v1, sum2/sum3 to slice 1 (offset 16*4).
    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);

    barrier();

    {
        // Each invocation writes up to 8 outputs: 4 values of j, each at gi and gi + 16.
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;

            uvec2 sum0_u2 = tmp_v0[tmp_vi];
            uvec2 sum1_u2 = tmp_v1[tmp_vi];

            // Unpack four half floats from each uvec2. These locals shadow the accumulator
            // matrices of the same name declared at the top of main().
            afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
            afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));

            sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
            sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);

            const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16);

            // Skip channels beyond psc(outc) and positions beyond psc(outcstep).
            if (gy + lxd16 * 4 + j < psc(outc))
            {
                if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0);
                if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1);
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp
@@ -1,328 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 #extension GL_GOOGLE_include_directive: enable
 #include "vulkan_activation.comp"

 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constants 0-9: kernel size, dilation, stride, bias flag and activation settings.
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
 layout (constant_id = 2) const int dilation_w = 1;
 layout (constant_id = 3) const int dilation_h = 1;
 layout (constant_id = 4) const int stride_w = 1;
 layout (constant_id = 5) const int stride_h = 1;
 // main() loads bias_data into the accumulators only when bias_term == 1.
 layout (constant_id = 6) const int bias_term = 0;
 layout (constant_id = 7) const int activation_type = 0;
 layout (constant_id = 8) const float activation_param_0 = 0;
 layout (constant_id = 9) const float activation_param_1 = 0;

 // Shape constants are numbered from id 10, directly after the ten parameters above.
 // They all default to 0 and main() reads them through psc(), which is not defined in this file.
 // NOTE(review): presumably psc() falls back to the push-constant block `p` below - confirm.
 #define shape_constant_id_offset 10
 // Input shape.
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 // Output shape.
 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
 layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
 layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

 // Storage buffers. Input, weights and bias are declared as raw uvec2 words;
 // the output is written as sfpvec4 (a type declared outside this file).
 // NOTE(review): each uvec2 presumably packs four fp16 values - confirm.
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
 layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };

 // Push-constant block carrying the same shape fields as the shape constants above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 // Number of reduction steps staged in shared memory per main-loop iteration of main().
 #define UNROLL_INCH 4

 // Shared staging tiles. main() fills tmp_v0/tmp_v1 (UNROLL_INCH slices of 16*2 uvec2) from
 // bottom_blob_data and tmp_k0..tmp_k3 (UNROLL_INCH slices of 8*2 uvec2) from weight_data.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
 shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k3[UNROLL_INCH * 8*2];

 void main()
 {
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int outsize = psc(outw) * psc(outh);

    const int lx = int(gl_LocalInvocationID.x);

    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;

    if (bias_term == 1)
    {
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;

        coopMatLoadNV(bias0, bias_data, gy, 0, false);
        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);

        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
    }
    else
    {
        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    }

    const int maxk = kernel_w * kernel_h;
    const int N = psc(c) / 2 * maxk;

    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                const int sz = (z + lxd8) / maxk;
                const int kk = (z + lxd8) % maxk;

                const int ky = kk / kernel_w;
                const int kx = kk % kernel_w;

                const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24);

                const ivec4 sy16 = gx16 / psc(outw);
                const ivec4 sx16 = gx16 % psc(outw);

                const ivec4 sxs16 = sx16 * stride_w;
                const ivec4 sys16 = sy16 * stride_h;

                const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;

                tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
                tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
                tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0);
                tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24];
            }
        }

        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        barrier();
    }

    if (z < N)
    {
        const int remain = N - z;

        if (lxd8 < remain)
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                const int sz = (z + lxd8) / maxk;
                const int kk = (z + lxd8) % maxk;

                const int ky = kk / kernel_w;
                const int kx = kk % kernel_w;

                const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24);

                const ivec4 sy16 = gx16 / psc(outw);
                const ivec4 sx16 = gx16 % psc(outw);

                const ivec4 sxs16 = sx16 * stride_w;
                const ivec4 sys16 = sy16 * stride_h;

                const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;

                tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
                tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
                tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0);
                tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24];
            }
        }

        barrier();

        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        barrier();
    }

    if (gx >= outsize || gy >= psc(outc))
        return;

    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);

    barrier();

    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    {
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
            const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16);

            if (gy + j * 2 + lxd16 < psc(outc))
            {
                if (gx + lxm16 < psc(outcstep))
                {
                    uvec2 sum0_u2 = tmp_v0[tmp_vi];
                    afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
                    sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
                    buffer_st4(top_blob_data, gi, sum0);
                }
                if (gx + lxm16 + 16 < psc(outcstep))
                {
                    uvec2 sum1_u2 = tmp_v1[tmp_vi];
                    afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
                    sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
                    buffer_st4(top_blob_data, gi + 16, sum1);
                }
            }
        }
    }
 }
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int maxk = 1;

@@ -62,10 +66,17 @@ void main()
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
 #endif

    const int N = psc(c) / 4;

@@ -93,6 +104,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -108,6 +120,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -139,6 +168,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -154,6 +184,23 @@ void main()
            sum1 = coopMatMulAdd(A1, B0, sum1);
            sum2 = coopMatMulAdd(A0, B1, sum2);
            sum3 = coopMatMulAdd(A1, B1, sum3);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
 #endif
        }

        barrier();
@@ -162,6 +209,7 @@ void main()
    if (gx >= psc(outw) || gy >= psc(outh))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -171,6 +219,17 @@ void main()
    coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
 #endif

    barrier();

--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
 #elif ncnn_VK_NV_cooperative_matrix
 #extension GL_NV_cooperative_matrix: require
 #endif

 layout (constant_id = 0) const int maxk = 1;

@@ -64,6 +68,7 @@ void main()
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -72,6 +77,16 @@ void main()
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
    coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
 #endif

    const int N = psc(c) / 2;

@@ -105,6 +120,7 @@ void main()

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -128,6 +144,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -165,6 +206,7 @@ void main()

        for (int z4 = 0; z4 < remain; z4++)
        {
 #if ncnn_VK_KHR_cooperative_matrix
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
            coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
            coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -188,6 +230,31 @@ void main()
            sum5 = coopMatMulAdd(A1, B2, sum5);
            sum6 = coopMatMulAdd(A0, B3, sum6);
            sum7 = coopMatMulAdd(A1, B3, sum7);
 #elif ncnn_VK_NV_cooperative_matrix
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
 #endif
        }

        barrier();
@@ -196,6 +263,7 @@ void main()
    if (gx >= psc(outw) || gy >= psc(outh))
        return;

 #if ncnn_VK_KHR_cooperative_matrix
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
    coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -213,6 +281,25 @@ void main()
    coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
    coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
 #elif ncnn_VK_NV_cooperative_matrix
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
 #endif

    barrier();

--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp
@@ -1,188 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Extensions required by this shader. GL_NV_cooperative_matrix supplies the
 // fcoopmatNV types and the coopMatLoadNV / coopMatMulAddNV / coopMatStoreNV
 // calls used in main(); the float16 arithmetic-types extension is needed for
 // the 16-bit matrix component type.
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constant 0. main() uses it when computing the output index
 // into col_blob_data.
 layout (constant_id = 0) const int maxk = 1;

 // Shape specialization constants; their ids start at shape_constant_id_offset
 // and every one of them defaults to 0.
 // NOTE(review): presumably a value of 0 means "not specialized, read the
 // push constant in p instead" (via psc(), which is not defined in this file) - confirm.
 #define shape_constant_id_offset 1
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

 // Storage buffers, all viewed as arrays of uvec2:
 //   binding 0 - input, read-only
 //   binding 1 - output, write-only
 //   binding 2 - weights, read-only
 // NOTE(review): each uvec2 presumably packs four fp16 values (main() loads
 // 16-column fp16 matrices with a row stride of 4 elements) - TODO confirm.
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };

 // Push constants carrying the same six shape fields as the specialization
 // constants declared above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
 } p;

 // Number of 16*4-element slices staged in the shared arrays per iteration of
 // the main loop in main().
 #define UNROLL_INCH 2

 // Shared staging arrays. main() fills tmp_v0/tmp_v1 from bottom_blob_data and
 // tmp_k0/tmp_k1 from weight_data, and later reuses tmp_v0/tmp_v1 as scratch
 // space for the fp16 results before they are written to col_blob_data.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
 shared uvec2 tmp_k1[UNROLL_INCH * 16*4];

 // Shader entry point: NV cooperative-matrix GEMM using 16x16 tiles.
 // Stages slices of bottom_blob_data and weight_data in shared memory,
 // accumulates their products in four fp32 16x16 accumulators, converts the
 // accumulators to fp16 and writes them to col_blob_data.
 // NOTE(review): psc() is not defined in this file; presumably psc(x) yields
 // the specialization constant x when it is non-zero and p.x otherwise - confirm.
 void main()
 {
    // gx is shared by each run of 32 consecutive invocations along x and
    // advances in steps of 2*16; gy advances by 2*4 per invocation along y.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int lx = int(gl_LocalInvocationID.x);

    // lxd16 picks the shared-memory slice this invocation fills, lxm16 the
    // position inside that slice.
    // NOTE(review): the value ranges in the trailing comments assume a local
    // size of 32 along x - confirm against the dispatch setup.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    // fp32 accumulators, zero-initialized. sum0/sum1 pair the A0/A1 input
    // tiles with weight tile B0, sum2/sum3 pair them with B1 (see the
    // multiply-add calls below).
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);

    // Number of slices to accumulate over.
    const int N = psc(c) / 4;

    // Main loop: consume UNROLL_INCH slices per iteration for as long as a
    // full group of UNROLL_INCH slices remains.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        {
            // Staging: each invocation copies 4 elements (j = 0..3) into each
            // of the four shared arrays; slice lxd16 starts at lxd16*16*4.
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16);

                // Positions at or beyond psc(outw) are staged as zeros instead
                // of being read from bottom_blob_data; tmp_v1 covers the
                // positions 16 further along than tmp_v0.
                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                // tmp_k1 is read psc(c) * 16 elements after tmp_k0.
                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
            }
        }

        // All staged values must be written before the cooperative loads
        // below read them.
        barrier();

        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // Load slice z4 (element offset z4*16*4, row stride 4 elements,
            // colMajor = false) of each shared array as an fp16 16x16 matrix.
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        // Loads must be finished before the next iteration (or the tail
        // below) overwrites the shared arrays.
        barrier();
    }

    // Tail: slices left over after the main loop. With UNROLL_INCH == 2 the
    // loop condition leaves exactly one slice here, so remain is 1.
    if (z < N)
    {
        const int remain = N - z;

        // Only the invocations with lxd16 == 0 stage data, so only the first
        // 16*4-element slice of each shared array is refreshed.
        if (lxd16 == 0)
        {
            for (int j = 0; j < 4; j++)
            {
                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;

                const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16);

                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);

                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);

                tmp_k0[tmp_i] = weight_data[w_offset];
                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
            }
        }

        barrier();

        // Same load / multiply-add sequence as the main loop, limited to the
        // slices that were actually staged.
        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
        }

        barrier();
    }

    // Nothing to write when gx or gy is out of range.
    // NOTE(review): a barrier() follows this early return, which assumes gx
    // and gy are the same for every invocation of a workgroup - confirm
    // against the local size used at dispatch.
    if (gx >= psc(outw) || gy >= psc(outh))
        return;

    // Convert the fp32 accumulators to fp16 for output.
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);

    // Reuse tmp_v0/tmp_v1 as scratch for the results: sum0/sum2 go to tmp_v0,
    // sum1/sum3 to tmp_v1, with the second matrix of each pair at element
    // offset 16*4.
    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);

    // The stored results must be visible before invocations read them back
    // element by element.
    barrier();

    {
        // Each invocation copies 4 elements from each scratch array to
        // col_blob_data.
        for (int j = 0; j < 4; j++)
        {
            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;

            // Output index built from the row (gy / 4 + lxd16), split into its
            // quotient and remainder by maxk, plus j and the position gx + lxm16.
            const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16);

            // Writes are skipped for positions at or beyond psc(outw); the
            // tmp_v1 values land 16 elements after the tmp_v0 ones.
            if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi];
            if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi];
        }
    }
 }
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp
@@ -1,232 +0,0 @@
 // Tencent is pleased to support the open source community by making ncnn available.
 //
 // Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
 //
 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 // in compliance with the License. You may obtain a copy of the License at
 //
 // https://opensource.org/licenses/BSD-3-Clause
 //
 // Unless required by applicable law or agreed to in writing, software distributed
 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 // specific language governing permissions and limitations under the License.

 #version 450

 // Extensions required by this shader: explicit 16-bit float arithmetic types
 // and the NV cooperative matrix types/built-ins (fcoopmatNV, coopMatLoadNV,
 // coopMatMulAddNV, coopMatStoreNV) used in main().
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
 #extension GL_NV_cooperative_matrix: require

 // Specialization constant 0. main() only uses it when computing the
 // col_blob_data store index.
 // NOTE(review): presumably kernel_w * kernel_h -- confirm against the host code.
 layout (constant_id = 0) const int maxk = 1;

 // Shape specialization constants (ids 1..6), all defaulting to 0.
 // main() reads c, cstep, outw and outh through psc(); w and h are declared
 // but not referenced by main() in this shader.
 // NOTE(review): psc() is not defined in this file; presumably it falls back to
 // the push constant of the same name when the constant is 0 -- confirm.
 #define shape_constant_id_offset 1
 layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
 layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
 layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
 layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

 layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
 layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

 // Storage buffers. Elements are uvec2 (64 bits); main() stages them in shared
 // uvec2 arrays that are then loaded as 16-bit float cooperative matrices, so
 // each element holds four fp16 values.
 //   binding 0: input, read-only
 //   binding 1: output, write-only
 //   binding 2: weights, read-only
 layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
 layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; };
 layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };

 // Push-constant block mirroring the six shape specialization constants above.
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;

    int outw;
    int outh;
 } p;

 // Number of reduction steps staged in shared memory per outer-loop iteration
 // of main().
 #define UNROLL_INCH 4

 // Shared staging arrays.
 // tmp_v0 / tmp_v1: UNROLL_INCH slices of 16 rows x 2 uvec2 (128 entries each)
 //   holding input tiles; main() also reuses them to stage the converted
 //   results before the final store to col_blob_data.
 // tmp_k0 .. tmp_k3: UNROLL_INCH slices of 8 rows x 2 uvec2 (64 entries each)
 //   holding weight tiles.
 shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
 shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
 shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
 shared uvec2 tmp_k3[UNROLL_INCH * 8*2];

 // Entry point. Accumulates eight 16x8 cooperative-matrix products over the
 // reduction dimension and stores the fp16 results to col_blob_data.
 //
 // For every group of up to UNROLL_INCH reduction steps the invocations stage
 // tiles of bottom_blob_data (tmp_v0/tmp_v1) and weight_data (tmp_k0..tmp_k3)
 // in shared memory; each staged slice is then loaded as A (16x8 fp16) and
 // B (8x8 fp16) matrices and multiplied into the fp32 accumulators sum0..sum7.
 //
 // NOTE(review): the index math below treats gl_LocalInvocationID.x as 0..31
 // (see the lx / 8 and lx / 16 comments); the local size declaration is not in
 // this file -- confirm against the host code.
 void main()
 {
    // gx: first output column of this tile; every 32 consecutive x invocations
    // share one gx, which advances by 2*16 = 32 columns per group.
    // gy: advances by 2*4 = 8 per y invocation.
    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;

    const int lx = int(gl_LocalInvocationID.x);

    // lxd8 selects the staged slice (reduction step within the unroll group),
    // lxm8 selects the row inside that slice.
    const int lxd8 = lx / 8; // 0 1 2 3
    const int lxm8 = lx % 8; // 0 1 2 3 .... 7

    // Eight 32-bit float 16x8 accumulators, zero-initialized.
    // Even indices pair with A0 (tmp_v0), odd indices with A1 (tmp_v1);
    // sum0/1 use B0, sum2/3 use B1, sum4/5 use B2, sum6/7 use B3.
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);

    // Number of reduction steps; each step consumes two uvec2 planes of the
    // input (see the "* 2 + j" in v_offset below).
    const int N = psc(c) / 2;

    // Main loop: full groups of UNROLL_INCH reduction steps.
    int z = 0;
    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
    {
        // Stage phase: each invocation writes its share of the input and
        // weight tiles for step (z + lxd8) into shared memory.
        {
            // j selects one of the two uvec2 columns of a staged row.
            for (int j = 0; j < 2; j++)
            {
                // Slice lxd8 (16*2 entries each), row lxm8, column j.
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                // Input plane ((z + lxd8) * 2 + j), column (gx + lxm8).
                int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8);

                // Four columns per invocation, 8 apart: rows 0-7 and 8-15 of
                // tmp_v0, then rows 0-7 and 8-15 of tmp_v1. Columns at or
                // beyond psc(outw) are zero-filled instead of being read.
                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0);

                // Slice lxd8 (8*2 entries each), row lxm8, column j.
                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                // The four weight tiles are psc(c) * 8 elements apart.
                // Weight reads are not bounds-checked here.
                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
            }
        }

        // All staged tiles must be written before any matrix load reads them.
        barrier();

        // Compute phase: one multiply-accumulate round per staged slice.
        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
        {
            // A tiles: 16x8 fp16, read from slice z4 with a row stride of
            // 2 uvec2, row-major (colMajor = false).
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            // B tiles: 8x8 fp16, one per weight staging array.
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        // Keep the next iteration's stores from overwriting tiles that are
        // still being read.
        barrier();
    }

    // Tail: fewer than UNROLL_INCH reduction steps remain.
    if (z < N)
    {
        const int remain = N - z;

        // Only the invocations whose slice index falls inside the remaining
        // steps stage data; slices >= remain keep stale contents and are not
        // read because the compute loop below stops at remain.
        if (lxd8 < remain)
        {
            for (int j = 0; j < 2; j++)
            {
                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;

                int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8);

                // Same staging pattern and zero-fill rule as in the main loop.
                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0);
                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0);

                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;

                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);

                tmp_k0[tmp_ki] = weight_data[w_offset];
                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
            }
        }

        // Executed by every invocation, including those that staged nothing.
        barrier();

        for (int z4 = 0; z4 < remain; z4++)
        {
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);

            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);

            // sum += v * k
            sum0 = coopMatMulAddNV(A0, B0, sum0);
            sum1 = coopMatMulAddNV(A1, B0, sum1);
            sum2 = coopMatMulAddNV(A0, B1, sum2);
            sum3 = coopMatMulAddNV(A1, B1, sum3);
            sum4 = coopMatMulAddNV(A0, B2, sum4);
            sum5 = coopMatMulAddNV(A1, B2, sum5);
            sum6 = coopMatMulAddNV(A0, B3, sum6);
            sum7 = coopMatMulAddNV(A1, B3, sum7);
        }

        // The staging arrays are reused for the results below; wait until all
        // matrix loads are done.
        barrier();
    }

    // Tiles that lie completely outside the output produce nothing.
    // NOTE(review): a barrier() follows this return, so gx and gy must be the
    // same for every invocation of a workgroup; that holds if the local size
    // is 32x1 -- confirm against the host code.
    if (gx >= psc(outw) || gy >= psc(outh))
        return;

    // Convert the fp32 accumulators to fp16 matrices of the same 16x8 shape.
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);

    // Park the results in the staging arrays: four 16x2-uvec2 tiles per array
    // at offsets 0, 32, 64 and 96, which fills all 128 entries. Even sums go
    // to tmp_v0, odd sums to tmp_v1.
    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);

    // Results must be fully stored before individual invocations read them.
    barrier();

    // Re-split the invocation index for the write-out: lxm16 picks the tile
    // row (output column), lxd16 picks the uvec2 column within that row.
    const int lxd16 = lx / 16; // 0 1
    const int lxm16 = lx % 16; // 0 1 2 3 .... 15

    {
        // j walks the four result tiles stored in each staging array.
        for (int j = 0; j < 4; j++)
        {
            // Tile j (16*2 entries each), row lxm16, uvec2 column lxd16.
            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
            // Destination index: (gy / 2 + j) is split by maxk into a
            // quotient (scaled by maxk * 2) and a remainder, lxd16 adds a
            // further maxk rows, and every row is psc(outw) elements wide.
            const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16);

            // tmp_v0 covers columns gx..gx+15, tmp_v1 covers gx+16..gx+31;
            // columns past psc(outw) are skipped.
            if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi];
            if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi];
        }
    }
 }