diff --git a/src/gpu.cpp b/src/gpu.cpp
index 6c14ac98c..6ba970031 100644
--- a/src/gpu.cpp
+++ b/src/gpu.cpp
@@ -376,14 +376,16 @@ public:
     VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT queryShaderAtomicFloat2Features;
 
     // extension properties
-    void* queryDeviceProperties;
+    void* queryExtensionProperties;
     VkPhysicalDeviceFloatControlsPropertiesKHR queryFloatControlsProperties;
     VkPhysicalDeviceShaderIntegerDotProductProperties queryShaderIntegerDotProductProperties;
     VkPhysicalDeviceSubgroupProperties querySubgroupProperties;
     VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties;
     VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties;
-    std::vector<VkCooperativeMatrixPropertiesKHR> queryCooperativeMatrixProperties;
-    std::vector<VkCooperativeMatrixPropertiesNV> queryCooperativeMatrixPropertiesNV;
+
+    // extension sub properties (lists enumerated by separate queries, not linked into the pNext chain)
+    std::vector<VkCooperativeMatrixPropertiesKHR> queryCooperativeMatrixSubProperties;
+    std::vector<VkCooperativeMatrixPropertiesNV> queryCooperativeMatrixSubPropertiesNV;
 };
 
 void GpuInfoPrivate::query_features()
@@ -855,17 +857,19 @@ void GpuInfoPrivate::query_extension_features()
 
     // query cooperative_matrix
     memset(&queryCooperativeMatrixFeatures, 0, sizeof(queryCooperativeMatrixFeatures));
-    memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV));
     queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR;
     queryCooperativeMatrixFeatures.pNext = 0;
-    queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV;
-    queryCooperativeMatrixFeaturesNV.pNext = 0;
     if (support_VK_KHR_cooperative_matrix)
     {
         queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures;
         queryExtensionFeatures = &queryCooperativeMatrixFeatures;
     }
-    else if (support_VK_NV_cooperative_matrix)
+
+    // query cooperative_matrix (NV), chained independently of the KHR query above
+    memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV));
+    queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV;
+    queryCooperativeMatrixFeaturesNV.pNext = 0;
+    if (support_VK_NV_cooperative_matrix)
     {
         queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures;
         queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV;
@@ -978,7 +982,7 @@ void GpuInfoPrivate::query_extension_features()
 
 void GpuInfoPrivate::query_extension_properties()
 {
-    queryDeviceProperties = 0;
+    queryExtensionProperties = 0;
 
     // query float controls
     memset(&queryFloatControlsProperties, 0, sizeof(queryFloatControlsProperties));
@@ -986,8 +990,8 @@ void GpuInfoPrivate::query_extension_properties()
     queryFloatControlsProperties.pNext = 0;
     if (support_VK_KHR_shader_float_controls)
     {
-        queryFloatControlsProperties.pNext = queryDeviceProperties;
-        queryDeviceProperties = &queryFloatControlsProperties;
+        queryFloatControlsProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &queryFloatControlsProperties;
     }
 
     // query integer dot product
@@ -996,8 +1000,8 @@ void GpuInfoPrivate::query_extension_properties()
     queryShaderIntegerDotProductProperties.pNext = 0;
-    if (support_VK_KHR_driver_properties)
+    if (support_VK_KHR_shader_integer_dot_product) // guard must match the struct being chained
     {
-        queryShaderIntegerDotProductProperties.pNext = queryDeviceProperties;
-        queryDeviceProperties = &queryShaderIntegerDotProductProperties;
+        queryShaderIntegerDotProductProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &queryShaderIntegerDotProductProperties;
     }
 
     // query subgroup
@@ -1006,8 +1010,8 @@ void GpuInfoPrivate::query_extension_properties()
     querySubgroupProperties.pNext = 0;
     if (VK_VERSION_MAJOR(g_instance.instance_api_version) >= 1 && VK_VERSION_MINOR(g_instance.instance_api_version) >= 1)
     {
-        querySubgroupProperties.pNext = queryDeviceProperties;
-        queryDeviceProperties = &querySubgroupProperties;
+        querySubgroupProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &querySubgroupProperties;
     }
     else
     {
@@ -1032,8 +1036,8 @@ void GpuInfoPrivate::query_extension_properties()
     queryDriverProperties.pNext = 0;
     if (support_VK_KHR_driver_properties)
     {
-        queryDriverProperties.pNext = queryDeviceProperties;
-        queryDeviceProperties = &queryDriverProperties;
+        queryDriverProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &queryDriverProperties;
     }
 
     // query subgroup size control
@@ -1042,15 +1046,15 @@ void GpuInfoPrivate::query_extension_properties()
     querySubgroupSizeControlProperties.pNext = 0;
     if (support_VK_EXT_subgroup_size_control)
     {
-        querySubgroupSizeControlProperties.pNext = queryDeviceProperties;
-        queryDeviceProperties = &querySubgroupSizeControlProperties;
+        querySubgroupSizeControlProperties.pNext = queryExtensionProperties;
+        queryExtensionProperties = &querySubgroupSizeControlProperties;
     }
 
     if (support_VK_KHR_get_physical_device_properties2)
     {
         VkPhysicalDeviceProperties2KHR queryProperties;
         queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR;
-        queryProperties.pNext = queryDeviceProperties;
+        queryProperties.pNext = queryExtensionProperties;
 
         vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties);
 
@@ -1072,8 +1076,8 @@ void GpuInfoPrivate::query_extension_properties()
     }
 
     // query supported cooperative matrix types and operations
-    queryCooperativeMatrixProperties.clear();
-    queryCooperativeMatrixPropertiesNV.clear();
+    queryCooperativeMatrixSubProperties.clear();
+    queryCooperativeMatrixSubPropertiesNV.clear();
     support_cooperative_matrix_8_8_16 = false;
     support_cooperative_matrix_16_8_8 = false;
     support_cooperative_matrix_16_8_16 = false;
@@ -1087,14 +1091,14 @@ void GpuInfoPrivate::query_extension_properties()
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
         }
 
-        queryCooperativeMatrixProperties.resize(propertyCount);
+        queryCooperativeMatrixSubProperties.resize(propertyCount);
         for (uint32_t j = 0; j < propertyCount; j++)
         {
-            memset(&queryCooperativeMatrixProperties[j], 0, sizeof(queryCooperativeMatrixProperties[j]));
-            queryCooperativeMatrixProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
-            queryCooperativeMatrixProperties[j].pNext = 0;
+            memset(&queryCooperativeMatrixSubProperties[j], 0, sizeof(queryCooperativeMatrixSubProperties[j]));
+            queryCooperativeMatrixSubProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR;
+            queryCooperativeMatrixSubProperties[j].pNext = 0;
         }
-        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixProperties.data());
+        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixSubProperties.data());
         if (ret != VK_SUCCESS)
         {
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret);
@@ -1102,7 +1106,7 @@ void GpuInfoPrivate::query_extension_properties()
 
         for (uint32_t j = 0; j < propertyCount; j++)
         {
-            const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixProperties[j];
+            const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixSubProperties[j];
             // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope);
 
             if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
@@ -1144,14 +1148,14 @@ void GpuInfoPrivate::query_extension_properties()
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
         }
 
-        queryCooperativeMatrixPropertiesNV.resize(propertyCount);
+        queryCooperativeMatrixSubPropertiesNV.resize(propertyCount);
         for (uint32_t j = 0; j < propertyCount; j++)
         {
-            memset(&queryCooperativeMatrixPropertiesNV[j], 0, sizeof(queryCooperativeMatrixPropertiesNV[j]));
-            queryCooperativeMatrixPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
-            queryCooperativeMatrixPropertiesNV[j].pNext = 0;
+            memset(&queryCooperativeMatrixSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixSubPropertiesNV[j]));
+            queryCooperativeMatrixSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV;
+            queryCooperativeMatrixSubPropertiesNV[j].pNext = 0;
         }
-        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixPropertiesNV.data());
+        ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixSubPropertiesNV.data());
         if (ret != VK_SUCCESS)
         {
             NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret);
@@ -1159,7 +1163,7 @@ void GpuInfoPrivate::query_extension_properties()
 
         for (uint32_t j = 0; j < propertyCount; j++)
         {
-            const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixPropertiesNV[j];
+            const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixSubPropertiesNV[j];
             // NCNN_LOGE("cpm %2d %2d %2d  %d %d %d %d  %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope);
 
             if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16
@@ -1837,9 +1841,9 @@ const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& GpuInfo::queryShaderAtomicF
     return d->queryShaderAtomicFloat2Features;
 }
 
-const void* GpuInfo::queryDeviceProperties() const
+const void* GpuInfo::queryExtensionProperties() const
 {
-    return d->queryDeviceProperties;
+    return d->queryExtensionProperties;
 }
 
 const VkPhysicalDeviceShaderIntegerDotProductProperties& GpuInfo::queryShaderIntegerDotProductProperties() const
@@ -1862,14 +1866,14 @@ const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSi
     return d->querySubgroupSizeControlProperties;
 }
 
-const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixProperties() const
+const std::vector<VkCooperativeMatrixPropertiesKHR>& GpuInfo::queryCooperativeMatrixSubProperties() const
 {
-    return d->queryCooperativeMatrixProperties;
+    return d->queryCooperativeMatrixSubProperties;
 }
 
-const std::vector<VkCooperativeMatrixPropertiesNV>& GpuInfo::queryCooperativeMatrixPropertiesNV() const
+const std::vector<VkCooperativeMatrixPropertiesNV>& GpuInfo::queryCooperativeMatrixSubPropertiesNV() const
 {
-    return d->queryCooperativeMatrixPropertiesNV;
+    return d->queryCooperativeMatrixSubPropertiesNV;
 }
 
 static int init_instance_core()
diff --git a/src/gpu.h b/src/gpu.h
index a0cecfd5d..da9c5573c 100644
--- a/src/gpu.h
+++ b/src/gpu.h
@@ -365,14 +365,16 @@ public:
     const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& queryShaderAtomicFloat2Features() const;
 
     // extension properties
-    const void* queryDeviceProperties() const;
+    const void* queryExtensionProperties() const;
     const VkPhysicalDeviceFloatControlsPropertiesKHR& queryFloatControlsProperties() const;
     const VkPhysicalDeviceShaderIntegerDotProductProperties& queryShaderIntegerDotProductProperties() const;
     const VkPhysicalDeviceSubgroupProperties& querySubgroupProperties() const;
     const VkPhysicalDeviceDriverPropertiesKHR& queryDriverProperties() const;
     const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& querySubgroupSizeControlProperties() const;
-    const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixProperties() const;
-    const std::vector<VkCooperativeMatrixPropertiesNV>& queryCooperativeMatrixPropertiesNV() const;
+
+    // extension sub properties (lists enumerated per device by the cooperative matrix extensions)
+    const std::vector<VkCooperativeMatrixPropertiesKHR>& queryCooperativeMatrixSubProperties() const;
+    const std::vector<VkCooperativeMatrixPropertiesNV>& queryCooperativeMatrixSubPropertiesNV() const;
 
 private:
     GpuInfo(const GpuInfo&);
diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp
index 933ff6d73..a659c34be 100644
--- a/src/layer/vulkan/convolution_vulkan.cpp
+++ b/src/layer/vulkan/convolution_vulkan.cpp
@@ -416,17 +416,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
 
                 if (use_cooperative_matrix_16_8_8)
                 {
-                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8;
-                    else
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8;
+                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8;
                 }
                 else if (use_cooperative_matrix_16_16_16)
                 {
-                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16;
-                    else
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16;
+                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16;
                 }
 
                 pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev);
@@ -696,17 +690,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
 
                 if (use_cooperative_matrix_16_8_8)
                 {
-                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8;
-                    else
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8;
+                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8;
                 }
                 else if (use_cooperative_matrix_16_16_16)
                 {
-                    if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16;
-                    else
-                        shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16;
+                    shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16;
                 }
 
                 pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev);
@@ -1028,17 +1016,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
 
         if (use_cooperative_matrix_16_8_8)
         {
-            if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_8_8;
-            else
-                shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_8_8;
+            shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_8_8;
         }
         else if (use_cooperative_matrix_16_16_16)
         {
-            if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_16_16;
-            else
-                shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_16_16;
+            shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_16_16;
         }
 
         pipeline_convolution_gemm = new Pipeline(vkdev);
@@ -1099,17 +1081,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
 
         if (use_cooperative_matrix_16_8_8)
         {
-            if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_8_8;
-            else
-                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_8_8;
+            shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_8_8;
         }
         else if (use_cooperative_matrix_16_16_16)
         {
-            if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_16_16;
-            else
-                shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_16_16;
+            shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_16_16;
         }
 
         pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp
index 7cca37aae..5f953efe8 100644
--- a/src/layer/vulkan/deconvolution_vulkan.cpp
+++ b/src/layer/vulkan/deconvolution_vulkan.cpp
@@ -301,17 +301,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
 
             if (use_cooperative_matrix_16_8_8)
             {
-                if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_8_8;
-                else
-                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_8_8;
+                shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_8_8;
             }
             else if (use_cooperative_matrix_16_16_16)
             {
-                if (vkdev->info.support_VK_KHR_cooperative_matrix())
-                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_16_16;
-                else
-                    shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_16_16;
+                shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_16_16;
             }
 
             pipeline_deconvolution_gemm = new Pipeline(vkdev);
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp
similarity index 69%
rename from src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
rename to src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp
index 398bf7fde..ee9f69817 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
@@ -73,18 +77,40 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3;
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
+#else
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
+#endif
+#endif
 
     if (bias_term == 1)
     {
+#if ncnn_VK_KHR_cooperative_matrix
         coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> bias0;
         coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> bias1;
 
         coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;
+
+        coopMatLoadNV(bias0, bias_data, gy, 0, false);
+        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
+#endif
 
 #if NCNN_fp16_arithmetic
         sum0 = bias0;
@@ -92,18 +118,39 @@ void main()
         sum2 = bias1;
         sum3 = bias1;
 #else
+#if ncnn_VK_KHR_cooperative_matrix
         sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
         sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias0);
         sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
         sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(bias1);
+#elif ncnn_VK_NV_cooperative_matrix
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+#endif
 #endif
     }
     else
     {
+#if ncnn_VK_KHR_cooperative_matrix
         sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
         sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
         sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
         sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
+#endif
     }
 
     const int N = psc(c) / 4;
@@ -132,6 +179,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -147,6 +195,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -178,6 +243,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -193,6 +259,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -201,6 +284,7 @@ void main()
     if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
     coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -216,6 +300,24 @@ void main()
     coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
 #endif
 
     barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp
similarity index 66%
rename from src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
rename to src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp
index 5d79ddb2a..c36ba392f 100644
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int bias_term = 0;
 layout (constant_id = 1) const int activation_type = 0;
@@ -75,6 +79,7 @@ void main()
     const int lxd8 = lx / 8; // 0 1 2 3
     const int lxm8 = lx % 8; // 0 1 2 3 .... 7
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
@@ -83,9 +88,31 @@ void main()
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5;
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6;
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7;
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
+#else
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
+#endif
+#endif
 
     if (bias_term == 1)
     {
+#if ncnn_VK_KHR_cooperative_matrix
         coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias0;
         coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias1;
         coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> bias2;
@@ -95,6 +122,17 @@ void main()
         coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor);
         coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;
+
+        coopMatLoadNV(bias0, bias_data, gy, 0, false);
+        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
+        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
+        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
+#endif
 
 #if NCNN_fp16_arithmetic
         sum0 = bias0;
@@ -106,6 +144,7 @@ void main()
         sum6 = bias3;
         sum7 = bias3;
 #else
+#if ncnn_VK_KHR_cooperative_matrix
         sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
         sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias0);
         sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias1);
@@ -114,10 +153,21 @@ void main()
         sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias2);
         sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
         sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(bias3);
+#elif ncnn_VK_NV_cooperative_matrix
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
+        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
+        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
+        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+#endif
 #endif
     }
     else
     {
+#if ncnn_VK_KHR_cooperative_matrix
         sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
         sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
         sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -126,6 +176,27 @@ void main()
         sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
         sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
         sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
+#endif
     }
 
     const int N = psc(c) / 2;
@@ -160,6 +231,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -183,6 +255,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -220,6 +317,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -243,6 +341,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -251,6 +374,7 @@ void main()
     if (gx >= psc(outcstep) || gy >= psc(outc))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
     coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -278,6 +402,36 @@ void main()
     coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
 #endif
 
     barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
deleted file mode 100644
index cc3a00a3f..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp
+++ /dev/null
@@ -1,260 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_GOOGLE_include_directive: enable
-#include "vulkan_activation.comp"
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int bias_term = 0;
-layout (constant_id = 1) const int activation_type = 0;
-layout (constant_id = 2) const float activation_param_0 = 0;
-layout (constant_id = 3) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 4
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 2
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k1[UNROLL_INCH * 16*4];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-#if NCNN_fp16_arithmetic
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3;
-#else
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
-#endif
-
-    if (bias_term == 1)
-    {
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;
-
-        coopMatLoadNV(bias0, bias_data, gy, 0, false);
-        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
-
-#if NCNN_fp16_arithmetic
-        sum0 = bias0;
-        sum1 = bias0;
-        sum2 = bias1;
-        sum3 = bias1;
-#else
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
-#endif
-    }
-    else
-    {
-#if NCNN_fp16_arithmetic
-        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-#else
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-#endif
-    }
-
-    const int N = psc(c) / 4;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd16 == 0)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outcstep) || gy >= psc(outc))
-        return;
-
-#if NCNN_fp16_arithmetic
-    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
-#else
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
-#endif
-
-    barrier();
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;
-
-            uvec2 sum0_u2 = tmp_v0[tmp_vi];
-            uvec2 sum1_u2 = tmp_v1[tmp_vi];
-
-            afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
-            afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
-
-            sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
-            sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
-
-            const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16);
-
-            if (gy + lxd16 * 4 + j < psc(outc))
-            {
-                if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0);
-                if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1);
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
deleted file mode 100644
index c0494e58a..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp
+++ /dev/null
@@ -1,335 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_GOOGLE_include_directive: enable
-#include "vulkan_activation.comp"
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int bias_term = 0;
-layout (constant_id = 1) const int activation_type = 0;
-layout (constant_id = 2) const float activation_param_0 = 0;
-layout (constant_id = 3) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 4
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 4
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
-shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k3[UNROLL_INCH * 8*2];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd8 = lx / 8; // 0 1 2 3
-    const int lxm8 = lx % 8; // 0 1 2 3 .... 7
-
-#if NCNN_fp16_arithmetic
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6;
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7;
-#else
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
-#endif
-
-    if (bias_term == 1)
-    {
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;
-
-        coopMatLoadNV(bias0, bias_data, gy, 0, false);
-        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
-        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
-        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
-
-#if NCNN_fp16_arithmetic
-        sum0 = bias0;
-        sum1 = bias0;
-        sum2 = bias1;
-        sum3 = bias1;
-        sum4 = bias2;
-        sum5 = bias2;
-        sum6 = bias3;
-        sum7 = bias3;
-#else
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
-        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
-        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
-        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
-        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
-#endif
-    }
-    else
-    {
-#if NCNN_fp16_arithmetic
-        sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-#else
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-#endif
-    }
-
-    const int N = psc(c) / 2;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd8 < remain)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outcstep) || gy >= psc(outc))
-        return;
-
-#if NCNN_fp16_arithmetic
-    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
-#else
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
-#endif
-
-    barrier();
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
-            const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16);
-
-            if (gy + j * 2 + lxd16 < psc(outc))
-            {
-                if (gx + lxm16 < psc(outcstep))
-                {
-                    uvec2 sum0_u2 = tmp_v0[tmp_vi];
-                    afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
-                    sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
-                    buffer_st4(top_blob_data, gi, sum0);
-                }
-                if (gx + lxm16 + 16 < psc(outcstep))
-                {
-                    uvec2 sum1_u2 = tmp_v1[tmp_vi];
-                    afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
-                    sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
-                    buffer_st4(top_blob_data, gi + 16, sum1);
-                }
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp
similarity index 70%
rename from src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
rename to src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp
index 598e63601..d8dcbb0b1 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int batch = 1;
 
@@ -61,10 +65,24 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<afp, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
+#else
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
+#endif
 
     const int N = psc(c) / 4;
 
@@ -92,6 +110,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -107,6 +126,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -138,6 +174,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -153,6 +190,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -161,6 +215,7 @@ void main()
     if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
     coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -176,6 +231,24 @@ void main()
     coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
+#else
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
 #endif
 
     barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp
similarity index 67%
rename from src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
rename to src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp
index 7826f1ac0..de95f5307 100644
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int batch = 1;
 
@@ -63,6 +67,7 @@ void main()
     const int lxd8 = lx / 8; // 0 1 2 3
     const int lxm8 = lx % 8; // 0 1 2 3 .... 7
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -71,6 +76,27 @@ void main()
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<afp, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
+#else
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
+#endif
 
     const int N = psc(c) / 2;
 
@@ -104,6 +130,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -127,6 +154,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -164,6 +216,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -187,6 +240,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -195,6 +273,7 @@ void main()
     if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
 #if NCNN_fp16_arithmetic
     coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -222,6 +301,36 @@ void main()
     coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#endif
+#elif ncnn_VK_NV_cooperative_matrix
+#if NCNN_fp16_arithmetic
+    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
+#else
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
 #endif
 
     barrier();
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
deleted file mode 100644
index 0182b7960..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp
+++ /dev/null
@@ -1,203 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int batch = 1;
-
-#define shape_constant_id_offset 1
-layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; };
-layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; };
-layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int c;
-    int cstep;
-
-    int outw;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 2
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k1[UNROLL_INCH * 16*4];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-    int gz = int(gl_GlobalInvocationID.z);
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-#if NCNN_fp16_arithmetic
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f);
-#else
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-#endif
-
-    const int N = psc(c) / 4;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_tm_data[w_offset];
-                tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd16 == 0)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_tm_data[w_offset];
-                tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
-        return;
-
-#if NCNN_fp16_arithmetic
-    coopMatStoreNV(sum0, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false);
-#else
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
-#endif
-
-    barrier();
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;
-            const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16);
-
-            if (gy + lxd16 * 4 + j < psc(outc))
-            {
-                if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi];
-                if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi];
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
deleted file mode 100644
index 078c6a52c..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp
+++ /dev/null
@@ -1,256 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int batch = 1;
-
-#define shape_constant_id_offset 1
-layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; };
-layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; };
-layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int c;
-    int cstep;
-
-    int outw;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 4
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
-shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k3[UNROLL_INCH * 8*2];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-    int gz = int(gl_GlobalInvocationID.z);
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd8 = lx / 8; // 0 1 2 3
-    const int lxm8 = lx % 8; // 0 1 2 3 .... 7
-
-#if NCNN_fp16_arithmetic
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f);
-#else
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-#endif
-
-    const int N = psc(c) / 2;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_tm_data[w_offset];
-                tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd8 < remain)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_tm_data[w_offset];
-                tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
-        return;
-
-#if NCNN_fp16_arithmetic
-    coopMatStoreNV(sum0, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false);
-#else
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
-#endif
-
-    barrier();
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
-            const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16);
-
-            if (gy + j * 2 + lxd16 < psc(outc))
-            {
-                if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi];
-                if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi];
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp
similarity index 76%
rename from src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp
rename to src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp
index 804321bff..c3f87ffec 100644
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
@@ -81,6 +85,7 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0;
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1;
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2;
@@ -106,6 +111,33 @@ void main()
         sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
         sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     }
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
+
+    if (bias_term == 1)
+    {
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;
+
+        coopMatLoadNV(bias0, bias_data, gy, 0, false);
+        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
+
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
+    }
+    else
+    {
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    }
+#endif
 
     const int maxk = kernel_w * kernel_h;
     const int N = psc(c) / 4 * maxk;
@@ -148,6 +180,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -163,6 +196,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -208,6 +258,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -223,6 +274,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -231,6 +299,7 @@ void main()
     if (gx >= outsize || gy >= psc(outc))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -240,6 +309,17 @@ void main()
     coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
 
     barrier();
 
diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp
similarity index 72%
rename from src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp
rename to src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp
index 3d2f7d1e5..b8f6e3f17 100644
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp
@@ -20,7 +20,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int kernel_w = 1;
 layout (constant_id = 1) const int kernel_h = 1;
@@ -83,6 +87,7 @@ void main()
     const int lxd8 = lx / 8; // 0 1 2 3
     const int lxm8 = lx % 8; // 0 1 2 3 .... 7
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0;
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1;
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2;
@@ -124,6 +129,49 @@ void main()
         sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
         sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     }
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
+
+    if (bias_term == 1)
+    {
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
+        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;
+
+        coopMatLoadNV(bias0, bias_data, gy, 0, false);
+        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
+        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
+        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
+
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
+        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
+        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
+        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
+    }
+    else
+    {
+        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    }
+#endif
 
     const int maxk = kernel_w * kernel_h;
     const int N = psc(c) / 2 * maxk;
@@ -172,6 +220,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -195,6 +244,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -246,6 +320,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -269,6 +344,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -277,6 +377,7 @@ void main()
     if (gx >= outsize || gy >= psc(outc))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -294,6 +395,25 @@ void main()
     coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
 
     barrier();
 
diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp
deleted file mode 100644
index 51bdb91f4..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp
+++ /dev/null
@@ -1,269 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_GOOGLE_include_directive: enable
-#include "vulkan_activation.comp"
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 2
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k1[UNROLL_INCH * 16*4];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int outsize = psc(outw) * psc(outh);
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3;
-
-    if (bias_term == 1)
-    {
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1;
-
-        coopMatLoadNV(bias0, bias_data, gy, 0, false);
-        coopMatLoadNV(bias1, bias_data, gy + 4, 0, false);
-
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1);
-    }
-    else
-    {
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    }
-
-    const int maxk = kernel_w * kernel_h;
-    const int N = psc(c) / 4 * maxk;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int sz = (z + lxd16) / maxk;
-                const int kk = (z + lxd16) % maxk;
-
-                const int ky = kk / kernel_w;
-                const int kx = kk % kernel_w;
-
-                const ivec2 gx16 = gx + lxm16 + ivec2(0, 16);
-
-                const ivec2 sy16 = gx16 / psc(outw);
-                const ivec2 sx16 = gx16 % psc(outw);
-
-                const ivec2 sxs16 = sx16 * stride_w;
-                const ivec2 sys16 = sy16 * stride_h;
-
-                const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;
-
-                tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
-                tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
-
-                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd16 == 0)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int sz = (z + lxd16) / maxk;
-                const int kk = (z + lxd16) % maxk;
-
-                const int ky = kk / kernel_w;
-                const int kx = kk % kernel_w;
-
-                const ivec2 gx16 = gx + lxm16 + ivec2(0, 16);
-
-                const ivec2 sy16 = gx16 / psc(outw);
-                const ivec2 sx16 = gx16 % psc(outw);
-
-                const ivec2 sxs16 = sx16 * stride_w;
-                const ivec2 sys16 = sy16 * stride_h;
-
-                const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;
-
-                tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
-                tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
-
-                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (gx >= outsize || gy >= psc(outc))
-        return;
-
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
-
-    barrier();
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;
-
-            uvec2 sum0_u2 = tmp_v0[tmp_vi];
-            uvec2 sum1_u2 = tmp_v1[tmp_vi];
-
-            afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
-            afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
-
-            sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
-            sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
-
-            const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16);
-
-            if (gy + lxd16 * 4 + j < psc(outc))
-            {
-                if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0);
-                if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1);
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp
deleted file mode 100644
index 786c2ebfb..000000000
--- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp
+++ /dev/null
@@ -1,328 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_GOOGLE_include_directive: enable
-#include "vulkan_activation.comp"
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int kernel_w = 1;
-layout (constant_id = 1) const int kernel_h = 1;
-layout (constant_id = 2) const int dilation_w = 1;
-layout (constant_id = 3) const int dilation_h = 1;
-layout (constant_id = 4) const int stride_w = 1;
-layout (constant_id = 5) const int stride_h = 1;
-layout (constant_id = 6) const int bias_term = 0;
-layout (constant_id = 7) const int activation_type = 0;
-layout (constant_id = 8) const float activation_param_0 = 0;
-layout (constant_id = 9) const float activation_param_1 = 0;
-
-#define shape_constant_id_offset 10
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
-layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-    int outc;
-    int outcstep;
-} p;
-
-#define UNROLL_INCH 4
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
-shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k3[UNROLL_INCH * 8*2];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int outsize = psc(outw) * psc(outh);
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd8 = lx / 8; // 0 1 2 3
-    const int lxm8 = lx % 8; // 0 1 2 3 .... 7
-
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6;
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7;
-
-    if (bias_term == 1)
-    {
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2;
-        fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3;
-
-        coopMatLoadNV(bias0, bias_data, gy, 0, false);
-        coopMatLoadNV(bias1, bias_data, gy + 2, 0, false);
-        coopMatLoadNV(bias2, bias_data, gy + 4, 0, false);
-        coopMatLoadNV(bias3, bias_data, gy + 6, 0, false);
-
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1);
-        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
-        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2);
-        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
-        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3);
-    }
-    else
-    {
-        sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-        sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    }
-
-    const int maxk = kernel_w * kernel_h;
-    const int N = psc(c) / 2 * maxk;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                const int sz = (z + lxd8) / maxk;
-                const int kk = (z + lxd8) % maxk;
-
-                const int ky = kk / kernel_w;
-                const int kx = kk % kernel_w;
-
-                const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24);
-
-                const ivec4 sy16 = gx16 / psc(outw);
-                const ivec4 sx16 = gx16 % psc(outw);
-
-                const ivec4 sxs16 = sx16 * stride_w;
-                const ivec4 sys16 = sy16 * stride_h;
-
-                const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;
-
-                tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
-                tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd8 < remain)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                const int sz = (z + lxd8) / maxk;
-                const int kk = (z + lxd8) % maxk;
-
-                const int ky = kk / kernel_w;
-                const int kx = kk % kernel_w;
-
-                const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24);
-
-                const ivec4 sy16 = gx16 / psc(outw);
-                const ivec4 sx16 = gx16 % psc(outw);
-
-                const ivec4 sxs16 = sx16 * stride_w;
-                const ivec4 sys16 = sy16 * stride_h;
-
-                const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w;
-
-                tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0);
-                tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (gx >= outsize || gy >= psc(outc))
-        return;
-
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
-
-    barrier();
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
-            const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16);
-
-            if (gy + j * 2 + lxd16 < psc(outc))
-            {
-                if (gx + lxm16 < psc(outcstep))
-                {
-                    uvec2 sum0_u2 = tmp_v0[tmp_vi];
-                    afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y));
-                    sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
-                    buffer_st4(top_blob_data, gi, sum0);
-                }
-                if (gx + lxm16 + 16 < psc(outcstep))
-                {
-                    uvec2 sum1_u2 = tmp_v1[tmp_vi];
-                    afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y));
-                    sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
-                    buffer_st4(top_blob_data, gi + 16, sum1);
-                }
-            }
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp
similarity index 73%
rename from src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp
rename to src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp
index c5047220c..487adc8e1 100644
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int maxk = 1;
 
@@ -62,10 +66,17 @@ void main()
     const int lxd16 = lx / 16; // 0 1
     const int lxm16 = lx % 16; // 0 1 2 3 .... 15
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum3 = coopmat<float, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
+#endif
 
     const int N = psc(c) / 4;
 
@@ -93,6 +104,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -108,6 +120,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -139,6 +168,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
@@ -154,6 +184,23 @@ void main()
             sum1 = coopMatMulAdd(A1, B0, sum1);
             sum2 = coopMatMulAdd(A0, B1, sum2);
             sum3 = coopMatMulAdd(A1, B1, sum3);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
+            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
+            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+#endif
         }
 
         barrier();
@@ -162,6 +209,7 @@ void main()
     if (gx >= psc(outw) || gy >= psc(outh))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseAccumulator>(sum2);
@@ -171,6 +219,17 @@ void main()
     coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
+#endif
 
     barrier();
 
diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp
similarity index 70%
rename from src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp
rename to src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp
index 3c444ee9f..6e06d3c83 100644
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp
+++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp
@@ -17,7 +17,11 @@
 #extension GL_KHR_memory_scope_semantics: require
 #extension GL_EXT_shader_explicit_arithmetic_types: require
 #extension GL_EXT_shader_explicit_arithmetic_types_float16: require
+#if ncnn_VK_KHR_cooperative_matrix
 #extension GL_KHR_cooperative_matrix: require
+#elif ncnn_VK_NV_cooperative_matrix
+#extension GL_NV_cooperative_matrix: require
+#endif
 
 layout (constant_id = 0) const int maxk = 1;
 
@@ -64,6 +68,7 @@ void main()
     const int lxd8 = lx / 8; // 0 1 2 3
     const int lxm8 = lx % 8; // 0 1 2 3 .... 7
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
@@ -72,6 +77,16 @@ void main()
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum5 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum6 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
     coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum7 = coopmat<float, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(0.f);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
+#endif
 
     const int N = psc(c) / 2;
 
@@ -105,6 +120,7 @@ void main()
 
         for (int z4 = 0; z4 < UNROLL_INCH; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -128,6 +144,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -165,6 +206,7 @@ void main()
 
         for (int z4 = 0; z4 < remain; z4++)
         {
+#if ncnn_VK_KHR_cooperative_matrix
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A0;
             coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseA> A1;
             coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor);
@@ -188,6 +230,31 @@ void main()
             sum5 = coopMatMulAdd(A1, B2, sum5);
             sum6 = coopMatMulAdd(A0, B3, sum6);
             sum7 = coopMatMulAdd(A1, B3, sum7);
+#elif ncnn_VK_NV_cooperative_matrix
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
+            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
+            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
+
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
+            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
+            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
+            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
+            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
+            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
+
+            // sum += v * k
+            sum0 = coopMatMulAddNV(A0, B0, sum0);
+            sum1 = coopMatMulAddNV(A1, B0, sum1);
+            sum2 = coopMatMulAddNV(A0, B1, sum2);
+            sum3 = coopMatMulAddNV(A1, B1, sum3);
+            sum4 = coopMatMulAddNV(A0, B2, sum4);
+            sum5 = coopMatMulAddNV(A1, B2, sum5);
+            sum6 = coopMatMulAddNV(A0, B3, sum6);
+            sum7 = coopMatMulAddNV(A1, B3, sum7);
+#endif
         }
 
         barrier();
@@ -196,6 +263,7 @@ void main()
     if (gx >= psc(outw) || gy >= psc(outh))
         return;
 
+#if ncnn_VK_KHR_cooperative_matrix
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum0_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum0);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum1_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum1);
     coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator> sum2_fp16 = coopmat<float16_t, gl_ScopeSubgroup, 16, 8, gl_MatrixUseAccumulator>(sum2);
@@ -213,6 +281,25 @@ void main()
     coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
     coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor);
+#elif ncnn_VK_NV_cooperative_matrix
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
+    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
+
+    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
+    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
+    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
+    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
+    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
+    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
+    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
+    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
+#endif
 
     barrier();
 
diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp
deleted file mode 100644
index d292ae40b..000000000
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp
+++ /dev/null
@@ -1,188 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int maxk = 1;
-
-#define shape_constant_id_offset 1
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-} p;
-
-#define UNROLL_INCH 2
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k0[UNROLL_INCH * 16*4];
-shared uvec2 tmp_k1[UNROLL_INCH * 16*4];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f);
-
-    const int N = psc(c) / 4;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd16 == 0)
-        {
-            for (int j = 0; j < 4; j++)
-            {
-                const int tmp_i = lxd16*16*4 + lxm16 * 4 + j;
-
-                const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16);
-
-                tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-
-                const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j);
-
-                tmp_k0[tmp_i] = weight_data[w_offset];
-                tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1;
-            coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false);
-            coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outw) || gy >= psc(outh))
-        return;
-
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false);
-
-    barrier();
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 4 + j + lxd16*16*4;
-
-            const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16);
-
-            if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi];
-            if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi];
-        }
-    }
-}
diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp
deleted file mode 100644
index 5f00ab932..000000000
--- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp
+++ /dev/null
@@ -1,232 +0,0 @@
-// Tencent is pleased to support the open source community by making ncnn available.
-//
-// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
-//
-// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
-// in compliance with the License. You may obtain a copy of the License at
-//
-// https://opensource.org/licenses/BSD-3-Clause
-//
-// Unless required by applicable law or agreed to in writing, software distributed
-// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
-// CONDITIONS OF ANY KIND, either express or implied. See the License for the
-// specific language governing permissions and limitations under the License.
-
-#version 450
-
-#extension GL_KHR_memory_scope_semantics: require
-#extension GL_EXT_shader_explicit_arithmetic_types: require
-#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
-#extension GL_NV_cooperative_matrix: require
-
-layout (constant_id = 0) const int maxk = 1;
-
-#define shape_constant_id_offset 1
-layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
-layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
-layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
-layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;
-
-layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
-layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
-
-layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; };
-layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; };
-layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; };
-
-layout (push_constant) uniform parameter
-{
-    int w;
-    int h;
-    int c;
-    int cstep;
-
-    int outw;
-    int outh;
-} p;
-
-#define UNROLL_INCH 4
-
-shared uvec2 tmp_v0[UNROLL_INCH * 16*2];
-shared uvec2 tmp_v1[UNROLL_INCH * 16*2];
-shared uvec2 tmp_k0[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k1[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k2[UNROLL_INCH * 8*2];
-shared uvec2 tmp_k3[UNROLL_INCH * 8*2];
-
-void main()
-{
-    int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16;
-    int gy = int(gl_GlobalInvocationID.y) * 2 * 4;
-
-    const int lx = int(gl_LocalInvocationID.x);
-
-    const int lxd8 = lx / 8; // 0 1 2 3
-    const int lxm8 = lx % 8; // 0 1 2 3 .... 7
-
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-    fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f);
-
-    const int N = psc(c) / 2;
-
-    int z = 0;
-    for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH)
-    {
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < UNROLL_INCH; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (z < N)
-    {
-        const int remain = N - z;
-
-        if (lxd8 < remain)
-        {
-            for (int j = 0; j < 2; j++)
-            {
-                const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j;
-
-                int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8);
-
-                tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0);
-                tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0);
-                tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0);
-                tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0);
-
-                const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j;
-
-                int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j);
-
-                tmp_k0[tmp_ki] = weight_data[w_offset];
-                tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8];
-                tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16];
-                tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24];
-            }
-        }
-
-        barrier();
-
-        for (int z4 = 0; z4 < remain; z4++)
-        {
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1;
-            coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false);
-            coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false);
-
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2;
-            fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3;
-            coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false);
-            coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false);
-            coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false);
-            coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false);
-
-            // sum += v * k
-            sum0 = coopMatMulAddNV(A0, B0, sum0);
-            sum1 = coopMatMulAddNV(A1, B0, sum1);
-            sum2 = coopMatMulAddNV(A0, B1, sum2);
-            sum3 = coopMatMulAddNV(A1, B1, sum3);
-            sum4 = coopMatMulAddNV(A0, B2, sum4);
-            sum5 = coopMatMulAddNV(A1, B2, sum5);
-            sum6 = coopMatMulAddNV(A0, B3, sum6);
-            sum7 = coopMatMulAddNV(A1, B3, sum7);
-        }
-
-        barrier();
-    }
-
-    if (gx >= psc(outw) || gy >= psc(outh))
-        return;
-
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6);
-    fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7);
-
-    coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false);
-    coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false);
-    coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false);
-    coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false);
-    coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false);
-    coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false);
-    coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false);
-    coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false);
-
-    barrier();
-
-    const int lxd16 = lx / 16; // 0 1
-    const int lxm16 = lx % 16; // 0 1 2 3 .... 15
-
-    {
-        for (int j = 0; j < 4; j++)
-        {
-            const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2;
-            const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16);
-
-            if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi];
-            if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi];
-        }
-    }
-}