diff --git a/src/gpu.cpp b/src/gpu.cpp index 6c14ac98c..6ba970031 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -376,14 +376,16 @@ public: VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT queryShaderAtomicFloat2Features; // extension properties - void* queryDeviceProperties; + void* queryExtensionProperties; VkPhysicalDeviceFloatControlsPropertiesKHR queryFloatControlsProperties; VkPhysicalDeviceShaderIntegerDotProductProperties queryShaderIntegerDotProductProperties; VkPhysicalDeviceSubgroupProperties querySubgroupProperties; VkPhysicalDeviceDriverPropertiesKHR queryDriverProperties; VkPhysicalDeviceSubgroupSizeControlPropertiesEXT querySubgroupSizeControlProperties; - std::vector queryCooperativeMatrixProperties; - std::vector queryCooperativeMatrixPropertiesNV; + + // extension sub properties + std::vector queryCooperativeMatrixSubProperties; + std::vector queryCooperativeMatrixSubPropertiesNV; }; void GpuInfoPrivate::query_features() @@ -855,17 +857,19 @@ void GpuInfoPrivate::query_extension_features() // query cooperative_matrix memset(&queryCooperativeMatrixFeatures, 0, sizeof(queryCooperativeMatrixFeatures)); - memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV)); queryCooperativeMatrixFeatures.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_KHR; queryCooperativeMatrixFeatures.pNext = 0; - queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; - queryCooperativeMatrixFeaturesNV.pNext = 0; if (support_VK_KHR_cooperative_matrix) { queryCooperativeMatrixFeatures.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeatures; } - else if (support_VK_NV_cooperative_matrix) + + // query nv cooperative matrix + memset(&queryCooperativeMatrixFeaturesNV, 0, sizeof(queryCooperativeMatrixFeaturesNV)); + queryCooperativeMatrixFeaturesNV.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_COOPERATIVE_MATRIX_FEATURES_NV; + queryCooperativeMatrixFeaturesNV.pNext = 0; + if (support_VK_NV_cooperative_matrix) { queryCooperativeMatrixFeaturesNV.pNext = queryExtensionFeatures; queryExtensionFeatures = &queryCooperativeMatrixFeaturesNV; @@ -978,7 +982,7 @@ void GpuInfoPrivate::query_extension_features() void GpuInfoPrivate::query_extension_properties() { - queryDeviceProperties = 0; + queryExtensionProperties = 0; // query float controls memset(&queryFloatControlsProperties, 0, sizeof(queryFloatControlsProperties)); @@ -986,8 +990,8 @@ void GpuInfoPrivate::query_extension_properties() queryFloatControlsProperties.pNext = 0; if (support_VK_KHR_shader_float_controls) { - queryFloatControlsProperties.pNext = queryDeviceProperties; - queryDeviceProperties = &queryFloatControlsProperties; + queryFloatControlsProperties.pNext = queryExtensionProperties; + queryExtensionProperties = &queryFloatControlsProperties; } // query integer dot product @@ -996,8 +1000,8 @@ void GpuInfoPrivate::query_extension_properties() queryShaderIntegerDotProductProperties.pNext = 0; if (support_VK_KHR_driver_properties) { - queryShaderIntegerDotProductProperties.pNext = queryDeviceProperties; - queryDeviceProperties = &queryShaderIntegerDotProductProperties; + queryShaderIntegerDotProductProperties.pNext = queryExtensionProperties; + queryExtensionProperties = &queryShaderIntegerDotProductProperties; } // query subgroup @@ -1006,8 +1010,8 @@ void GpuInfoPrivate::query_extension_properties() querySubgroupProperties.pNext = 0; if (VK_VERSION_MAJOR(g_instance.instance_api_version) >= 1 && VK_VERSION_MINOR(g_instance.instance_api_version) >= 1) { - querySubgroupProperties.pNext = queryDeviceProperties; - queryDeviceProperties = &querySubgroupProperties; + querySubgroupProperties.pNext = queryExtensionProperties; + queryExtensionProperties = &querySubgroupProperties; } else { @@ -1032,8 +1036,8 @@ void GpuInfoPrivate::query_extension_properties() queryDriverProperties.pNext = 0; if (support_VK_KHR_driver_properties) { - queryDriverProperties.pNext = queryDeviceProperties; - queryDeviceProperties = &queryDriverProperties; + queryDriverProperties.pNext = queryExtensionProperties; + queryExtensionProperties = &queryDriverProperties; } // query subgroup size control @@ -1042,15 +1046,15 @@ void GpuInfoPrivate::query_extension_properties() querySubgroupSizeControlProperties.pNext = 0; if (support_VK_EXT_subgroup_size_control) { - querySubgroupSizeControlProperties.pNext = queryDeviceProperties; - queryDeviceProperties = &querySubgroupSizeControlProperties; + querySubgroupSizeControlProperties.pNext = queryExtensionProperties; + queryExtensionProperties = &querySubgroupSizeControlProperties; } if (support_VK_KHR_get_physical_device_properties2) { VkPhysicalDeviceProperties2KHR queryProperties; queryProperties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR; - queryProperties.pNext = queryDeviceProperties; + queryProperties.pNext = queryExtensionProperties; vkGetPhysicalDeviceProperties2KHR(physicalDevice, &queryProperties); @@ -1072,8 +1076,8 @@ void GpuInfoPrivate::query_extension_properties() } // query supported cooperative matrix types and operations - queryCooperativeMatrixProperties.clear(); - queryCooperativeMatrixPropertiesNV.clear(); + queryCooperativeMatrixSubProperties.clear(); + queryCooperativeMatrixSubPropertiesNV.clear(); support_cooperative_matrix_8_8_16 = false; support_cooperative_matrix_16_8_8 = false; support_cooperative_matrix_16_8_16 = false; @@ -1087,14 +1091,14 @@ void GpuInfoPrivate::query_extension_properties() NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); } - queryCooperativeMatrixProperties.resize(propertyCount); + queryCooperativeMatrixSubProperties.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { - memset(&queryCooperativeMatrixProperties[j], 0, sizeof(queryCooperativeMatrixProperties[j])); - queryCooperativeMatrixProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; - queryCooperativeMatrixProperties[j].pNext = 0; + memset(&queryCooperativeMatrixSubProperties[j], 0, sizeof(queryCooperativeMatrixSubProperties[j])); + queryCooperativeMatrixSubProperties[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + queryCooperativeMatrixSubProperties[j].pNext = 0; } - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixProperties.data()); + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR(physicalDevice, &propertyCount, queryCooperativeMatrixSubProperties.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR failed %d", ret); @@ -1102,7 +1106,7 @@ void GpuInfoPrivate::query_extension_properties() for (uint32_t j = 0; j < propertyCount; j++) { - const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixProperties[j]; + const VkCooperativeMatrixPropertiesKHR& cmp = queryCooperativeMatrixSubProperties[j]; // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.ResultType, cmp.scope); if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 @@ -1144,14 +1148,14 @@ void GpuInfoPrivate::query_extension_properties() NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); } - queryCooperativeMatrixPropertiesNV.resize(propertyCount); + queryCooperativeMatrixSubPropertiesNV.resize(propertyCount); for (uint32_t j = 0; j < propertyCount; j++) { - memset(&queryCooperativeMatrixPropertiesNV[j], 0, sizeof(queryCooperativeMatrixPropertiesNV[j])); - queryCooperativeMatrixPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; - queryCooperativeMatrixPropertiesNV[j].pNext = 0; + memset(&queryCooperativeMatrixSubPropertiesNV[j], 0, sizeof(queryCooperativeMatrixSubPropertiesNV[j])); + queryCooperativeMatrixSubPropertiesNV[j].sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_NV; + queryCooperativeMatrixSubPropertiesNV[j].pNext = 0; } - ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixPropertiesNV.data()); + ret = vkGetPhysicalDeviceCooperativeMatrixPropertiesNV(physicalDevice, &propertyCount, queryCooperativeMatrixSubPropertiesNV.data()); if (ret != VK_SUCCESS) { NCNN_LOGE("vkGetPhysicalDeviceCooperativeMatrixPropertiesNV failed %d", ret); @@ -1159,7 +1163,7 @@ void GpuInfoPrivate::query_extension_properties() for (uint32_t j = 0; j < propertyCount; j++) { - const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixPropertiesNV[j]; + const VkCooperativeMatrixPropertiesNV& cmp = queryCooperativeMatrixSubPropertiesNV[j]; // NCNN_LOGE("cpm %2d %2d %2d %d %d %d %d %d", cmp.MSize, cmp.NSize, cmp.KSize, cmp.AType, cmp.BType, cmp.CType, cmp.DType, cmp.scope); if (cmp.MSize == 8 && cmp.NSize == 8 && cmp.KSize == 16 @@ -1837,9 +1841,9 @@ const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& GpuInfo::queryShaderAtomicF return d->queryShaderAtomicFloat2Features; } -const void* GpuInfo::queryDeviceProperties() const +const void* GpuInfo::queryExtensionProperties() const { - return d->queryDeviceProperties; + return d->queryExtensionProperties; } const VkPhysicalDeviceShaderIntegerDotProductProperties& GpuInfo::queryShaderIntegerDotProductProperties() const @@ -1862,14 +1866,14 @@ const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& GpuInfo::querySubgroupSi return d->querySubgroupSizeControlProperties; } -const std::vector& GpuInfo::queryCooperativeMatrixProperties() const +const std::vector& GpuInfo::queryCooperativeMatrixSubProperties() const { - return d->queryCooperativeMatrixProperties; + return d->queryCooperativeMatrixSubProperties; } -const std::vector& GpuInfo::queryCooperativeMatrixPropertiesNV() const +const std::vector& GpuInfo::queryCooperativeMatrixSubPropertiesNV() const { - return d->queryCooperativeMatrixPropertiesNV; + return d->queryCooperativeMatrixSubPropertiesNV; } static int init_instance_core() diff --git a/src/gpu.h b/src/gpu.h index a0cecfd5d..da9c5573c 100644 --- a/src/gpu.h +++ b/src/gpu.h @@ -365,14 +365,16 @@ public: const VkPhysicalDeviceShaderAtomicFloat2FeaturesEXT& queryShaderAtomicFloat2Features() const; // extension properties - const void* queryDeviceProperties() const; + const void* queryExtensionProperties() const; const VkPhysicalDeviceFloatControlsPropertiesKHR& queryFloatControlsProperties() const; const VkPhysicalDeviceShaderIntegerDotProductProperties& queryShaderIntegerDotProductProperties() const; const VkPhysicalDeviceSubgroupProperties& querySubgroupProperties() const; const VkPhysicalDeviceDriverPropertiesKHR& queryDriverProperties() const; const VkPhysicalDeviceSubgroupSizeControlPropertiesEXT& querySubgroupSizeControlProperties() const; - const std::vector& queryCooperativeMatrixProperties() const; - const std::vector& queryCooperativeMatrixPropertiesNV() const; + + // extension sub properties + const std::vector& queryCooperativeMatrixSubProperties() const; + const std::vector& queryCooperativeMatrixSubPropertiesNV() const; private: GpuInfo(const GpuInfo&); diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 933ff6d73..a659c34be 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -416,17 +416,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (use_cooperative_matrix_16_8_8) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; - else - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; } else if (use_cooperative_matrix_16_16_16) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; - else - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev); @@ -696,17 +690,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (use_cooperative_matrix_16_8_8) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8; - else - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8; + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8; } else if (use_cooperative_matrix_16_16_16) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16; - else - shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16; + shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16; } pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev); @@ -1028,17 +1016,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (use_cooperative_matrix_16_8_8) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_8_8; - else - shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_8_8; + shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_8_8; } else if (use_cooperative_matrix_16_16_16) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_gemm_khr_cm_16_16_16; - else - shader_type_index = LayerShaderType::convolution_pack4_gemm_nv_cm_16_16_16; + shader_type_index = LayerShaderType::convolution_pack4_gemm_cm_16_16_16; } pipeline_convolution_gemm = new Pipeline(vkdev); @@ -1099,17 +1081,11 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (use_cooperative_matrix_16_8_8) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_8_8; - else - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_8_8; + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_8_8; } else if (use_cooperative_matrix_16_16_16) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_khr_cm_16_16_16; - else - shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_nv_cm_16_16_16; + shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1_cm_16_16_16; } pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 7cca37aae..5f953efe8 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -301,17 +301,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (use_cooperative_matrix_16_8_8) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_8_8; - else - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_8_8; + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_8_8; } else if (use_cooperative_matrix_16_16_16) { - if (vkdev->info.support_VK_KHR_cooperative_matrix()) - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_khr_cm_16_16_16; - else - shader_type_index = LayerShaderType::deconvolution_pack4_gemm_nv_cm_16_16_16; + shader_type_index = LayerShaderType::deconvolution_pack4_gemm_cm_16_16_16; } pipeline_deconvolution_gemm = new Pipeline(vkdev); diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp similarity index 69% rename from src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp rename to src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp index 398bf7fde..ee9f69817 100644 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_16_16.comp +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_16_16.comp @@ -20,7 +20,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int bias_term = 0; layout (constant_id = 1) const int activation_type = 0; @@ -73,18 +77,40 @@ void main() const int lxd16 = lx / 16; // 0 1 const int lxm16 = lx % 16; // 0 1 2 3 .... 15 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0; coopmat sum1; coopmat sum2; coopmat sum3; +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3; +#else + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; +#endif +#endif if (bias_term == 1) { +#if ncnn_VK_KHR_cooperative_matrix coopmat bias0; coopmat bias1; coopMatLoad(bias0, bias_data, gy, 0, gl_CooperativeMatrixLayoutRowMajor); coopMatLoad(bias1, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); +#endif #if NCNN_fp16_arithmetic sum0 = bias0; @@ -92,18 +118,39 @@ void main() sum2 = bias1; sum3 = bias1; #else +#if ncnn_VK_KHR_cooperative_matrix sum0 = coopmat(bias0); sum1 = coopmat(bias0); sum2 = coopmat(bias1); sum3 = coopmat(bias1); +#elif ncnn_VK_NV_cooperative_matrix + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); +#endif #endif } else { +#if ncnn_VK_KHR_cooperative_matrix sum0 = coopmat(0.f); sum1 = coopmat(0.f); sum2 = coopmat(0.f); sum3 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); +#else + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); +#endif +#endif } const int N = psc(c) / 4; @@ -132,6 +179,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -147,6 +195,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -178,6 +243,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -193,6 +259,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -201,6 +284,7 @@ void main() if (gx >= psc(outcstep) || gy >= psc(outc)) return; +#if ncnn_VK_KHR_cooperative_matrix #if NCNN_fp16_arithmetic coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -216,6 +300,24 @@ void main() coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); +#endif +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + coopMatStoreNV(sum0, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false); +#else + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); +#endif #endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp similarity index 66% rename from src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp rename to src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp index 5d79ddb2a..c36ba392f 100644 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_khr_cm_16_8_8.comp +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp @@ -20,7 +20,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int bias_term = 0; layout (constant_id = 1) const int activation_type = 0; @@ -75,6 +79,7 @@ void main() const int lxd8 = lx / 8; // 0 1 2 3 const int lxm8 = lx % 8; // 0 1 2 3 .... 7 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0; coopmat sum1; coopmat sum2; @@ -83,9 +88,31 @@ void main() coopmat sum5; coopmat sum6; coopmat sum7; +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7; +#else + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; +#endif +#endif if (bias_term == 1) { +#if ncnn_VK_KHR_cooperative_matrix coopmat bias0; coopmat bias1; coopmat bias2; @@ -95,6 +122,17 @@ void main() coopMatLoad(bias1, bias_data, gy + 2, 0, gl_CooperativeMatrixLayoutRowMajor); coopMatLoad(bias2, bias_data, gy + 4, 0, gl_CooperativeMatrixLayoutRowMajor); coopMatLoad(bias3, bias_data, gy + 6, 0, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); +#endif #if NCNN_fp16_arithmetic sum0 = bias0; @@ -106,6 +144,7 @@ void main() sum6 = bias3; sum7 = bias3; #else +#if ncnn_VK_KHR_cooperative_matrix sum0 = coopmat(bias0); sum1 = coopmat(bias0); sum2 = coopmat(bias1); @@ -114,10 +153,21 @@ void main() sum5 = coopmat(bias2); sum6 = coopmat(bias3); sum7 = coopmat(bias3); +#elif ncnn_VK_NV_cooperative_matrix + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); +#endif #endif } else { +#if ncnn_VK_KHR_cooperative_matrix sum0 = coopmat(0.f); sum1 = coopmat(0.f); sum2 = coopmat(0.f); @@ -126,6 +176,27 @@ void main() sum5 = coopmat(0.f); sum6 = coopmat(0.f); sum7 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); +#else + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); +#endif +#endif } const int N = psc(c) / 2; @@ -160,6 +231,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -183,6 +255,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -220,6 +317,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -243,6 +341,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -251,6 +374,7 @@ void main() if (gx >= psc(outcstep) || gy >= psc(outc)) return; +#if ncnn_VK_KHR_cooperative_matrix #if NCNN_fp16_arithmetic coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -278,6 +402,36 @@ void main() coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); +#endif +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + coopMatStoreNV(sum0, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false); +#else + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); +#endif #endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp deleted file mode 100644 index cc3a00a3f..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_16_16.comp +++ /dev/null @@ -1,260 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 2 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - -#if NCNN_fp16_arithmetic - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3; -#else - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; -#endif - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; - - coopMatLoadNV(bias0, bias_data, gy, 0, false); - coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); - -#if NCNN_fp16_arithmetic - sum0 = bias0; - sum1 = bias0; - sum2 = bias1; - sum3 = bias1; -#else - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); -#endif - } - else - { -#if NCNN_fp16_arithmetic - sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); -#else - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); -#endif - } - - const int N = psc(c) / 4; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd16 == 0) - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = ((z + lxd16) * 4 + j) * psc(outcstep) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (gx >= psc(outcstep) || gy >= psc(outc)) - return; - -#if NCNN_fp16_arithmetic - coopMatStoreNV(sum0, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false); -#else - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); -#endif - - barrier(); - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; - - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - - const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); - - if (gy + lxd16 * 4 + j < psc(outc)) - { - if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); - if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); - } - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp deleted file mode 100644 index c0494e58a..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1_nv_cm_16_8_8.comp +++ /dev/null @@ -1,335 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int bias_term = 0; -layout (constant_id = 1) const int activation_type = 0; -layout (constant_id = 2) const float activation_param_0 = 0; -layout (constant_id = 3) const float activation_param_1 = 0; - -#define shape_constant_id_offset 4 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - -#if NCNN_fp16_arithmetic - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7; -#else - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; -#endif - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; - - coopMatLoadNV(bias0, bias_data, gy, 0, false); - coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); - coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); - coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); - -#if NCNN_fp16_arithmetic - sum0 = bias0; - sum1 = bias0; - sum2 = bias1; - sum3 = bias1; - sum4 = bias2; - sum5 = bias2; - sum6 = bias3; - sum7 = bias3; -#else - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); - sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); - sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); - sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); - sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); -#endif - } - else - { -#if NCNN_fp16_arithmetic - sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); -#else - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); -#endif - } - - const int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd8 < remain) - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = ((z + lxd8) * 2 + j) * psc(outcstep) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outcstep) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outcstep) ? bottom_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outcstep) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outcstep) ? bottom_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (gx >= psc(outcstep) || gy >= psc(outc)) - return; - -#if NCNN_fp16_arithmetic - coopMatStoreNV(sum0, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false); -#else - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); -#endif - - barrier(); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; - const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); - - if (gy + j * 2 + lxd16 < psc(outc)) - { - if (gx + lxm16 < psc(outcstep)) - { - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - buffer_st4(top_blob_data, gi, sum0); - } - if (gx + lxm16 + 16 < psc(outcstep)) - { - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - buffer_st4(top_blob_data, gi + 16, sum1); - } - } - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp similarity index 70% rename from src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp rename to src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp index 598e63601..d8dcbb0b1 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_16_16.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_16_16.comp @@ -17,7 +17,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int batch = 1; @@ -61,10 +65,24 @@ void main() const int lxd16 = lx / 16; // 0 1 const int lxm16 = lx % 16; // 0 1 2 3 .... 15 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0 = coopmat(0.f); coopmat sum1 = coopmat(0.f); coopmat sum2 = coopmat(0.f); coopmat sum3 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); +#else + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); +#endif +#endif const int N = psc(c) / 4; @@ -92,6 +110,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -107,6 +126,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -138,6 +174,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -153,6 +190,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -161,6 +215,7 @@ void main() if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) return; +#if ncnn_VK_KHR_cooperative_matrix #if NCNN_fp16_arithmetic coopMatStore(sum0, tmp_v0, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum1, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -176,6 +231,24 @@ void main() coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); +#endif +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + coopMatStoreNV(sum0, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false); +#else + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); +#endif #endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp similarity index 67% rename from src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp rename to src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp index 7826f1ac0..de95f5307 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_khr_cm_16_8_8.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_cm_16_8_8.comp @@ -17,7 +17,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int batch = 1; @@ -63,6 +67,7 @@ void main() const int lxd8 = lx / 8; // 0 1 2 3 const int lxm8 = lx % 8; // 0 1 2 3 .... 7 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0 = coopmat(0.f); coopmat sum1 = coopmat(0.f); coopmat sum2 = coopmat(0.f); @@ -71,6 +76,27 @@ void main() coopmat sum5 = coopmat(0.f); coopmat sum6 = coopmat(0.f); coopmat sum7 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); +#else + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); +#endif +#endif const int N = psc(c) / 2; @@ -104,6 +130,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -127,6 +154,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -164,6 +216,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -187,6 +240,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -195,6 +273,7 @@ void main() if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) return; +#if ncnn_VK_KHR_cooperative_matrix #if NCNN_fp16_arithmetic coopMatStore(sum0, tmp_v0, 0, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum1, tmp_v1, 0, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -222,6 +301,36 @@ void main() coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); +#endif +#elif ncnn_VK_NV_cooperative_matrix +#if NCNN_fp16_arithmetic + coopMatStoreNV(sum0, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false); +#else + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); +#endif #endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp deleted file mode 100644 index 0182b7960..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_16_16.comp +++ /dev/null @@ -1,203 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int batch = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; -layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outw; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 2 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - int gz = int(gl_GlobalInvocationID.z); - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - -#if NCNN_fp16_arithmetic - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(0.f); -#else - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); -#endif - - const int N = psc(c) / 4; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_tm_data[w_offset]; - tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd16 == 0) - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = gz * psc(cstep) + ((z + lxd16) * 4 + j) * psc(outw) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_tm_data[w_offset]; - tmp_k1[tmp_i] = weight_tm_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) - return; - -#if NCNN_fp16_arithmetic - coopMatStoreNV(sum0, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3, tmp_v1, 16*4, 4, false); -#else - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); -#endif - - barrier(); - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; - const int gi = gz * psc(outcstep) + (gy + lxd16 * 4 + j) * psc(outw) + (gx + lxm16); - - if (gy + lxd16 * 4 + j < psc(outc)) - { - if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; - } - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp deleted file mode 100644 index 078c6a52c..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm_nv_cm_16_8_8.comp +++ /dev/null @@ -1,256 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int batch = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_tm_blob { uvec2 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { uvec2 top_tm_blob_data[]; }; -layout (binding = 2) readonly buffer weight_tm_blob { uvec2 weight_tm_data[]; }; - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outw; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - int gz = int(gl_GlobalInvocationID.z); - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - -#if NCNN_fp16_arithmetic - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(0.f); -#else - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); -#endif - - const int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_tm_data[w_offset]; - tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd8 < remain) - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = gz * psc(cstep) + ((z + lxd8) * 2 + j) * psc(outw) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_tm_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_tm_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_tm_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_tm_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gz * psc(outc) * psc(c) * 4 + gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_tm_data[w_offset]; - tmp_k1[tmp_ki] = weight_tm_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_tm_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_tm_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) - return; - -#if NCNN_fp16_arithmetic - coopMatStoreNV(sum0, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7, tmp_v1, 16*6, 2, false); -#else - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); -#endif - - barrier(); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; - const int gi = gz * psc(outcstep) + (gy + lxd16 + j*2) * psc(outw) + (gx + lxm16); - - if (gy + j * 2 + lxd16 < psc(outc)) - { - if (gx + lxm16 < psc(outw)) top_tm_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + lxm16 + 16 < psc(outw)) top_tm_blob_data[gi + 16] = tmp_v1[tmp_vi]; - } - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp similarity index 76% rename from src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp rename to src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp index 804321bff..c3f87ffec 100644 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_16_16.comp +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_16_16.comp @@ -20,7 +20,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; @@ -81,6 +85,7 @@ void main() const int lxd16 = lx / 16; // 0 1 const int lxm16 = lx % 16; // 0 1 2 3 .... 15 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0; coopmat sum1; coopmat sum2; @@ -106,6 +111,33 @@ void main() sum2 = coopmat(0.f); sum3 = coopmat(0.f); } +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + } +#endif const int maxk = kernel_w * kernel_h; const int N = psc(c) / 4 * maxk; @@ -148,6 +180,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -163,6 +196,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -208,6 +258,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -223,6 +274,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -231,6 +299,7 @@ void main() if (gx >= outsize || gy >= psc(outc)) return; +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0_fp16 = coopmat(sum0); coopmat sum1_fp16 = coopmat(sum1); coopmat sum2_fp16 = coopmat(sum2); @@ -240,6 +309,17 @@ void main() coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); +#endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp similarity index 72% rename from src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp rename to src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp index 3d2f7d1e5..b8f6e3f17 100644 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_khr_cm_16_8_8.comp +++ b/src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp @@ -20,7 +20,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; @@ -83,6 +87,7 @@ void main() const int lxd8 = lx / 8; // 0 1 2 3 const int lxm8 = lx % 8; // 0 1 2 3 .... 7 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0; coopmat sum1; coopmat sum2; @@ -124,6 +129,49 @@ void main() sum6 = coopmat(0.f); sum7 = coopmat(0.f); } +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; + + if (bias_term == 1) + { + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; + + coopMatLoadNV(bias0, bias_data, gy, 0, false); + coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); + coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); + coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); + + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); + } + else + { + sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + } +#endif const int maxk = kernel_w * kernel_h; const int N = psc(c) / 2 * maxk; @@ -172,6 +220,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -195,6 +244,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -246,6 +320,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -269,6 +344,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -277,6 +377,7 @@ void main() if (gx >= outsize || gy >= psc(outc)) return; +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0_fp16 = coopmat(sum0); coopmat sum1_fp16 = coopmat(sum1); coopmat sum2_fp16 = coopmat(sum2); @@ -294,6 +395,25 @@ void main() coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); +#endif barrier(); diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp deleted file mode 100644 index 51bdb91f4..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_16_16.comp +++ /dev/null @@ -1,269 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 2 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int outsize = psc(outw) * psc(outh); - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> bias1; - - coopMatLoadNV(bias0, bias_data, gy, 0, false); - coopMatLoadNV(bias1, bias_data, gy + 4, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias0); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(bias1); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - } - - const int maxk = kernel_w * kernel_h; - const int N = psc(c) / 4 * maxk; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int sz = (z + lxd16) / maxk; - const int kk = (z + lxd16) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); - - const ivec2 sy16 = gx16 / psc(outw); - const ivec2 sx16 = gx16 % psc(outw); - - const ivec2 sxs16 = sx16 * stride_w; - const ivec2 sys16 = sy16 * stride_h; - - const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; - - tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); - - int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd16 == 0) - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int sz = (z + lxd16) / maxk; - const int kk = (z + lxd16) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec2 gx16 = gx + lxm16 + ivec2(0, 16); - - const ivec2 sy16 = gx16 / psc(outw); - const ivec2 sx16 = gx16 % psc(outw); - - const ivec2 sxs16 = sx16 * stride_w; - const ivec2 sys16 = sy16 * stride_h; - - const ivec2 v_offset = (sz * 4 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; - - tmp_v0[tmp_i] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v1[tmp_i] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); - - int w_offset = gy * psc(c) * 4 * maxk + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * maxk * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (gx >= outsize || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); - - barrier(); - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; - - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - - const int gi = (gy + lxd16 * 4 + j) * psc(outcstep) + (gx + lxm16); - - if (gy + lxd16 * 4 + j < psc(outc)) - { - if (gx + lxm16 < psc(outcstep)) buffer_st4(top_blob_data, gi, sum0); - if (gx + lxm16 + 16 < psc(outcstep)) buffer_st4(top_blob_data, gi + 16, sum1); - } - } - } -} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp deleted file mode 100644 index 786c2ebfb..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_gemm_nv_cm_16_8_8.comp +++ /dev/null @@ -1,328 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_GOOGLE_include_directive: enable -#include "vulkan_activation.comp" - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; -layout (binding = 3) readonly buffer bias_blob { uvec2 bias_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; - int outc; - int outcstep; -} p; - -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int outsize = psc(outw) * psc(outh); - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6; - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7; - - if (bias_term == 1) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias1; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias2; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> bias3; - - coopMatLoadNV(bias0, bias_data, gy, 0, false); - coopMatLoadNV(bias1, bias_data, gy + 2, 0, false); - coopMatLoadNV(bias2, bias_data, gy + 4, 0, false); - coopMatLoadNV(bias3, bias_data, gy + 6, 0, false); - - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias0); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias1); - sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); - sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias2); - sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); - sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(bias3); - } - else - { - sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - } - - const int maxk = kernel_w * kernel_h; - const int N = psc(c) / 2 * maxk; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - const int sz = (z + lxd8) / maxk; - const int kk = (z + lxd8) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; - - tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd8 < remain) - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - const int sz = (z + lxd8) / maxk; - const int kk = (z + lxd8) % maxk; - - const int ky = kk / kernel_w; - const int kx = kk % kernel_w; - - const ivec4 gx16 = gx + lxm8 + ivec4(0, 8, 16, 24); - - const ivec4 sy16 = gx16 / psc(outw); - const ivec4 sx16 = gx16 % psc(outw); - - const ivec4 sxs16 = sx16 * stride_w; - const ivec4 sys16 = sy16 * stride_h; - - const ivec4 v_offset = (sz * 2 + j) * psc(cstep) + (sys16 + ky * dilation_h) * psc(w) + sxs16 + kx * dilation_w; - - tmp_v0[tmp_vi] = gx16.r < psc(outcstep) ? bottom_blob_data[v_offset.r] : uvec2(0); - tmp_v0[tmp_vi + 16] = gx16.g < psc(outcstep) ? bottom_blob_data[v_offset.g] : uvec2(0); - tmp_v1[tmp_vi] = gx16.b < psc(outcstep) ? bottom_blob_data[v_offset.b] : uvec2(0); - tmp_v1[tmp_vi + 16] = gx16.a < psc(outcstep) ? bottom_blob_data[v_offset.a] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 * maxk + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * maxk * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (gx >= outsize || gy >= psc(outc)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); - - barrier(); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; - const int gi = (gy + lxd16 + j*2) * psc(outcstep) + (gx + lxm16); - - if (gy + j * 2 + lxd16 < psc(outc)) - { - if (gx + lxm16 < psc(outcstep)) - { - uvec2 sum0_u2 = tmp_v0[tmp_vi]; - afpvec4 sum0 = afpvec4(unpackHalf2x16(sum0_u2.x), unpackHalf2x16(sum0_u2.y)); - sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1); - buffer_st4(top_blob_data, gi, sum0); - } - if (gx + lxm16 + 16 < psc(outcstep)) - { - uvec2 sum1_u2 = tmp_v1[tmp_vi]; - afpvec4 sum1 = afpvec4(unpackHalf2x16(sum1_u2.x), unpackHalf2x16(sum1_u2.y)); - sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1); - buffer_st4(top_blob_data, gi + 16, sum1); - } - } - } - } -} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp similarity index 73% rename from src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp rename to src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp index c5047220c..487adc8e1 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_16_16.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_16_16.comp @@ -17,7 +17,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int maxk = 1; @@ -62,10 +66,17 @@ void main() const int lxd16 = lx / 16; // 0 1 const int lxm16 = lx % 16; // 0 1 2 3 .... 15 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0 = coopmat(0.f); coopmat sum1 = coopmat(0.f); coopmat sum2 = coopmat(0.f); coopmat sum3 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); +#endif const int N = psc(c) / 4; @@ -93,6 +104,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -108,6 +120,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -139,6 +168,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*4, 4, gl_CooperativeMatrixLayoutRowMajor); @@ -154,6 +184,23 @@ void main() sum1 = coopMatMulAdd(A1, B0, sum1); sum2 = coopMatMulAdd(A0, B1, sum2); sum3 = coopMatMulAdd(A1, B1, sum3); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); + coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; + coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); + coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); +#endif } barrier(); @@ -162,6 +209,7 @@ void main() if (gx >= psc(outw) || gy >= psc(outh)) return; +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0_fp16 = coopmat(sum0); coopmat sum1_fp16 = coopmat(sum1); coopmat sum2_fp16 = coopmat(sum2); @@ -171,6 +219,17 @@ void main() coopMatStore(sum1_fp16, tmp_v1, 0, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum2_fp16, tmp_v0, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum3_fp16, tmp_v1, 16*4, 4, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); +#endif barrier(); diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp similarity index 70% rename from src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp rename to src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp index 3c444ee9f..6e06d3c83 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_khr_cm_16_8_8.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm_cm_16_8_8.comp @@ -17,7 +17,11 @@ #extension GL_KHR_memory_scope_semantics: require #extension GL_EXT_shader_explicit_arithmetic_types: require #extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#if ncnn_VK_KHR_cooperative_matrix #extension GL_KHR_cooperative_matrix: require +#elif ncnn_VK_NV_cooperative_matrix +#extension GL_NV_cooperative_matrix: require +#endif layout (constant_id = 0) const int maxk = 1; @@ -64,6 +68,7 @@ void main() const int lxd8 = lx / 8; // 0 1 2 3 const int lxm8 = lx % 8; // 0 1 2 3 .... 7 +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0 = coopmat(0.f); coopmat sum1 = coopmat(0.f); coopmat sum2 = coopmat(0.f); @@ -72,6 +77,16 @@ void main() coopmat sum5 = coopmat(0.f); coopmat sum6 = coopmat(0.f); coopmat sum7 = coopmat(0.f); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); + fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); +#endif const int N = psc(c) / 2; @@ -105,6 +120,7 @@ void main() for (int z4 = 0; z4 < UNROLL_INCH; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -128,6 +144,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -165,6 +206,7 @@ void main() for (int z4 = 0; z4 < remain; z4++) { +#if ncnn_VK_KHR_cooperative_matrix coopmat A0; coopmat A1; coopMatLoad(A0, tmp_v0, z4*16*2, 2, gl_CooperativeMatrixLayoutRowMajor); @@ -188,6 +230,31 @@ void main() sum5 = coopMatMulAdd(A1, B2, sum5); sum6 = coopMatMulAdd(A0, B3, sum6); sum7 = coopMatMulAdd(A1, B3, sum7); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; + coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); + coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); + + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; + fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; + coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); + coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); + coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); + coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); + + // sum += v * k + sum0 = coopMatMulAddNV(A0, B0, sum0); + sum1 = coopMatMulAddNV(A1, B0, sum1); + sum2 = coopMatMulAddNV(A0, B1, sum2); + sum3 = coopMatMulAddNV(A1, B1, sum3); + sum4 = coopMatMulAddNV(A0, B2, sum4); + sum5 = coopMatMulAddNV(A1, B2, sum5); + sum6 = coopMatMulAddNV(A0, B3, sum6); + sum7 = coopMatMulAddNV(A1, B3, sum7); +#endif } barrier(); @@ -196,6 +263,7 @@ void main() if (gx >= psc(outw) || gy >= psc(outh)) return; +#if ncnn_VK_KHR_cooperative_matrix coopmat sum0_fp16 = coopmat(sum0); coopmat sum1_fp16 = coopmat(sum1); coopmat sum2_fp16 = coopmat(sum2); @@ -213,6 +281,25 @@ void main() coopMatStore(sum5_fp16, tmp_v1, 16*4, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum6_fp16, tmp_v0, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); coopMatStore(sum7_fp16, tmp_v1, 16*6, 2, gl_CooperativeMatrixLayoutRowMajor); +#elif ncnn_VK_NV_cooperative_matrix + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); + fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); + + coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); + coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); + coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); + coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); + coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); + coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); + coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); + coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); +#endif barrier(); diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp deleted file mode 100644 index d292ae40b..000000000 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_16_16.comp +++ /dev/null @@ -1,188 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2023 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int maxk = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; -} p; - -#define UNROLL_INCH 2 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k0[UNROLL_INCH * 16*4]; -shared uvec2 tmp_k1[UNROLL_INCH * 16*4]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 16> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 16>(0.f); - - const int N = psc(c) / 4; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd16 == 0) - { - for (int j = 0; j < 4; j++) - { - const int tmp_i = lxd16*16*4 + lxm16 * 4 + j; - - const int v_offset = ((z + lxd16) * 4 + j) * psc(cstep) + (gx + lxm16); - - tmp_v0[tmp_i] = (gx + lxm16) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v1[tmp_i] = (gx + lxm16 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - - const int w_offset = gy * psc(c) * 4 + (z + lxd16) * 4 * 16 + (lxm16 * 4 + j); - - tmp_k0[tmp_i] = weight_data[w_offset]; - tmp_k1[tmp_i] = weight_data[w_offset + psc(c) * 16]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*4, 4, false); - coopMatLoadNV(A1, tmp_v1, z4*16*4, 4, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> B1; - coopMatLoadNV(B0, tmp_k0, z4*16*4, 4, false); - coopMatLoadNV(B1, tmp_k1, z4*16*4, 4, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outh)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 16> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 16>(sum3); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 4, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 4, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*4, 4, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*4, 4, false); - - barrier(); - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 4 + j + lxd16*16*4; - - const int gi = ((gy / 4 + lxd16) / maxk * maxk * 4 + (gy / 4 + lxd16) % maxk) * psc(outw) + j * maxk * psc(outw) + (gx + lxm16); - - if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; - } - } -} diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp deleted file mode 100644 index 5f00ab932..000000000 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm_nv_cm_16_8_8.comp +++ /dev/null @@ -1,232 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#extension GL_KHR_memory_scope_semantics: require -#extension GL_EXT_shader_explicit_arithmetic_types: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_NV_cooperative_matrix: require - -layout (constant_id = 0) const int maxk = 1; - -#define shape_constant_id_offset 1 -layout (constant_id = shape_constant_id_offset + 0) const int w = 0; -layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 2) const int c = 0; -layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; - -layout (binding = 0) readonly buffer bottom_blob { uvec2 bottom_blob_data[]; }; -layout (binding = 1) writeonly buffer col_blob { uvec2 col_blob_data[]; }; -layout (binding = 2) readonly buffer weight_blob { uvec2 weight_data[]; }; - -layout (push_constant) uniform parameter -{ - int w; - int h; - int c; - int cstep; - - int outw; - int outh; -} p; - -#define UNROLL_INCH 4 - -shared uvec2 tmp_v0[UNROLL_INCH * 16*2]; -shared uvec2 tmp_v1[UNROLL_INCH * 16*2]; -shared uvec2 tmp_k0[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k1[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k2[UNROLL_INCH * 8*2]; -shared uvec2 tmp_k3[UNROLL_INCH * 8*2]; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x) / 32 * 2 * 16; - int gy = int(gl_GlobalInvocationID.y) * 2 * 4; - - const int lx = int(gl_LocalInvocationID.x); - - const int lxd8 = lx / 8; // 0 1 2 3 - const int lxm8 = lx % 8; // 0 1 2 3 .... 7 - - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum0 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum1 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum2 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum3 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum4 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum5 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum6 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - fcoopmatNV<32, gl_ScopeSubgroup, 16, 8> sum7 = fcoopmatNV<32, gl_ScopeSubgroup, 16, 8>(0.f); - - const int N = psc(c) / 2; - - int z = 0; - for (; z + (UNROLL_INCH - 1) < N; z += UNROLL_INCH) - { - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < UNROLL_INCH; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (z < N) - { - const int remain = N - z; - - if (lxd8 < remain) - { - for (int j = 0; j < 2; j++) - { - const int tmp_vi = lxd8*16*2 + lxm8 * 2 + j; - - int v_offset = ((z + lxd8) * 2 + j) * psc(cstep) + (gx + lxm8); - - tmp_v0[tmp_vi] = (gx + lxm8) < psc(outw) ? bottom_blob_data[v_offset] : uvec2(0); - tmp_v0[tmp_vi + 16] = (gx + lxm8 + 8) < psc(outw) ? bottom_blob_data[v_offset + 8] : uvec2(0); - tmp_v1[tmp_vi] = (gx + lxm8 + 16) < psc(outw) ? bottom_blob_data[v_offset + 16] : uvec2(0); - tmp_v1[tmp_vi + 16] = (gx + lxm8 + 24) < psc(outw) ? bottom_blob_data[v_offset + 24] : uvec2(0); - - const int tmp_ki = lxd8*8*2 + lxm8 * 2 + j; - - int w_offset = gy * psc(c) * 4 + (z + lxd8) * 2 * 8 + (lxm8 * 2 + j); - - tmp_k0[tmp_ki] = weight_data[w_offset]; - tmp_k1[tmp_ki] = weight_data[w_offset + psc(c) * 8]; - tmp_k2[tmp_ki] = weight_data[w_offset + psc(c) * 16]; - tmp_k3[tmp_ki] = weight_data[w_offset + psc(c) * 24]; - } - } - - barrier(); - - for (int z4 = 0; z4 < remain; z4++) - { - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A0; - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> A1; - coopMatLoadNV(A0, tmp_v0, z4*16*2, 2, false); - coopMatLoadNV(A1, tmp_v1, z4*16*2, 2, false); - - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B0; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B1; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B2; - fcoopmatNV<16, gl_ScopeSubgroup, 8, 8> B3; - coopMatLoadNV(B0, tmp_k0, z4*8*2, 2, false); - coopMatLoadNV(B1, tmp_k1, z4*8*2, 2, false); - coopMatLoadNV(B2, tmp_k2, z4*8*2, 2, false); - coopMatLoadNV(B3, tmp_k3, z4*8*2, 2, false); - - // sum += v * k - sum0 = coopMatMulAddNV(A0, B0, sum0); - sum1 = coopMatMulAddNV(A1, B0, sum1); - sum2 = coopMatMulAddNV(A0, B1, sum2); - sum3 = coopMatMulAddNV(A1, B1, sum3); - sum4 = coopMatMulAddNV(A0, B2, sum4); - sum5 = coopMatMulAddNV(A1, B2, sum5); - sum6 = coopMatMulAddNV(A0, B3, sum6); - sum7 = coopMatMulAddNV(A1, B3, sum7); - } - - barrier(); - } - - if (gx >= psc(outw) || gy >= psc(outh)) - return; - - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum0_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum0); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum1_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum1); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum2_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum2); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum3_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum3); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum4_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum4); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum5_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum5); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum6_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum6); - fcoopmatNV<16, gl_ScopeSubgroup, 16, 8> sum7_fp16 = fcoopmatNV<16, gl_ScopeSubgroup, 16, 8>(sum7); - - coopMatStoreNV(sum0_fp16, tmp_v0, 0, 2, false); - coopMatStoreNV(sum1_fp16, tmp_v1, 0, 2, false); - coopMatStoreNV(sum2_fp16, tmp_v0, 16*2, 2, false); - coopMatStoreNV(sum3_fp16, tmp_v1, 16*2, 2, false); - coopMatStoreNV(sum4_fp16, tmp_v0, 16*4, 2, false); - coopMatStoreNV(sum5_fp16, tmp_v1, 16*4, 2, false); - coopMatStoreNV(sum6_fp16, tmp_v0, 16*6, 2, false); - coopMatStoreNV(sum7_fp16, tmp_v1, 16*6, 2, false); - - barrier(); - - const int lxd16 = lx / 16; // 0 1 - const int lxm16 = lx % 16; // 0 1 2 3 .... 15 - - { - for (int j = 0; j < 4; j++) - { - const int tmp_vi = lxm16 * 2 + lxd16 + j*16*2; - const int gi = ((gy / 2 + j) / maxk * maxk * 2 + (gy / 2 + j) % maxk) * psc(outw) + lxd16 * maxk * psc(outw) + (gx + lxm16); - - if (gx + lxm16 < psc(outw)) col_blob_data[gi] = tmp_v0[tmp_vi]; - if (gx + lxm16 + 16 < psc(outw)) col_blob_data[gi + 16] = tmp_v1[tmp_vi]; - } - } -}