From 4302f78f55167bfb02fa50bbe421f99660313cea Mon Sep 17 00:00:00 2001 From: nihui Date: Sun, 27 Mar 2022 17:47:31 +0800 Subject: [PATCH] less specialization constant for vulkan conv1x1s1d1 shaders (#3657) --- src/layer/vulkan/convolution_vulkan.cpp | 414 ++++++++++-------- .../vulkan/shader/convolution_1x1s1d1.comp | 64 ++- .../shader/convolution_pack1to4_1x1s1d1.comp | 83 ++-- .../shader/convolution_pack1to8_1x1s1d1.comp | 69 ++- .../shader/convolution_pack4_1x1s1d1.comp | 101 ++--- .../shader/convolution_pack4to1_1x1s1d1.comp | 83 ++-- .../shader/convolution_pack4to8_1x1s1d1.comp | 79 ++-- .../shader/convolution_pack8_1x1s1d1.comp | 79 ++-- .../shader/convolution_pack8to1_1x1s1d1.comp | 69 ++- .../shader/convolution_pack8to4_1x1s1d1.comp | 75 ++-- 10 files changed, 520 insertions(+), 596 deletions(-) diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 1305abec4..e7facb37a 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -172,52 +172,6 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) padding->create_pipeline(opt); } - std::vector specializations(10 + 10); - specializations[0].i = kernel_w; - specializations[1].i = kernel_h; - specializations[2].i = dilation_w; - specializations[3].i = dilation_h; - specializations[4].i = stride_w; - specializations[5].i = stride_h; - specializations[6].i = bias_term; - specializations[7].i = activation_type; - specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[10 + 0].i = shape_bordered_packed.dims; - specializations[10 + 1].i = shape_bordered_packed.w; - specializations[10 + 2].i = shape_bordered_packed.h; - specializations[10 + 3].i = shape_bordered_packed.c; - specializations[10 + 4].i = shape_bordered_packed.cstep; - specializations[10 + 5].i = out_shape_packed.dims; - specializations[10 + 6].i = out_shape_packed.w; - specializations[10 + 7].i = out_shape_packed.h; - specializations[10 + 8].i = out_shape_packed.c; - specializations[10 + 9].i = out_shape_packed.cstep; - - if (is_conv1x1s1d1) - { - int shader_type_index = -1; - if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1; - if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1; - if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1; - if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1; - if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1; - if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1; - if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1; - if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; - if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; - - pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); - if (opt.use_shader_local_memory) - { - pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8); - } - else - { - pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack)); - } - pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations); - } if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16) { // winograd43 @@ -477,59 +431,117 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) opt.use_image_storage = false; } + std::vector specializations(10 + 8); + specializations[0].i = kernel_w; + specializations[1].i = kernel_h; + specializations[2].i = dilation_w; + specializations[3].i = dilation_h; + specializations[4].i = stride_w; + specializations[5].i = stride_h; + specializations[6].i = bias_term; + specializations[7].i = activation_type; + specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10 + 0].i = shape_bordered_packed.w; + specializations[10 + 1].i = shape_bordered_packed.h; + specializations[10 + 2].i = shape_bordered_packed.c; + specializations[10 + 3].i = shape_bordered_packed.cstep; + specializations[10 + 4].i = out_shape_packed.w; + specializations[10 + 5].i = out_shape_packed.h; + specializations[10 + 6].i = out_shape_packed.c; + specializations[10 + 7].i = out_shape_packed.cstep; + + Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0); + if (out_shape_packed.dims != 0) { - std::vector specializations(10 + 8); - specializations[0].i = kernel_w; - specializations[1].i = kernel_h; - specializations[2].i = dilation_w; - specializations[3].i = dilation_h; - specializations[4].i = stride_w; - specializations[5].i = stride_h; - specializations[6].i = bias_term; - specializations[7].i = activation_type; - specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; - specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[10 + 0].i = shape_bordered_packed.w; - specializations[10 + 1].i = shape_bordered_packed.h; - specializations[10 + 2].i = shape_bordered_packed.c; - specializations[10 + 3].i = shape_bordered_packed.cstep; - specializations[10 + 4].i = out_shape_packed.w; - specializations[10 + 5].i = out_shape_packed.h; - specializations[10 + 6].i = out_shape_packed.c; - specializations[10 + 7].i = out_shape_packed.cstep; - - Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0); - if (out_shape_packed.dims != 0) - { - local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h); - local_size_xyz.h = std::min(4, out_shape_packed.c); - } + local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h); + local_size_xyz.h = std::min(4, out_shape_packed.c); + } - int shader_type_index = -1; - if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm; - if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm; - if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm; - if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm; - if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm; - if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm; - if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm; - if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; - if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; - - pipeline_convolution_gemm = new Pipeline(vkdev); - if (opt.use_shader_local_memory) - { - pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); - } - else - { - pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); - } - pipeline_convolution_gemm->create(shader_type_index, opt, specializations); + int shader_type_index = -1; + if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm; + if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm; + if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm; + if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm; + if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm; + if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm; + if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; + + pipeline_convolution_gemm = new Pipeline(vkdev); + if (opt.use_shader_local_memory) + { + pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + } + pipeline_convolution_gemm->create(shader_type_index, opt, specializations); + } + if (is_conv1x1s1d1) + { + std::vector specializations(4 + 8); + specializations[0].i = bias_term; + specializations[1].i = activation_type; + specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[4 + 0].i = shape_bordered_packed.w; + specializations[4 + 1].i = shape_bordered_packed.h; + specializations[4 + 2].i = shape_bordered_packed.c; + specializations[4 + 3].i = shape_bordered_packed.cstep; + specializations[4 + 4].i = out_shape_packed.w; + specializations[4 + 5].i = out_shape_packed.h; + specializations[4 + 6].i = out_shape_packed.c; + specializations[4 + 7].i = out_shape_packed.cstep; + + int shader_type_index = -1; + if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1; + if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1; + if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1; + if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1; + if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1; + if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1; + if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1; + + pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); + if (opt.use_shader_local_memory) + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, std::min(8, num_output / out_elempack), 1); } + pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations); } else { + std::vector specializations(10 + 10); + specializations[0].i = kernel_w; + specializations[1].i = kernel_h; + specializations[2].i = dilation_w; + specializations[3].i = dilation_h; + specializations[4].i = stride_w; + specializations[5].i = stride_h; + specializations[6].i = bias_term; + specializations[7].i = activation_type; + specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; + specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f; + specializations[10 + 0].i = shape_bordered_packed.dims; + specializations[10 + 1].i = shape_bordered_packed.w; + specializations[10 + 2].i = shape_bordered_packed.h; + specializations[10 + 3].i = shape_bordered_packed.c; + specializations[10 + 4].i = shape_bordered_packed.cstep; + specializations[10 + 5].i = out_shape_packed.dims; + specializations[10 + 6].i = out_shape_packed.w; + specializations[10 + 7].i = out_shape_packed.h; + specializations[10 + 8].i = out_shape_packed.c; + specializations[10 + 9].i = out_shape_packed.cstep; + Mat local_size_xyz(8, 8, std::min(4, (num_output / out_elempack + 1) / 2), (void*)0); if (out_shape_packed.dims != 0) { @@ -1194,34 +1206,63 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) { // gemm - { - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); - if (top_blob.empty()) - return -100; - - std::vector bindings(4); - bindings[0] = bottom_blob_bordered; - bindings[1] = top_blob; - bindings[2] = weight_data_gpu; - bindings[3] = bias_data_gpu; - - std::vector constants(8); - constants[0].i = bottom_blob_bordered.w; - constants[1].i = bottom_blob_bordered.h; - constants[2].i = bottom_blob_bordered.c; - constants[3].i = bottom_blob_bordered.cstep; - constants[4].i = top_blob.w; - constants[5].i = top_blob.h; - constants[6].i = top_blob.c; - constants[7].i = top_blob.cstep; - - VkMat dispatcher; - dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = top_blob.c; - dispatcher.c = 1; - - cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); - } + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(8); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = bottom_blob_bordered.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = top_blob.c; + constants[7].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = top_blob.c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); + + return 0; + } + if (is_conv1x1s1d1) + { + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu; + bindings[3] = bias_data_gpu; + + std::vector constants(8); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = bottom_blob_bordered.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = top_blob.c; + constants[7].i = top_blob.cstep; + + VkMat dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = top_blob.c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); return 0; } @@ -1248,25 +1289,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - // record - if (is_conv1x1s1d1) - { - VkMat dispatcher; - dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = 1; - dispatcher.c = top_blob.c; - - cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); - } - else - { - VkMat dispatcher; - dispatcher.w = (top_blob.w + 1) / 2; - dispatcher.h = (top_blob.h + 1) / 2; - dispatcher.c = (top_blob.c + 1) / 2; + VkMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; - cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); - } + cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); return 0; } @@ -1567,34 +1595,63 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16) { // gemm - { - top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); - if (top_blob.empty()) - return -100; - - std::vector bindings(4); - bindings[0] = bottom_blob_bordered; - bindings[1] = top_blob; - bindings[2] = weight_data_gpu_image; - bindings[3] = bias_data_gpu_image; - - std::vector constants(8); - constants[0].i = bottom_blob_bordered.w; - constants[1].i = bottom_blob_bordered.h; - constants[2].i = bottom_blob_bordered.c; - constants[3].i = 0; // bottom_blob_bordered.cstep; - constants[4].i = top_blob.w; - constants[5].i = top_blob.h; - constants[6].i = top_blob.c; - constants[7].i = 0; // top_blob.cstep; - - VkImageMat dispatcher; - dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = top_blob.c; - dispatcher.c = 1; - - cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); - } + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image; + + std::vector constants(8); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = 0; // bottom_blob_bordered.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = top_blob.c; + constants[7].i = 0; // top_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = top_blob.c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); + + return 0; + } + if (is_conv1x1s1d1) + { + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator); + if (top_blob.empty()) + return -100; + + std::vector bindings(4); + bindings[0] = bottom_blob_bordered; + bindings[1] = top_blob; + bindings[2] = weight_data_gpu_image; + bindings[3] = bias_data_gpu_image; + + std::vector constants(8); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = 0; // bottom_blob_bordered.cstep; + constants[4].i = top_blob.w; + constants[5].i = top_blob.h; + constants[6].i = top_blob.c; + constants[7].i = 0; // top_blob.cstep; + + VkImageMat dispatcher; + dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; + dispatcher.h = top_blob.c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); return 0; } @@ -1621,25 +1678,12 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b constants[8].i = top_blob.c; constants[9].i = 0; //top_blob.cstep; - // record - if (is_conv1x1s1d1) - { - VkImageMat dispatcher; - dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = 1; - dispatcher.c = top_blob.c; - - cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher); - } - else - { - VkImageMat dispatcher; - dispatcher.w = (top_blob.w + 1) / 2; - dispatcher.h = (top_blob.h + 1) / 2; - dispatcher.c = (top_blob.c + 1) / 2; + VkImageMat dispatcher; + dispatcher.w = (top_blob.w + 1) / 2; + dispatcher.h = (top_blob.h + 1) / 2; + dispatcher.c = (top_blob.c + 1) / 2; - cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); - } + cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher); return 0; } diff --git a/src/layer/vulkan/shader/convolution_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_1x1s1d1.comp index f34474331..8b287de98 100644 --- a/src/layer/vulkan/shader/convolution_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_1x1s1d1.comp @@ -21,29 +21,21 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -64,13 +56,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -82,16 +72,14 @@ void main() #if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx * 4 >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -100,9 +88,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum = afpvec4(image3d_ld1(bias_blob, ivec3(gz, 0, 0))); + sum = afpvec4(image3d_ld1(bias_blob, ivec3(gy, 0, 0))); #else - sum = afpvec4(buffer_ld1(bias_data, gz)); + sum = afpvec4(buffer_ld1(bias_data, gy)); #endif } else @@ -118,7 +106,7 @@ void main() for (int z = 0; z < psc(c); z++) { - afp k = image3d_ld1(weight_blob, ivec3(0, z, gz)); + afp k = image3d_ld1(weight_blob, ivec3(0, z, gy)); sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z)); sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z)); @@ -126,7 +114,7 @@ void main() sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); } #else - int w_offset = gz * psc(c); + int w_offset = gy * psc(c); int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -174,12 +162,12 @@ void main() #if NCNN_image_shader - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r); - image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a); + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum.g); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum.b); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum.a); #else - const int gi = gz * psc(outcstep) / 4 + gx; + const int gi = gy * psc(outcstep) / 4 + gx; #if NCNN_fp16_packed top_blob_data[gi] = sum; diff --git a/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp index 3acc66311..f9028f7e2 100644 --- a/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp @@ -23,29 +23,21 @@ #define LOCAL_MEMORY_UNROLL_INCH 8 -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -83,14 +73,13 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); #if NCNN_image_shader - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else #if !NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif #endif @@ -103,9 +92,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - afpvec4 b = buffer_ld4(bias_data, gz); + afpvec4 b = buffer_ld4(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -133,7 +122,7 @@ void main() afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz)); + afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy)); sum0 += v0 * k; sum1 += v1 * k; @@ -141,21 +130,21 @@ void main() sum3 += v3 * k; } #else - int w_offset = gz * psc(c); + int w_offset = gy * psc(c); int v_offset = gx; #if NCNN_shader_local_memory const int lx = int(gl_LocalInvocationID.x); - const int lz = int(gl_LocalInvocationID.z); + const int ly = int(gl_LocalInvocationID.y); int z = 0; for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) { - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -163,7 +152,7 @@ void main() { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); } } @@ -176,7 +165,7 @@ void main() afp v2 = lfp2afp(tmp_v[lx][z4][2]); afp v3 = lfp2afp(tmp_v[lx][z4][3]); - afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; @@ -194,11 +183,11 @@ void main() { const int remain = psc(c) - z; - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < remain; z4++) { - tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -206,7 +195,7 @@ void main() { for (int z4 = 0; z4 < remain; z4++) { - tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); } } @@ -219,7 +208,7 @@ void main() afp v2 = lfp2afp(tmp_v[lx][z4][2]); afp v3 = lfp2afp(tmp_v[lx][z4][3]); - afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; @@ -297,17 +286,17 @@ void main() } #if NCNN_image_shader - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else #if NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp index 50e743f2f..847d37091 100644 --- a/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if NCNN_image_shader + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -99,9 +84,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - afpvec8 b = buffer_ld8(bias_data, gz); + afpvec8 b = buffer_ld8(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -129,7 +114,7 @@ void main() afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz)); + afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy)); // sum += v * k; sum0[0] += v0 * k[0]; @@ -145,7 +130,7 @@ void main() sum3[1] += v3 * k[1]; } #else - int w_offset = gz * psc(c); + int w_offset = gy * psc(c); int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -248,12 +233,12 @@ void main() } #if NCNN_image_shader - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp index df5e2e4b0..e377191f4 100644 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp @@ -23,29 +23,21 @@ #define LOCAL_MEMORY_UNROLL_INCH 8 -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -83,14 +73,13 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); #if NCNN_image_shader - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else #if !NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif #endif @@ -103,9 +92,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - afpvec4 b = buffer_ld4(bias_data, gz); + afpvec4 b = buffer_ld4(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -134,10 +123,10 @@ void main() afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); afpmat4 k = afpmat4( - image3d_ld4(weight_blob, ivec3(0, z, gz)), - image3d_ld4(weight_blob, ivec3(1, z, gz)), - image3d_ld4(weight_blob, ivec3(2, z, gz)), - image3d_ld4(weight_blob, ivec3(3, z, gz)) + image3d_ld4(weight_blob, ivec3(0, z, gy)), + image3d_ld4(weight_blob, ivec3(1, z, gy)), + image3d_ld4(weight_blob, ivec3(2, z, gy)), + image3d_ld4(weight_blob, ivec3(3, z, gy)) ); sum0 += v0 * k; @@ -146,21 +135,21 @@ void main() sum3 += v3 * k; } #else - int w_offset = gz * psc(c) * 4; + int w_offset = gy * psc(c) * 4; int v_offset = gx; #if NCNN_shader_local_memory const int lx = int(gl_LocalInvocationID.x); - const int lz = int(gl_LocalInvocationID.z); + const int ly = int(gl_LocalInvocationID.y); int z = 0; for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) { - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -168,7 +157,7 @@ void main() { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); } } @@ -181,10 +170,10 @@ void main() afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); - afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); - afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); - afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); - afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); afpmat4 k = afpmat4(k0, k1, k2, k3); @@ -204,11 +193,11 @@ void main() { const int remain = psc(c) - z; - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < remain; z4++) { - tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -216,7 +205,7 @@ void main() { for (int z4 = 0; z4 < remain; z4++) { - tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); } } @@ -229,10 +218,10 @@ void main() afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); - afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); - afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); - afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); - afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); afpmat4 k = afpmat4(k0, k1, k2, k3); @@ -317,17 +306,17 @@ void main() } #if NCNN_image_shader - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else #if NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp index 636a3c258..b040af642 100644 --- a/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp @@ -23,29 +23,21 @@ #define LOCAL_MEMORY_UNROLL_INCH 8 -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -83,14 +73,13 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); #if NCNN_image_shader - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else #if !NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif #endif @@ -103,9 +92,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); + afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); #else - afp b = buffer_ld1(bias_data, gz); + afp b = buffer_ld1(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -133,7 +122,7 @@ void main() afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz)); + afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy)); sum0 += dot(v0, k); sum1 += dot(v1, k); @@ -141,21 +130,21 @@ void main() sum3 += dot(v3, k); } #else - int w_offset = gz * psc(c); + int w_offset = gy * psc(c); int v_offset = gx; #if NCNN_shader_local_memory const int lx = int(gl_LocalInvocationID.x); - const int lz = int(gl_LocalInvocationID.z); + const int ly = int(gl_LocalInvocationID.y); int z = 0; for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) { - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -163,7 +152,7 @@ void main() { for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); } } @@ -176,7 +165,7 @@ void main() afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); - afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += dot(v0, k); sum1 += dot(v1, k); @@ -194,11 +183,11 @@ void main() { const int remain = psc(c) - z; - if (lz < 4) + if (ly < 4) { for (int z4 = 0; z4 < remain; z4++) { - tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); } } @@ -206,7 +195,7 @@ void main() { for (int z4 = 0; z4 < remain; z4++) { - tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); } } @@ -219,7 +208,7 @@ void main() afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); - afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]); + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += dot(v0, k); sum1 += dot(v1, k); @@ -297,17 +286,17 @@ void main() } #if NCNN_image_shader - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else #if NCNN_shader_local_memory - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st1(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp index e8937d531..8c2375866 100644 --- a/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if NCNN_image_shader + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -99,9 +84,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - afpvec8 b = buffer_ld8(bias_data, gz); + afpvec8 b = buffer_ld8(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -129,14 +114,14 @@ void main() afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gz)); - afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gz)); - afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gz)); - afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gz)); - afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gz)); - afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gz)); - afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gz)); - afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gz)); + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gy)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gy)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gy)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gy)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gy)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gy)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gy)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gy)); // sum += v * k; sum0[0].r += dot(v0, k0); @@ -176,7 +161,7 @@ void main() sum3[1].a += dot(v3, k7); } #else - int w_offset = gz * psc(c) * 8; + int w_offset = gy * psc(c) * 8; int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -310,12 +295,12 @@ void main() } #if NCNN_image_shader - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp index 755bd4014..e74bc8424 100644 --- a/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if NCNN_image_shader + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -99,9 +84,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - afpvec8 b = buffer_ld8(bias_data, gz); + afpvec8 b = buffer_ld8(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -129,14 +114,14 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); - afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); - afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz)); - afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz)); - afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz)); - afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz)); + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gy)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gy)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gy)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gy)); // sum += v * k sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -176,7 +161,7 @@ void main() sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); } #else - int w_offset = gz * psc(c) * 8; + int w_offset = gy * psc(c) * 8; int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -310,12 +295,12 @@ void main() } #if NCNN_image_shader - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp index 0c6bf958b..f26c4d8ca 100644 --- a/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if NCNN_image_shader + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -99,9 +84,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); + afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); #else - afp b = buffer_ld1(bias_data, gz); + afp b = buffer_ld1(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -129,7 +114,7 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz)); + afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy)); // sum += dot(v, k); sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); @@ -138,7 +123,7 @@ void main() sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); } #else - int w_offset = gz * psc(c); + int w_offset = gy * psc(c); int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -210,12 +195,12 @@ void main() } #if NCNN_image_shader - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st1(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp index 5573c36bd..0803a9fd5 100644 --- a/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp @@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; -layout (constant_id = 2) const int dilation_w = 1; -layout (constant_id = 3) const int dilation_h = 1; -layout (constant_id = 4) const int stride_w = 1; -layout (constant_id = 5) const int stride_h = 1; -layout (constant_id = 6) const int bias_term = 0; -layout (constant_id = 7) const int activation_type = 0; -layout (constant_id = 8) const float activation_param_0 = 0; -layout (constant_id = 9) const float activation_param_1 = 0; - -#define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = 0) const int bias_term = 0; +layout (constant_id = 1) const int activation_type = 0; +layout (constant_id = 2) const float activation_param_0 = 0; +layout (constant_id = 3) const float activation_param_1 = 0; + +#define shape_constant_id_offset 4 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 6) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; int outc; @@ -75,19 +65,14 @@ layout (push_constant) uniform parameter void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if NCNN_image_shader + if (gx >= psc(outw) * psc(outh) || gy >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(outcstep) || gy >= psc(outc)) return; #endif @@ -99,9 +84,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - afpvec4 b = buffer_ld4(bias_data, gz); + afpvec4 b = buffer_ld4(bias_data, gy); #endif sum0 = b; sum1 = b; @@ -129,10 +114,10 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz)); - afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz)); + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy)); // sum += v * k; sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -156,7 +141,7 @@ void main() sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); } #else - int w_offset = gz * psc(c) * 4; + int w_offset = gy * psc(c) * 4; int v_offset = gx; for (int z = 0; z < psc(c); z++) @@ -246,12 +231,12 @@ void main() } #if NCNN_image_shader - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - int gi = gz * psc(outcstep) + gx; + int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi + 0, sum0); if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);