diff --git a/src/gpu.cpp b/src/gpu.cpp index a5dfb9b99..efe4edfa0 100644 --- a/src/gpu.cpp +++ b/src/gpu.cpp @@ -3210,6 +3210,63 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option custom_defines.push_back(std::make_pair("afpmat4", "mat4")); } + if (opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("lfp", "float16_t")); + custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4")); + } + else if (opt.use_fp16_storage || opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("lfp", "float")); + custom_defines.push_back(std::make_pair("lfpvec4", "uvec2")); + } + else + { + custom_defines.push_back(std::make_pair("lfp", "float")); + custom_defines.push_back(std::make_pair("lfpvec4", "vec4")); + } + + if (opt.use_fp16_storage && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); + custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "v")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v")); + } + else if (opt.use_fp16_packed && opt.use_fp16_arithmetic) + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); + custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "f16vec4(vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y)))")); + } + else if (opt.use_fp16_storage) + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); + custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "uvec2(packHalf2x16(vec4(v).rg),packHalf2x16(vec4(v).ba))")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "v")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))")); + } + else if (opt.use_fp16_packed) + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); + 
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "v")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))")); + } + else + { + custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v")); + custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v")); + + custom_defines.push_back(std::make_pair("lfp2afp(v)", "v")); + custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v")); + } + if (opt.use_fp16_storage && opt.use_fp16_arithmetic) { custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]")); @@ -3546,6 +3603,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option } } + if (opt.use_shader_local_memory) + { + custom_defines.push_back(std::make_pair("NCNN_shader_local_memory", "1")); + } + std::string preamble; std::vector processes; diff --git a/src/layer/vulkan/convolution_vulkan.cpp b/src/layer/vulkan/convolution_vulkan.cpp index 8c174ec0a..900be0008 100644 --- a/src/layer/vulkan/convolution_vulkan.cpp +++ b/src/layer/vulkan/convolution_vulkan.cpp @@ -180,16 +180,16 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) block_y = outh_bordered / 4; shape_winograd_bordered = Mat(w_bordered, h_bordered, shape.c, (void*)0); - shape_winograd_input_transformed = Mat(36, block_x * block_y, shape.c, (void*)0); - shape_winograd_gemm = Mat(36, block_x * block_y, out_shape.c, (void*)0); + shape_winograd_input_transformed = Mat(block_x * block_y, shape.c, 36, (void*)0); + shape_winograd_gemm = Mat(block_x * block_y, out_shape.c, 36, (void*)0); shape_winograd_out_bordered = Mat(outw_bordered, outh_bordered, out_shape.c, (void*)0); } if (shape_winograd_bordered.dims == 3) shape_winograd_bordered_packed = Mat(shape_winograd_bordered.w, shape_winograd_bordered.h, shape_winograd_bordered.c / elempack, (void*)0, elemsize, elempack); - if (shape_winograd_input_transformed.dims == 3) 
shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h, shape_winograd_input_transformed.c / elempack, (void*)0, elemsize, elempack); + if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h / elempack, 36, (void*)0, elemsize, elempack); - if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h, shape_winograd_gemm.c / out_elempack, (void*)0, out_elemsize, out_elempack); + if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h / out_elempack, 36, (void*)0, out_elemsize, out_elempack); if (shape_winograd_out_bordered.dims == 3) shape_winograd_out_bordered_packed = Mat(shape_winograd_out_bordered.w, shape_winograd_out_bordered.h, shape_winograd_out_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack); @@ -203,7 +203,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) opt.use_image_storage = false; } - Mat weight_data_packed_tm(36, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + Mat weight_data_packed_tm(num_input / elempack, num_output / out_elempack, 36, (size_t)4 * elempack * out_elempack, elempack * out_elempack); if (!vkdev->shape_support_image_storage(weight_data_packed_tm)) { support_image_storage = false; @@ -231,16 +231,16 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) block_y = outh_bordered / 2; shape_winograd_bordered = Mat(w_bordered, h_bordered, shape.c, (void*)0); - shape_winograd_input_transformed = Mat(16, block_x * block_y, shape.c, (void*)0); - shape_winograd_gemm = Mat(16, block_x * block_y, out_shape.c, (void*)0); + shape_winograd_input_transformed = Mat(block_x * block_y, shape.c, 16, (void*)0); + shape_winograd_gemm = Mat(block_x * block_y, out_shape.c, 16, 
(void*)0); shape_winograd_out_bordered = Mat(outw_bordered, outh_bordered, out_shape.c, (void*)0); } if (shape_winograd_bordered.dims == 3) shape_winograd_bordered_packed = Mat(shape_winograd_bordered.w, shape_winograd_bordered.h, shape_winograd_bordered.c / elempack, (void*)0, elemsize, elempack); - if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h, shape_winograd_input_transformed.c / elempack, (void*)0, elemsize, elempack); + if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h / elempack, 16, (void*)0, elemsize, elempack); - if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h, shape_winograd_gemm.c / out_elempack, (void*)0, out_elemsize, out_elempack); + if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h / out_elempack, 16, (void*)0, out_elemsize, out_elempack); if (shape_winograd_out_bordered.dims == 3) shape_winograd_out_bordered_packed = Mat(shape_winograd_out_bordered.w, shape_winograd_out_bordered.h, shape_winograd_out_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack); @@ -254,7 +254,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) opt.use_image_storage = false; } - Mat weight_data_packed_tm(16, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + Mat weight_data_packed_tm(num_input / elempack, num_output / out_elempack, 16, (size_t)4 * elempack * out_elempack, elempack * out_elempack); if (!vkdev->shape_support_image_storage(weight_data_packed_tm)) { support_image_storage = false; @@ -273,10 +273,10 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) // im2col + gemm if (shape.dims != 0 && out_shape.dims != 0) 
{ - shape_col = Mat(out_shape.w * out_shape.h, kernel_w * kernel_h, shape.c, (void*)0); + shape_col = Mat(out_shape.w * out_shape.h, kernel_w * kernel_h * shape.c, (void*)0); } - Mat shape_col_packed = Mat(shape_col.w, shape_col.h, shape_col.c / elempack, (void*)0, elemsize, elempack); + if (shape_col.dims == 2) shape_col_packed = Mat(shape_col.w, shape_col.h / elempack, (void*)0, elemsize, elempack); // check blob shape if (!vkdev->shape_support_image_storage(shape_col_packed)) @@ -363,7 +363,14 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1; pipeline_convolution_1x1s1d1 = new Pipeline(vkdev); - pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack)); + if (opt.use_shader_local_memory) + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8); + } + else + { + pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack)); + } pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations); } else if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 32 && num_output >= 32 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8))) @@ -433,19 +440,27 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } { - std::vector specializations(0 + 5); - specializations[0 + 0].i = shape_winograd_input_transformed_packed.c; - specializations[0 + 1].i = shape_winograd_input_transformed_packed.cstep; - specializations[0 + 2].i = shape_winograd_gemm_packed.h; - specializations[0 + 3].i = shape_winograd_gemm_packed.c; - specializations[0 + 4].i = shape_winograd_gemm_packed.cstep; + std::vector specializations(1 + 5); + specializations[0].i = 36; + specializations[1 + 0].i = shape_winograd_input_transformed_packed.h; + specializations[1 + 1].i = shape_winograd_input_transformed_packed.cstep; + specializations[1 + 2].i = 
shape_winograd_gemm_packed.w; + specializations[1 + 3].i = shape_winograd_gemm_packed.h; + specializations[1 + 4].i = shape_winograd_gemm_packed.cstep; int shader_type_index = -1; - if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_gemm; - if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd43_gemm; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd_gemm; pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev); - pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(4, 4, std::min(4, num_output / out_elempack)); + if (opt.use_shader_local_memory) + { + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(4, std::min(4, num_output / out_elempack), 4); + } pipeline_convolution_3x3s1d1_winograd43_gemm->create(shader_type_index, opt, specializations); } @@ -539,19 +554,27 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } { - std::vector specializations(0 + 5); - specializations[0 + 0].i = shape_winograd_input_transformed_packed.c; - specializations[0 + 1].i = shape_winograd_input_transformed_packed.cstep; - specializations[0 + 2].i = shape_winograd_gemm_packed.h; - specializations[0 + 3].i = shape_winograd_gemm_packed.c; - specializations[0 + 4].i = shape_winograd_gemm_packed.cstep; + std::vector specializations(1 + 5); + specializations[0].i = 16; + specializations[1 + 0].i = shape_winograd_input_transformed_packed.h; + specializations[1 + 1].i = shape_winograd_input_transformed_packed.cstep; + specializations[1 + 2].i = shape_winograd_gemm_packed.w; + specializations[1 + 3].i = shape_winograd_gemm_packed.h; + specializations[1 + 
4].i = shape_winograd_gemm_packed.cstep; int shader_type_index = -1; - if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_gemm; - if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd23_gemm; + if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm; + if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd_gemm; pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev); - pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(4, 4, std::min(4, num_output / out_elempack)); + if (opt.use_shader_local_memory) + { + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(4, std::min(4, num_output / out_elempack), 4); + } pipeline_convolution_3x3s1d1_winograd23_gemm->create(shader_type_index, opt, specializations); } @@ -561,7 +584,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) specializations[1].i = activation_type; specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f; specializations[3].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; - specializations[4 + 0].i = shape_winograd_gemm_packed.c; + specializations[4 + 0].i = shape_winograd_gemm_packed.h; specializations[4 + 1].i = shape_winograd_gemm_packed.cstep; specializations[4 + 2].i = block_x; specializations[4 + 3].i = block_y; @@ -581,30 +604,25 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && (num_input >= 16 && num_output >= 16)) { { - std::vector specializations(6 + 10); + std::vector specializations(6 + 6); specializations[0].i = kernel_w; specializations[1].i = kernel_h; specializations[2].i = dilation_w; specializations[3].i = dilation_h; specializations[4].i = stride_w; specializations[5].i = stride_h; - specializations[6 + 0].i = shape_bordered_packed.dims; - specializations[6 + 1].i = shape_bordered_packed.w; - specializations[6 + 2].i = shape_bordered_packed.h; - specializations[6 + 3].i = shape_bordered_packed.c; - specializations[6 + 4].i = shape_bordered_packed.cstep; - specializations[6 + 5].i = shape_col_packed.dims; - specializations[6 + 6].i = out_shape_packed.w; - specializations[6 + 7].i = out_shape_packed.h; - specializations[6 + 8].i = shape_col_packed.c; - specializations[6 + 9].i = shape_col_packed.cstep; - - Mat local_size_xyz(8, 1, std::min(4, num_input / elempack), (void*)0); + specializations[6 + 0].i = shape_bordered_packed.w; + specializations[6 + 1].i = shape_bordered_packed.h; + specializations[6 + 2].i = shape_bordered_packed.c; + specializations[6 + 3].i = shape_bordered_packed.cstep; + specializations[6 + 4].i = out_shape_packed.w; + specializations[6 + 5].i = out_shape_packed.h; + + Mat local_size_xyz(8, std::min(4, num_input / elempack), 1, (void*)0); if (shape_col_packed.dims != 0) { local_size_xyz.w = std::min(8, shape_col_packed.w); - local_size_xyz.h = std::min(1, shape_col_packed.h); - local_size_xyz.c = std::min(4, shape_col_packed.c); + local_size_xyz.h = std::min(4, shape_col_packed.h); } int 
shader_type_index = -1; @@ -618,30 +636,25 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) } { - std::vector specializations(6 + 10); + std::vector specializations(6 + 6); specializations[0].i = kernel_w; specializations[1].i = kernel_h; specializations[2].i = bias_term; specializations[3].i = activation_type; specializations[4].f = activation_params.w >= 1 ? activation_params[0] : 0.f; specializations[5].f = activation_params.w == 2 ? activation_params[1] : 0.f; - specializations[6 + 0].i = shape_col_packed.dims; - specializations[6 + 1].i = out_shape_packed.w; - specializations[6 + 2].i = out_shape_packed.h; - specializations[6 + 3].i = shape_col_packed.c; - specializations[6 + 4].i = shape_col_packed.cstep; - specializations[6 + 5].i = out_shape_packed.dims; - specializations[6 + 6].i = out_shape_packed.w; - specializations[6 + 7].i = out_shape_packed.h; - specializations[6 + 8].i = out_shape_packed.c; - specializations[6 + 9].i = out_shape_packed.cstep; - - Mat local_size_xyz(16, 1, std::min(4, num_output / out_elempack), (void*)0); + specializations[6 + 0].i = shape_col_packed.w; + specializations[6 + 1].i = shape_col_packed.h; + specializations[6 + 2].i = out_shape_packed.w; + specializations[6 + 3].i = out_shape_packed.h; + specializations[6 + 4].i = out_shape_packed.c; + specializations[6 + 5].i = out_shape_packed.cstep; + + Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0); if (out_shape_packed.dims != 0) { - local_size_xyz.w = std::min(16, out_shape_packed.w); - local_size_xyz.h = std::min(1, out_shape_packed.h); - local_size_xyz.c = std::min(4, out_shape_packed.c); + local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h); + local_size_xyz.h = std::min(4, out_shape_packed.c); } int shader_type_index = -1; @@ -656,7 +669,14 @@ int Convolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm; 
pipeline_convolution_gemm = new Pipeline(vkdev); - pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + if (opt.use_shader_local_memory) + { + pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + } pipeline_convolution_gemm->create(shader_type_index, opt, specializations); } } @@ -764,9 +784,43 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1; int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1; + bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1; + // src = kw-kh-inch-outch // dst = pa-pb-kw-kh-inch/pa-outch/pb Mat weight_data_packed; + if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && (num_input >= 16 && num_output >= 16)) + { + Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); + + weight_data_packed.create(maxk * num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + float* g00 = weight_data_packed.row(q / out_elempack); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int k = 0; k < maxk; k++) + { + for (int i = 0; i < out_elempack; i++) + { + const Mat k0 = weight_data_r2.channel(q + i); + + for (int j = 0; j < elempack; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + g00++; + } + } + } + } + } + } + else { Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output); @@ -860,18 +914,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } // src = 36-inch-outch - // dst = 8a-8b-36-inch/8a-outch/8b + // dst = 8a-8b-inch/8a-outch/8b-36 Mat weight_data_tm_packed; { - 
weight_data_tm_packed.create(36, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + weight_data_tm_packed.create(num_input / elempack, num_output / out_elempack, 36, (size_t)4 * elempack * out_elempack, elempack * out_elempack); - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + for (int k = 0; k < 36; k++) { - float* g00 = weight_data_tm_packed.channel(q / out_elempack); + float* g00 = weight_data_tm_packed.channel(k); - for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - for (int k = 0; k < 36; k++) + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { for (int i = 0; i < out_elempack; i++) { @@ -950,18 +1004,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) } // src = 16-inch-outch - // dst = 8a-8b-16-inch/8a-outch/8b + // dst = 8a-8b-inch/8a-outch/8b-16 Mat weight_data_tm_packed; { - weight_data_tm_packed.create(16, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + weight_data_tm_packed.create(num_input / elempack, num_output / out_elempack, 16, (size_t)4 * elempack * out_elempack, elempack * out_elempack); - for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + for (int k = 0; k < 16; k++) { - float* g00 = weight_data_tm_packed.channel(q / out_elempack); + float* g00 = weight_data_tm_packed.channel(k); - for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) { - for (int k = 0; k < 16; k++) + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) { for (int i = 0; i < out_elempack; i++) { @@ -1156,7 +1210,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // transform input VkMat bottom_tm_blob; { - bottom_tm_blob.create(36, block_x * 
block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_tm_blob.create(block_x * block_y, channels, 36, elemsize, elempack, opt.workspace_vkallocator); if (bottom_tm_blob.empty()) return -100; @@ -1176,7 +1230,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom VkMat dispatcher; dispatcher.w = block_x; dispatcher.h = block_y; - dispatcher.c = bottom_tm_blob.c; + dispatcher.c = bottom_tm_blob.h; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_transform_input, bindings, constants, dispatcher); } @@ -1184,7 +1238,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // gemm VkMat top_tm_blob; { - top_tm_blob.create(36, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + top_tm_blob.create(block_x * block_y, num_output / out_elempack, 36, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_tm_blob.empty()) return -100; @@ -1194,16 +1248,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = weight_data_gpu_tm; std::vector constants(5); - constants[0].i = bottom_tm_blob.c; + constants[0].i = bottom_tm_blob.h; constants[1].i = bottom_tm_blob.cstep; - constants[2].i = top_tm_blob.h; - constants[3].i = top_tm_blob.c; + constants[2].i = top_tm_blob.w; + constants[3].i = top_tm_blob.h; constants[4].i = top_tm_blob.cstep; VkMat dispatcher; - dispatcher.w = top_tm_blob.w; - dispatcher.h = (top_tm_blob.h + 3) / 4; - dispatcher.c = top_tm_blob.c; + dispatcher.w = (top_tm_blob.w + 3) / 4; + dispatcher.h = top_tm_blob.h; + dispatcher.c = 36; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_gemm, bindings, constants, dispatcher); } @@ -1221,7 +1275,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = bias_data_gpu; std::vector constants(7); - constants[0].i = top_tm_blob.c; + constants[0].i = top_tm_blob.h; 
constants[1].i = top_tm_blob.cstep; constants[2].i = block_x; constants[3].i = block_y; @@ -1299,7 +1353,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // transform input VkMat bottom_tm_blob; { - bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_tm_blob.create(block_x * block_y, channels, 16, elemsize, elempack, opt.workspace_vkallocator); if (bottom_tm_blob.empty()) return -100; @@ -1319,7 +1373,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom VkMat dispatcher; dispatcher.w = block_x; dispatcher.h = block_y; - dispatcher.c = bottom_tm_blob.c; + dispatcher.c = bottom_tm_blob.h; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); } @@ -1327,7 +1381,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // gemm VkMat top_tm_blob; { - top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + top_tm_blob.create(block_x * block_y, num_output / out_elempack, 16, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_tm_blob.empty()) return -100; @@ -1337,16 +1391,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = weight_data_gpu_tm; std::vector constants(5); - constants[0].i = bottom_tm_blob.c; + constants[0].i = bottom_tm_blob.h; constants[1].i = bottom_tm_blob.cstep; - constants[2].i = top_tm_blob.h; - constants[3].i = top_tm_blob.c; + constants[2].i = top_tm_blob.w; + constants[3].i = top_tm_blob.h; constants[4].i = top_tm_blob.cstep; VkMat dispatcher; - dispatcher.w = top_tm_blob.w; - dispatcher.h = (top_tm_blob.h + 3) / 4; - dispatcher.c = top_tm_blob.c; + dispatcher.w = (top_tm_blob.w + 3) / 4; + dispatcher.h = top_tm_blob.h; + dispatcher.c = 16; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_gemm, 
bindings, constants, dispatcher); } @@ -1364,7 +1418,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = bias_data_gpu; std::vector constants(7); - constants[0].i = top_tm_blob.c; + constants[0].i = top_tm_blob.h; constants[1].i = top_tm_blob.cstep; constants[2].i = block_x; constants[3].i = block_y; @@ -1410,7 +1464,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom // im2col VkMat bottom_blob_col; { - bottom_blob_col.create(outw * outh, maxk, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_blob_col.create(outw * outh, maxk * channels, elemsize, elempack, opt.workspace_vkallocator); if (bottom_blob_col.empty()) return -100; @@ -1418,17 +1472,13 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[0] = bottom_blob_bordered; bindings[1] = bottom_blob_col; - std::vector constants(10); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w; - constants[2].i = bottom_blob_bordered.h; - constants[3].i = bottom_blob_bordered.c; - constants[4].i = bottom_blob_bordered.cstep; - constants[5].i = bottom_blob_col.dims; - constants[6].i = outw; - constants[7].i = outh; - constants[8].i = bottom_blob_col.c; - constants[9].i = bottom_blob_col.cstep; + std::vector constants(6); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = bottom_blob_bordered.cstep; + constants[4].i = outw; + constants[5].i = outh; cmd.record_pipeline(pipeline_convolution_im2col, bindings, constants, bottom_blob_col); } @@ -1445,22 +1495,18 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom bindings[2] = weight_data_gpu; bindings[3] = bias_data_gpu; - std::vector constants(10); - constants[0].i = bottom_blob_col.dims; - constants[1].i = outw; - constants[2].i = outh; - constants[3].i = bottom_blob_col.c; - 
constants[4].i = bottom_blob_col.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = top_blob.cstep; + std::vector constants(6); + constants[0].i = bottom_blob_col.w; + constants[1].i = bottom_blob_col.h; + constants[2].i = top_blob.w; + constants[3].i = top_blob.h; + constants[4].i = top_blob.c; + constants[5].i = top_blob.cstep; VkMat dispatcher; dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = 1; - dispatcher.c = top_blob.c; + dispatcher.h = top_blob.c; + dispatcher.c = 1; cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); } @@ -1651,7 +1697,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b // transform input VkImageMat bottom_tm_blob; { - bottom_tm_blob.create(36, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_tm_blob.create(block_x * block_y, channels, 36, elemsize, elempack, opt.workspace_vkallocator); if (bottom_tm_blob.empty()) return -100; @@ -1671,7 +1717,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b VkImageMat dispatcher; dispatcher.w = block_x; dispatcher.h = block_y; - dispatcher.c = bottom_tm_blob.c; + dispatcher.c = bottom_tm_blob.h; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_transform_input, bindings, constants, dispatcher); } @@ -1679,7 +1725,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b // gemm VkImageMat top_tm_blob; { - top_tm_blob.create(36, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + top_tm_blob.create(block_x * block_y, num_output / out_elempack, 36, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_tm_blob.empty()) return -100; @@ -1689,16 +1735,16 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b bindings[2] = 
weight_data_gpu_tm_image; std::vector constants(5); - constants[0].i = bottom_tm_blob.c; + constants[0].i = bottom_tm_blob.h; constants[1].i = 0; //bottom_tm_blob.cstep; - constants[2].i = top_tm_blob.h; - constants[3].i = top_tm_blob.c; + constants[2].i = top_tm_blob.w; + constants[3].i = top_tm_blob.h; constants[4].i = 0; //top_tm_blob.cstep; VkImageMat dispatcher; - dispatcher.w = top_tm_blob.w; - dispatcher.h = (top_tm_blob.h + 3) / 4; - dispatcher.c = top_tm_blob.c; + dispatcher.w = (top_tm_blob.w + 3) / 4; + dispatcher.h = top_tm_blob.h; + dispatcher.c = 36; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_gemm, bindings, constants, dispatcher); } @@ -1716,7 +1762,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b bindings[2] = bias_data_gpu_image; std::vector constants(7); - constants[0].i = top_tm_blob.c; + constants[0].i = top_tm_blob.h; constants[1].i = 0; //top_tm_blob.cstep; constants[2].i = block_x; constants[3].i = block_y; @@ -1794,7 +1840,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b // transform input VkImageMat bottom_tm_blob; { - bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_tm_blob.create(block_x * block_y, channels, 16, elemsize, elempack, opt.workspace_vkallocator); if (bottom_tm_blob.empty()) return -100; @@ -1814,7 +1860,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b VkImageMat dispatcher; dispatcher.w = block_x; dispatcher.h = block_y; - dispatcher.c = bottom_tm_blob.c; + dispatcher.c = bottom_tm_blob.h; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher); } @@ -1822,7 +1868,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b // gemm VkImageMat top_tm_blob; { - top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, 
opt.workspace_vkallocator); + top_tm_blob.create(block_x * block_y, num_output / out_elempack, 16, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_tm_blob.empty()) return -100; @@ -1834,14 +1880,14 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b std::vector constants(5); - constants[0].i = bottom_tm_blob.c; + constants[0].i = bottom_tm_blob.h; constants[1].i = 0; //bottom_tm_blob.cstep; - constants[2].i = top_tm_blob.h; - constants[3].i = top_tm_blob.c; + constants[2].i = top_tm_blob.w; + constants[3].i = top_tm_blob.h; constants[4].i = 0; //top_tm_blob.cstep; VkImageMat dispatcher; - dispatcher.w = top_tm_blob.w; - dispatcher.h = (top_tm_blob.h + 3) / 4; - dispatcher.c = top_tm_blob.c; + dispatcher.w = (top_tm_blob.w + 3) / 4; + dispatcher.h = top_tm_blob.h; + dispatcher.c = 16; cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher); } @@ -1859,7 +1905,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b bindings[2] = bias_data_gpu_image; std::vector constants(7); - constants[0].i = top_tm_blob.c; + constants[0].i = top_tm_blob.h; constants[1].i = 0; //top_tm_blob.cstep; constants[2].i = block_x; constants[3].i = block_y; @@ -1905,7 +1951,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b // im2col VkImageMat bottom_blob_col; { - bottom_blob_col.create(outw * outh, maxk, channels, elemsize, elempack, opt.workspace_vkallocator); + bottom_blob_col.create(outw * outh, maxk * channels, elemsize, elempack, opt.workspace_vkallocator); if (bottom_blob_col.empty()) return -100; @@ -1913,17 +1959,13 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b bindings[0] = bottom_blob_bordered; bindings[1] = bottom_blob_col; - std::vector constants(10); - constants[0].i = bottom_blob_bordered.dims; - constants[1].i = bottom_blob_bordered.w; - constants[2].i = bottom_blob_bordered.h; - constants[3].i = bottom_blob_bordered.c; - 
constants[4].i = 0; // bottom_blob_bordered.cstep; - constants[5].i = bottom_blob_col.dims; - constants[6].i = outw; - constants[7].i = outh; - constants[8].i = bottom_blob_col.c; - constants[9].i = 0; // bottom_blob_col.cstep; + std::vector constants(6); + constants[0].i = bottom_blob_bordered.w; + constants[1].i = bottom_blob_bordered.h; + constants[2].i = bottom_blob_bordered.c; + constants[3].i = 0; // bottom_blob_bordered.cstep; + constants[4].i = outw; + constants[5].i = outh; cmd.record_pipeline(pipeline_convolution_im2col, bindings, constants, bottom_blob_col); } @@ -1940,22 +1982,18 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b bindings[2] = weight_data_gpu_image; bindings[3] = bias_data_gpu_image; - std::vector constants(10); - constants[0].i = bottom_blob_col.dims; - constants[1].i = outw; - constants[2].i = outh; - constants[3].i = bottom_blob_col.c; - constants[4].i = 0; // bottom_blob_col.cstep; - constants[5].i = top_blob.dims; - constants[6].i = top_blob.w; - constants[7].i = top_blob.h; - constants[8].i = top_blob.c; - constants[9].i = 0; // top_blob.cstep; + std::vector constants(6); + constants[0].i = bottom_blob_col.w; + constants[1].i = bottom_blob_col.h; + constants[2].i = top_blob.w; + constants[3].i = top_blob.h; + constants[4].i = top_blob.c; + constants[5].i = 0; // top_blob.cstep; VkImageMat dispatcher; dispatcher.w = (top_blob.w * top_blob.h + 3) / 4; - dispatcher.h = 1; - dispatcher.c = top_blob.c; + dispatcher.h = top_blob.c; + dispatcher.c = 1; cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher); } diff --git a/src/layer/vulkan/deconvolution_vulkan.cpp b/src/layer/vulkan/deconvolution_vulkan.cpp index 13c42c5f5..ac9897df7 100644 --- a/src/layer/vulkan/deconvolution_vulkan.cpp +++ b/src/layer/vulkan/deconvolution_vulkan.cpp @@ -144,10 +144,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) Mat out_shape_col; if (shape.dims != 0 && out_shape.dims != 0) { 
- out_shape_col = Mat(shape.w * shape.h, kernel_w * kernel_h, out_shape.c, (void*)0); + out_shape_col = Mat(shape.w * shape.h, kernel_w * kernel_h * out_shape.c, (void*)0); } - Mat out_shape_col_packed = Mat(out_shape_col.w, out_shape_col.h, out_shape_col.c / out_elempack, (void*)0, out_elemsize, out_elempack); + Mat out_shape_col_packed; + if (out_shape_col.dims == 2) out_shape_col_packed = Mat(out_shape_col.w, out_shape_col.h / out_elempack, (void*)0, out_elemsize, out_elempack); // check blob shape if (!vkdev->shape_support_image_storage(out_shape_col_packed)) @@ -157,26 +158,19 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) } { - std::vector specializations(2 + 10); - specializations[0].i = kernel_w; - specializations[1].i = kernel_h; - specializations[2 + 0].i = shape_packed.dims; - specializations[2 + 1].i = shape_packed.w; - specializations[2 + 2].i = shape_packed.h; - specializations[2 + 3].i = shape_packed.c; - specializations[2 + 4].i = shape_packed.cstep; - specializations[2 + 5].i = out_shape_col_packed.dims; - specializations[2 + 6].i = out_shape_col_packed.w; - specializations[2 + 7].i = out_shape_col_packed.h; - specializations[2 + 8].i = out_shape_col_packed.c; - specializations[2 + 9].i = out_shape_col_packed.cstep; - - Mat local_size_xyz(8, 4, std::min(4, num_output / out_elempack), (void*)0); + std::vector specializations(0 + 6); + specializations[0 + 0].i = shape_packed.w; + specializations[0 + 1].i = shape_packed.h; + specializations[0 + 2].i = shape_packed.c; + specializations[0 + 3].i = shape_packed.cstep; + specializations[0 + 4].i = out_shape_col_packed.w; + specializations[0 + 5].i = out_shape_col_packed.h; + + Mat local_size_xyz(8, std::min(4, num_output / out_elempack), 1, (void*)0); if (out_shape_col_packed.dims != 0) { local_size_xyz.w = std::min(8, out_shape_col_packed.w); local_size_xyz.h = std::min(4, out_shape_col_packed.h); - local_size_xyz.c = std::min(4, out_shape_col_packed.c); } int shader_type_index = -1; @@ 
-191,12 +185,19 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::deconvolution_pack8to4_gemm; pipeline_deconvolution_gemm = new Pipeline(vkdev); - pipeline_deconvolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + if (opt.use_shader_local_memory) + { + pipeline_deconvolution_gemm->set_local_size_xyz(8, 8, 1); + } + else + { + pipeline_deconvolution_gemm->set_optimal_local_size_xyz(local_size_xyz); + } pipeline_deconvolution_gemm->create(shader_type_index, opt, specializations); } { - std::vector specializations(10 + 10); + std::vector specializations(10 + 6); specializations[0].i = kernel_w; specializations[1].i = kernel_h; specializations[2].i = dilation_w; @@ -207,16 +208,12 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt) specializations[7].i = activation_type; specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f; specializations[9].f = activation_params.w == 2 ? 
activation_params[1] : 0.f; - specializations[10 + 0].i = out_shape_col_packed.dims; - specializations[10 + 1].i = shape_packed.w; - specializations[10 + 2].i = shape_packed.h; - specializations[10 + 3].i = out_shape_col_packed.c; - specializations[10 + 4].i = out_shape_col_packed.cstep; - specializations[10 + 5].i = out_shape_bordered_packed.dims; - specializations[10 + 6].i = out_shape_bordered_packed.w; - specializations[10 + 7].i = out_shape_bordered_packed.h; - specializations[10 + 8].i = out_shape_bordered_packed.c; - specializations[10 + 9].i = out_shape_bordered_packed.cstep; + specializations[10 + 0].i = shape_packed.w; + specializations[10 + 1].i = shape_packed.h; + specializations[10 + 2].i = out_shape_bordered_packed.w; + specializations[10 + 3].i = out_shape_bordered_packed.h; + specializations[10 + 4].i = out_shape_bordered_packed.c; + specializations[10 + 5].i = out_shape_bordered_packed.cstep; Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0); if (out_shape_bordered_packed.dims != 0) @@ -357,7 +354,40 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt) // src = kw-kh-inch-outch // dst = pa-pb-kw-kh-inch/pa-outch/pb + // dst = pa-pb-inch/pa-kw-kh-outch/pb (sgemm) Mat weight_data_packed; + if (opt.use_sgemm_convolution) + { + Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); + + weight_data_packed.create(num_input / elempack, maxk * num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack); + + for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack) + { + for (int k = 0; k < maxk; k++) + { + float* g00 = weight_data_packed.row(q / out_elempack * maxk + k); + + for (int p = 0; p + (elempack - 1) < num_input; p += elempack) + { + for (int i = 0; i < out_elempack; i++) + { + const Mat k0 = weight_data_r2.channel(q + i); + + for (int j = 0; j < elempack; j++) + { + const float* k00 = k0.row(p + j); + + g00[0] = k00[k]; + + 
g00++; + } + } + } + } + } + } + else { Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output); @@ -446,7 +476,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC // gemm VkMat top_blob_col; { - top_blob_col.create(w * h, maxk, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + top_blob_col.create(w * h, maxk * num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_blob_col.empty()) return -100; @@ -455,22 +485,18 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC bindings[1] = top_blob_col; bindings[2] = weight_data_gpu; - std::vector constants(10); - constants[0].i = bottom_blob.dims; - constants[1].i = bottom_blob.w; - constants[2].i = bottom_blob.h; - constants[3].i = bottom_blob.c; - constants[4].i = bottom_blob.cstep; - constants[5].i = top_blob_col.dims; - constants[6].i = top_blob_col.w; - constants[7].i = top_blob_col.h; - constants[8].i = top_blob_col.c; - constants[9].i = top_blob_col.cstep; + std::vector constants(6); + constants[0].i = bottom_blob.w; + constants[1].i = bottom_blob.h; + constants[2].i = bottom_blob.c; + constants[3].i = bottom_blob.cstep; + constants[4].i = top_blob_col.w; + constants[5].i = top_blob_col.h; VkMat dispatcher; dispatcher.w = (top_blob_col.w + 3) / 4; dispatcher.h = top_blob_col.h; - dispatcher.c = top_blob_col.c; + dispatcher.c = 1; cmd.record_pipeline(pipeline_deconvolution_gemm, bindings, constants, dispatcher); } @@ -493,17 +519,13 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC bindings[1] = top_blob_bordered; bindings[2] = bias_data_gpu; - std::vector constants(10); - constants[0].i = top_blob_col.dims; - constants[1].i = w; - constants[2].i = h; - constants[3].i = top_blob_col.c; - constants[4].i = top_blob_col.cstep; - constants[5].i = top_blob_bordered.dims; - constants[6].i = top_blob_bordered.w; - constants[7].i = 
top_blob_bordered.h; - constants[8].i = top_blob_bordered.c; - constants[9].i = top_blob_bordered.cstep; + std::vector constants(6); + constants[0].i = w; + constants[1].i = h; + constants[2].i = top_blob_bordered.w; + constants[3].i = top_blob_bordered.h; + constants[4].i = top_blob_bordered.c; + constants[5].i = top_blob_bordered.cstep; cmd.record_pipeline(pipeline_deconvolution_col2im, bindings, constants, top_blob_bordered); } @@ -644,7 +666,7 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top // gemm VkImageMat top_blob_col; { - top_blob_col.create(w * h, maxk, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); + top_blob_col.create(w * h, maxk * num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator); if (top_blob_col.empty()) return -100; @@ -653,22 +675,18 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top bindings[1] = top_blob_col; bindings[2] = weight_data_gpu_image; - std::vector constants(10); - constants[0].i = bottom_blob.dims; - constants[1].i = bottom_blob.w; - constants[2].i = bottom_blob.h; - constants[3].i = bottom_blob.c; - constants[4].i = 0; //bottom_blob.cstep; - constants[5].i = top_blob_col.dims; - constants[6].i = top_blob_col.w; - constants[7].i = top_blob_col.h; - constants[8].i = top_blob_col.c; - constants[9].i = 0; //top_blob_col.cstep; + std::vector constants(6); + constants[0].i = bottom_blob.w; + constants[1].i = bottom_blob.h; + constants[2].i = bottom_blob.c; + constants[3].i = 0; // bottom_blob.cstep; + constants[4].i = top_blob_col.w; + constants[5].i = top_blob_col.h; VkImageMat dispatcher; dispatcher.w = (top_blob_col.w + 3) / 4; dispatcher.h = top_blob_col.h; - dispatcher.c = top_blob_col.c; + dispatcher.c = 1; cmd.record_pipeline(pipeline_deconvolution_gemm, bindings, constants, dispatcher); } @@ -691,17 +709,13 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top 
bindings[1] = top_blob_bordered; bindings[2] = bias_data_gpu_image; - std::vector constants(10); - constants[0].i = top_blob_col.dims; - constants[1].i = w; - constants[2].i = h; - constants[3].i = top_blob_col.c; - constants[4].i = 0; //top_blob_col.cstep; - constants[5].i = top_blob_bordered.dims; - constants[6].i = top_blob_bordered.w; - constants[7].i = top_blob_bordered.h; - constants[8].i = top_blob_bordered.c; - constants[9].i = 0; //top_blob_bordered.cstep; + std::vector constants(6); + constants[0].i = w; + constants[1].i = h; + constants[2].i = top_blob_bordered.w; + constants[3].i = top_blob_bordered.h; + constants[4].i = top_blob_bordered.c; + constants[5].i = 0; //top_blob_bordered.cstep; cmd.record_pipeline(pipeline_deconvolution_col2im, bindings, constants, top_blob_bordered); } diff --git a/src/layer/vulkan/innerproduct_vulkan.cpp b/src/layer/vulkan/innerproduct_vulkan.cpp index fdbd6ba5a..3d94b7bed 100644 --- a/src/layer/vulkan/innerproduct_vulkan.cpp +++ b/src/layer/vulkan/innerproduct_vulkan.cpp @@ -27,14 +27,6 @@ InnerProduct_vulkan::InnerProduct_vulkan() flatten = 0; pipeline_innerproduct = 0; - pipeline_innerproduct_pack4 = 0; - pipeline_innerproduct_pack1to4 = 0; - pipeline_innerproduct_pack4to1 = 0; - pipeline_innerproduct_pack8 = 0; - pipeline_innerproduct_pack1to8 = 0; - pipeline_innerproduct_pack4to8 = 0; - pipeline_innerproduct_pack8to4 = 0; - pipeline_innerproduct_pack8to1 = 0; pipeline_innerproduct_gemm = 0; } @@ -110,64 +102,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) local_size_xyz.c = 1; } - { - pipeline_innerproduct_gemm = new Pipeline(vkdev); - pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz); - - // pack1 - if (in_elempack == 1 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations); - } - - // pack4 - if (in_elempack == 4 && out_elempack == 4) - { - 
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations); - } - - // pack1to4 - if (in_elempack == 1 && out_elempack == 4) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations); - } - - // pack4to1 - if (in_elempack == 4 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations); - } - - // pack8 - if (in_elempack == 8 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations); - } - - // pack1to8 - if (in_elempack == 1 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations); - } - - // pack4to8 - if (in_elempack == 4 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations); - } - - // pack8to4 - if (in_elempack == 8 && out_elempack == 4) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations); - } - - // pack8to1 - if (in_elempack == 8 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations); - } - } + int shader_type_index = -1; + if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm; + if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp4; + if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to4; + if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to1; + if (in_elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp8; + if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to8; + if (in_elempack == 8 && 
out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to1; + if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to8; + if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to4; + + pipeline_innerproduct_gemm = new Pipeline(vkdev); + pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations); return 0; } @@ -268,77 +216,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) local_size_xyz.c = 1; } - // pack1 - if (in_elempack == 1 && out_elempack == 1) - { - pipeline_innerproduct = new Pipeline(vkdev); - pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct->create(LayerShaderType::innerproduct, opt, specializations); - } - - // pack4 - if (in_elempack == 4 && out_elempack == 4) - { - pipeline_innerproduct_pack4 = new Pipeline(vkdev); - pipeline_innerproduct_pack4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack4->create(LayerShaderType::innerproduct_pack4, opt, specializations); - } - - // pack1to4 - if (in_elempack == 1 && out_elempack == 4) - { - pipeline_innerproduct_pack1to4 = new Pipeline(vkdev); - pipeline_innerproduct_pack1to4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack1to4->create(LayerShaderType::innerproduct_pack1to4, opt, specializations); - } + int shader_type_index = -1; + if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct; + if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack4; + if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack1to4; + if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_pack4to1; + if (in_elempack == 8 && out_elempack == 8) shader_type_index = 
LayerShaderType::innerproduct_pack8; + if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_pack1to8; + if (in_elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_pack8to1; + if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_pack4to8; + if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack8to4; - // pack4to1 - if (in_elempack == 4 && out_elempack == 1) - { - pipeline_innerproduct_pack4to1 = new Pipeline(vkdev); - pipeline_innerproduct_pack4to1->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack4to1->create(LayerShaderType::innerproduct_pack4to1, opt, specializations); - } - - // pack8 - if (in_elempack == 8 && out_elempack == 8) - { - pipeline_innerproduct_pack8 = new Pipeline(vkdev); - pipeline_innerproduct_pack8->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack8->create(LayerShaderType::innerproduct_pack8, opt, specializations); - } - - // pack1to8 - if (in_elempack == 1 && out_elempack == 8) - { - pipeline_innerproduct_pack1to8 = new Pipeline(vkdev); - pipeline_innerproduct_pack1to8->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack1to8->create(LayerShaderType::innerproduct_pack1to8, opt, specializations); - } - - // pack4to8 - if (in_elempack == 4 && out_elempack == 8) - { - pipeline_innerproduct_pack4to8 = new Pipeline(vkdev); - pipeline_innerproduct_pack4to8->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack4to8->create(LayerShaderType::innerproduct_pack4to8, opt, specializations); - } - - // pack8to4 - if (in_elempack == 8 && out_elempack == 4) - { - pipeline_innerproduct_pack8to4 = new Pipeline(vkdev); - pipeline_innerproduct_pack8to4->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack8to4->create(LayerShaderType::innerproduct_pack8to4, opt, specializations); - } - - // pack8to1 - if 
(in_elempack == 8 && out_elempack == 1) - { - pipeline_innerproduct_pack8to1 = new Pipeline(vkdev); - pipeline_innerproduct_pack8to1->set_optimal_local_size_xyz(local_size_xyz); - pipeline_innerproduct_pack8to1->create(LayerShaderType::innerproduct_pack8to1, opt, specializations); - } + pipeline_innerproduct = new Pipeline(vkdev); + pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct->create(shader_type_index, opt, specializations); // gemm for no shape hint if (shape.dims == 0) @@ -361,64 +252,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt) Mat local_size_xyz(std::min(16, num_output / out_elempack), 4, 1, (void*)0); - { - pipeline_innerproduct_gemm = new Pipeline(vkdev); - pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz); - - // pack1 - if (in_elempack == 1 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations); - } - - // pack4 - if (in_elempack == 4 && out_elempack == 4) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations); - } - - // pack1to4 - if (in_elempack == 1 && out_elempack == 4) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations); - } - - // pack4to1 - if (in_elempack == 4 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations); - } - - // pack8 - if (in_elempack == 8 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations); - } - - // pack1to8 - if (in_elempack == 1 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations); - } - - // pack4to8 - if (in_elempack == 4 && out_elempack == 8) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations); - } - - 
// pack8to4 - if (in_elempack == 8 && out_elempack == 4) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations); - } - - // pack8to1 - if (in_elempack == 8 && out_elempack == 1) - { - pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations); - } - } + int shader_type_index = -1; + if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm; + if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp4; + if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to4; + if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to1; + if (in_elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp8; + if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to8; + if (in_elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to1; + if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to8; + if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to4; + + pipeline_innerproduct_gemm = new Pipeline(vkdev); + pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz); + pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations); return 0; } @@ -438,30 +285,6 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt) delete pipeline_innerproduct; pipeline_innerproduct = 0; - delete pipeline_innerproduct_pack4; - pipeline_innerproduct_pack4 = 0; - - delete pipeline_innerproduct_pack1to4; - pipeline_innerproduct_pack1to4 = 0; - - delete pipeline_innerproduct_pack4to1; - pipeline_innerproduct_pack4to1 = 0; - - delete pipeline_innerproduct_pack8; - pipeline_innerproduct_pack8 = 
0; - - delete pipeline_innerproduct_pack1to8; - pipeline_innerproduct_pack1to8 = 0; - - delete pipeline_innerproduct_pack4to8; - pipeline_innerproduct_pack4to8 = 0; - - delete pipeline_innerproduct_pack8to4; - pipeline_innerproduct_pack8to4 = 0; - - delete pipeline_innerproduct_pack8to1; - pipeline_innerproduct_pack8to1 = 0; - delete pipeline_innerproduct_gemm; pipeline_innerproduct_gemm = 0; @@ -586,14 +409,12 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo constants[8].i = top_blob_unpacked.c; constants[9].i = top_blob_unpacked.cstep; - const Pipeline* pipeline = pipeline_innerproduct_gemm; - VkMat dispatcher; dispatcher.w = top_blob_unpacked.w / out_elempack; dispatcher.h = top_blob_unpacked.h; dispatcher.c = 1; - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + cmd.record_pipeline(pipeline_innerproduct_gemm, bindings, constants, dispatcher); // packing if (elempack > 1) @@ -645,45 +466,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo constants[8].i = top_blob.c; constants[9].i = top_blob.cstep; - const Pipeline* pipeline = 0; - if (in_elempack == 1 && out_elempack == 1) - { - pipeline = pipeline_innerproduct; - } - else if (in_elempack == 4 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack4; - } - else if (in_elempack == 1 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack1to4; - } - else if (in_elempack == 4 && out_elempack == 1) - { - pipeline = pipeline_innerproduct_pack4to1; - } - else if (in_elempack == 8 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack8; - } - else if (in_elempack == 1 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack1to8; - } - else if (in_elempack == 4 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack4to8; - } - else if (in_elempack == 8 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack8to4; - } - else if (in_elempack == 8 && out_elempack == 1) - { - 
pipeline = pipeline_innerproduct_pack8to1; - } - - cmd.record_pipeline(pipeline, bindings, constants, top_blob); + cmd.record_pipeline(pipeline_innerproduct, bindings, constants, top_blob); return 0; } @@ -742,14 +525,12 @@ int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_ constants[8].i = top_blob_unpacked.c; constants[9].i = 0; //top_blob_unpacked.cstep; - const Pipeline* pipeline = pipeline_innerproduct_gemm; - VkImageMat dispatcher; dispatcher.w = top_blob_unpacked.w / out_elempack; dispatcher.h = top_blob_unpacked.h; dispatcher.c = 1; - cmd.record_pipeline(pipeline, bindings, constants, dispatcher); + cmd.record_pipeline(pipeline_innerproduct_gemm, bindings, constants, dispatcher); // packing if (elempack > 1) @@ -801,45 +582,7 @@ int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_ constants[8].i = top_blob.c; constants[9].i = 0; //top_blob.cstep; - const Pipeline* pipeline = 0; - if (in_elempack == 1 && out_elempack == 1) - { - pipeline = pipeline_innerproduct; - } - else if (in_elempack == 4 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack4; - } - else if (in_elempack == 1 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack1to4; - } - else if (in_elempack == 4 && out_elempack == 1) - { - pipeline = pipeline_innerproduct_pack4to1; - } - else if (in_elempack == 8 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack8; - } - else if (in_elempack == 1 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack1to8; - } - else if (in_elempack == 4 && out_elempack == 8) - { - pipeline = pipeline_innerproduct_pack4to8; - } - else if (in_elempack == 8 && out_elempack == 4) - { - pipeline = pipeline_innerproduct_pack8to4; - } - else if (in_elempack == 8 && out_elempack == 1) - { - pipeline = pipeline_innerproduct_pack8to1; - } - - cmd.record_pipeline(pipeline, bindings, constants, top_blob); + cmd.record_pipeline(pipeline_innerproduct, bindings, constants, 
top_blob); return 0; } diff --git a/src/layer/vulkan/innerproduct_vulkan.h b/src/layer/vulkan/innerproduct_vulkan.h index 46db9e939..87c550889 100644 --- a/src/layer/vulkan/innerproduct_vulkan.h +++ b/src/layer/vulkan/innerproduct_vulkan.h @@ -43,14 +43,6 @@ public: VkImageMat bias_data_gpu_image; Pipeline* pipeline_innerproduct; - Pipeline* pipeline_innerproduct_pack4; - Pipeline* pipeline_innerproduct_pack1to4; - Pipeline* pipeline_innerproduct_pack4to1; - Pipeline* pipeline_innerproduct_pack8; - Pipeline* pipeline_innerproduct_pack1to8; - Pipeline* pipeline_innerproduct_pack4to8; - Pipeline* pipeline_innerproduct_pack8to4; - Pipeline* pipeline_innerproduct_pack8to1; Pipeline* pipeline_innerproduct_gemm; }; diff --git a/src/layer/vulkan/shader/convolution_gemm.comp b/src/layer/vulkan/shader/convolution_gemm.comp index 0f8660518..2f7e23b28 100644 --- a/src/layer/vulkan/shader/convolution_gemm.comp +++ b/src/layer/vulkan/shader/convolution_gemm.comp @@ -21,6 +21,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif +#define LOCAL_MEMORY_UNROLL_INCH 8 + layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; layout (constant_id = 2) const int bias_term = 0; @@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 
0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; int outcstep; } p; +#if NCNN_shader_local_memory +shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfp tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) return; +#endif afp sum0; afp sum1; @@ -85,9 +85,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld1(bias_data, gz); + sum0 = buffer_ld1(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -106,48 +106,131 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) + { + afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0)); + afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0)); + afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0)); + afp v3 = 
image3d_ld1(col_blob, ivec3(gx4.a, z, 0)); + + afp k = image3d_ld1(weight_blob, ivec3(z, gy, 0)); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gx; + int w_offset = gy * psc(h); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH) { - for (int kk = 0; kk < maxk; kk++) + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } + + if (lx == 0) { - afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z)); - afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z)); - afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, kk, z)); - afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]); + } + } + + barrier(); - afp k = image3d_ld1(weight_blob, ivec3(kk, z, gz)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afp k = lfp2afp(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); } -#else - int w_offset = gz * psc(c) * maxk; - for (int z = 0; z < psc(c); z++) + if (z < psc(h)) { - int v_offset = gx + z * psc(cstep); + const int remain = psc(h) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } - for (int kk = 0; kk < maxk; kk++) + if (lx == 0) { - afp v0 = buffer_ld1(col_blob_data, v_offset + 0); - afp v1 = buffer_ld1(col_blob_data, v_offset + 1); - afp v2 = 
buffer_ld1(col_blob_data, v_offset + 2); - afp v3 = buffer_ld1(col_blob_data, v_offset + 3); + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]); + } + } + + barrier(); - afp k = buffer_ld1(weight_data, w_offset); + for (int z4 = 0; z4 < remain; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afp k = lfp2afp(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; - - v_offset += psc(outw) * psc(outh); - w_offset += 1; } } +#else + for (int z = 0; z < psc(h); z++) + { + afp v0 = buffer_ld1(col_blob_data, v_offset + 0); + afp v1 = buffer_ld1(col_blob_data, v_offset + 1); + afp v2 = buffer_ld1(col_blob_data, v_offset + 2); + afp v3 = buffer_ld1(col_blob_data, v_offset + 3); + + afp k = buffer_ld1(weight_data, w_offset); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(w); + w_offset += 1; + } +#endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) + return; #endif if (activation_type == 1) @@ -202,16 +285,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st1(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2); - if (gx + 3 
< psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_im2col.comp b/src/layer/vulkan/shader/convolution_im2col.comp index 9e871a794..d7d7ba3b7 100644 --- a/src/layer/vulkan/shader/convolution_im2col.comp +++ b/src/layer/vulkan/shader/convolution_im2col.comp @@ -29,17 +29,13 @@ layout (constant_id = 4) const int stride_w = 1; layout (constant_id = 5) const int stride_h = 1; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -51,42 +47,40 @@ layout (binding = 1) writeonly buffer col_blob { sfp 
col_blob_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); const int maxk = kernel_w * kernel_h; - if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c)) return; - int sy = gx / psc(outw); - int sx = gx % psc(outw); + const int sy = gx / psc(outw); + const int sx = gx % psc(outw); + + const int sz = gy / maxk; + const int k = gy % maxk; - int ky = gy / kernel_w; - int kx = gy % kernel_w; + const int ky = k / kernel_w; + const int kx = k % kernel_w; #if NCNN_image_shader - image3d_cp1(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz)); + image3d_cp1(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz)); #else - const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; - const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx; + const int gi = gy * psc(outw) * psc(outh) + gx; buffer_cp1(col_blob_data, gi, bottom_blob_data, v_offset); #endif diff --git a/src/layer/vulkan/shader/convolution_pack1to4_gemm.comp b/src/layer/vulkan/shader/convolution_pack1to4_gemm.comp index 341c772e9..8e3cb9ce7 100644 --- a/src/layer/vulkan/shader/convolution_pack1to4_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack1to4_gemm.comp @@ -21,6 +21,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif +#define LOCAL_MEMORY_UNROLL_INCH 8 + layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 
1; layout (constant_id = 2) const int bias_term = 0; @@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; int outcstep; } p; +#if NCNN_shader_local_memory +shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = 
int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) return; +#endif afpvec4 sum0; afpvec4 sum1; @@ -85,9 +85,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld4(bias_data, gz); + sum0 = buffer_ld4(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -106,48 +106,131 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) + { + afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0)); + afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0)); + afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0)); + afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, z, 0)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0)); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gx; + int w_offset = gy * psc(h); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH) { - for (int kk = 0; kk < maxk; kk++) + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } + + if (lx == 0) { - afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z)); - afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z)); - afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, kk, z)); - afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); - afpvec4 k = image3d_ld4(weight_blob, ivec3(kk, z, gz)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afp 
v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); } -#else - int w_offset = gz * psc(c) * maxk; - for (int z = 0; z < psc(c); z++) + if (z < psc(h)) { - int v_offset = gx + z * psc(cstep); + const int remain = psc(h) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } - for (int kk = 0; kk < maxk; kk++) + if (lx == 0) { - afp v0 = buffer_ld1(col_blob_data, v_offset + 0); - afp v1 = buffer_ld1(col_blob_data, v_offset + 1); - afp v2 = buffer_ld1(col_blob_data, v_offset + 2); - afp v3 = buffer_ld1(col_blob_data, v_offset + 3); + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); - afpvec4 k = buffer_ld4(weight_data, w_offset); + for (int z4 = 0; z4 < remain; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; - - v_offset += psc(outw) * psc(outh); - w_offset += 1; } } +#else + for (int z = 0; z < psc(h); z++) + { + afp v0 = buffer_ld1(col_blob_data, v_offset + 0); + afp v1 = buffer_ld1(col_blob_data, v_offset + 1); + afp v2 = buffer_ld1(col_blob_data, v_offset + 2); + afp v3 = buffer_ld1(col_blob_data, v_offset + 3); + + afpvec4 k = buffer_ld4(weight_data, w_offset); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(w); + w_offset += 1; + } +#endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) + return; 
#endif if (activation_type == 1) @@ -202,16 +285,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack1to8_gemm.comp b/src/layer/vulkan/shader/convolution_pack1to8_gemm.comp index 0df90cb88..e33ff096d 100644 --- a/src/layer/vulkan/shader/convolution_pack1to8_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack1to8_gemm.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = 
shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -73,9 +65,8 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(w) || gy >= psc(outc)) return; afpvec8 sum0; @@ -86,9 +77,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld8(bias_data, gz); + sum0 = buffer_ld8(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -107,63 +98,56 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - for (int kk = 0; kk < maxk; kk++) - { - afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z)); - afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z)); - afp v2 = 
image3d_ld1(col_blob, ivec3(gx4.b, kk, z)); - afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z)); + afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0)); + afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0)); + afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0)); + afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, z, 0)); - afpvec8 k = image3d_ld8(weight_blob, ivec3(kk, z, gz)); + afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0)); - // sum += v * k; - sum0[0] += v0 * k[0]; - sum0[1] += v0 * k[1]; + // sum += v * k; + sum0[0] += v0 * k[0]; + sum0[1] += v0 * k[1]; - sum1[0] += v1 * k[0]; - sum1[1] += v1 * k[1]; + sum1[0] += v1 * k[0]; + sum1[1] += v1 * k[1]; - sum2[0] += v2 * k[0]; - sum2[1] += v2 * k[1]; + sum2[0] += v2 * k[0]; + sum2[1] += v2 * k[1]; - sum3[0] += v3 * k[0]; - sum3[1] += v3 * k[1]; - } + sum3[0] += v3 * k[0]; + sum3[1] += v3 * k[1]; } #else - int w_offset = gz * psc(c) * maxk; + int v_offset = gx; + int w_offset = gy * psc(h); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - int v_offset = gx + z * psc(cstep); + afp v0 = buffer_ld1(col_blob_data, v_offset + 0); + afp v1 = buffer_ld1(col_blob_data, v_offset + 1); + afp v2 = buffer_ld1(col_blob_data, v_offset + 2); + afp v3 = buffer_ld1(col_blob_data, v_offset + 3); - for (int kk = 0; kk < maxk; kk++) - { - afp v0 = buffer_ld1(col_blob_data, v_offset + 0); - afp v1 = buffer_ld1(col_blob_data, v_offset + 1); - afp v2 = buffer_ld1(col_blob_data, v_offset + 2); - afp v3 = buffer_ld1(col_blob_data, v_offset + 3); + afpvec8 k = buffer_ld8(weight_data, w_offset); - afpvec8 k = buffer_ld8(weight_data, w_offset); + // sum += v * k; + sum0[0] += v0 * k[0]; + sum0[1] += v0 * k[1]; - // sum += v * k; - sum0[0] += v0 * k[0]; - sum0[1] += v0 * k[1]; + sum1[0] += v1 * k[0]; + sum1[1] += v1 * k[1]; - sum1[0] += v1 * k[0]; - sum1[1] += v1 * k[1]; + sum2[0] += v2 * k[0]; + sum2[1] += v2 * k[1]; - sum2[0] += v2 * k[0]; - sum2[1] += v2 * k[1]; + sum3[0] += v3 * k[0]; + sum3[1] += v3 * k[1]; - sum3[0] 
+= v3 * k[0]; - sum3[1] += v3 * k[1]; - - v_offset += psc(outw) * psc(outh); - w_offset += 1; - } + v_offset += psc(w); + w_offset += 1; } #endif @@ -243,16 +227,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp index 797c227e8..df5e2e4b0 100644 --- a/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp +++ b/src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp @@ -21,6 +21,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif +#define LOCAL_MEMORY_UNROLL_INCH 8 + layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; layout (constant_id = 2) const int dilation_w = 1; @@ -53,12 +55,7 @@ layout (binding = 3) uniform unfp sampler3D bias_blob; #else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) 
writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) -// GL_EXT_shader_16bit_storage does not define f16mat4 type :( layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; -#else -layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; -#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; #endif @@ -77,22 +74,25 @@ layout (push_constant) uniform parameter int outcstep; } p; +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4]; +#endif + void main() { -#if NCNN_image_shader int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); +#if NCNN_image_shader if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) return; #else - int gx = int(gl_GlobalInvocationID.x) * 4; - int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - +#if !NCNN_shader_local_memory if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) return; +#endif #endif afpvec4 sum0; @@ -146,9 +146,103 @@ void main() sum3 += v3 * k; } #else - int w_offset = gz * psc(c); + int w_offset = gz * psc(c) * 4; int v_offset = gx; +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int lz = int(gl_LocalInvocationID.z); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (lz < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afpvec4 v0 = 
lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep); + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (lz < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + } +#else for (int z = 0; z < psc(c); z++) { afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0); @@ -156,26 +250,22 @@ void main() afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2); afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3); -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( afpmat4 k = afpmat4( - buffer_ld4(weight_data, w_offset * 4 + 0), - buffer_ld4(weight_data, w_offset * 4 + 1), - 
buffer_ld4(weight_data, w_offset * 4 + 2), - buffer_ld4(weight_data, w_offset * 4 + 3) + buffer_ld4(weight_data, w_offset + 0), + buffer_ld4(weight_data, w_offset + 1), + buffer_ld4(weight_data, w_offset + 2), + buffer_ld4(weight_data, w_offset + 3) ); -#else - afpmat4 k = sfp2afpmat4(weight_data[w_offset]); -#endif sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; - w_offset += 1; + w_offset += 4; v_offset += psc(cstep); } +#endif #endif if (activation_type == 1) @@ -232,6 +322,11 @@ void main() image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); #else +#if NCNN_shader_local_memory + if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc)) + return; +#endif + int gi = gz * psc(outcstep) + gx; buffer_st4(top_blob_data, gi + 0, sum0); diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp deleted file mode 100644 index 40211c64f..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp +++ /dev/null @@ -1,139 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. 
- -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#define shape_constant_id_offset 0 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -#if NCNN_image_shader -layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; -layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; -layout (binding = 2) uniform unfp sampler3D weight_tm_blob; -#else -layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) -// GL_EXT_shader_16bit_storage does not define f16mat4 type :( -layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; -#else -layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; -#endif -#endif - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outh; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y) * 4; - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) - return; - - afpvec4 sum0 = afpvec4(0.f); - afpvec4 sum1 = afpvec4(0.f); - afpvec4 sum2 = afpvec4(0.f); - afpvec4 sum3 = afpvec4(0.f); - -#if NCNN_image_shader - int wx = gx * 4; - - for (int z = 0; z < psc(c); z++) - { - afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); - afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 
1, z)); - afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); - afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); - - afpmat4 k = afpmat4( - image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) - ); - - sum0 += v0 * k; - sum1 += v1 * k; - sum2 += v2 * k; - sum3 += v3 * k; - } -#else - int v_offset = gy * 16 + gx; - int w_offset = gz * psc(c) * 16 + gx; - - for (int z = 0; z < psc(c); z++) - { - afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0); - afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 16); - afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 32); - afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 48); - -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - buffer_ld4(weight_tm_data, w_offset * 4 + 0), - buffer_ld4(weight_tm_data, w_offset * 4 + 1), - buffer_ld4(weight_tm_data, w_offset * 4 + 2), - buffer_ld4(weight_tm_data, w_offset * 4 + 3) - ); -#else - afpmat4 k = sfpmat4(weight_tm_data[w_offset]); -#endif - - sum0 += v0 * k; - sum1 += v1 * k; - sum2 += v2 * k; - sum3 += v3 * k; - - v_offset += psc(cstep); - w_offset += 16; - } -#endif - -#if NCNN_image_shader - image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); - image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); - image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); - image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); -#else - int gi = gz * psc(outcstep) + gy * 16 + gx; - - buffer_st4(top_tm_blob_data, gi + 0, sum0); - if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1); - if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2); - if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3); -#endif -} diff --git 
a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp index 8734d01de..7cc021036 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp @@ -59,7 +59,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 4x4 @@ -161,42 +161,42 @@ void main() // store 16 #if NCNN_image_shader - int y = gy * p.block_x + gx; - - image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); - image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); - image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); - image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); - image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10); - image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11); - image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12); - image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13); - image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20); - image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21); - image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22); - image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23); - image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30); - image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31); - image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32); - image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33); + int x = gy * psc(block_x) + gx; + + image3d_st4(bottom_tm_blob, ivec3(x, gz, 0), v00); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 1), v01); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 2), v02); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 3), v03); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 4), v10); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 5), v11); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 6), v12); + 
image3d_st4(bottom_tm_blob, ivec3(x, gz, 7), v13); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 8), v20); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 9), v21); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 10), v22); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 11), v23); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 12), v30); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 13), v31); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 14), v32); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 15), v33); #else - int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; - - buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v10); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, v11); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v12); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v13); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v20); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v21); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v22); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v23); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v30); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v10); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v11); + 
buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v12); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v13); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v20); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v21); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v22); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v23); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v30); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v31); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v32); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v33); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp index 03c4f3667..928b6c6ea 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp @@ -66,48 +66,48 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 16 #if NCNN_image_shader - int sy = gy * p.block_x + gx; - - afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); - afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); - afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); - afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); - afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); - afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); - afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); - afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); - afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); - afpvec4 v21 = 
image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); - afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); - afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); - afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); - afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); - afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); - afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); + int sx = gy * psc(block_x) + gx; + + afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 0)); + afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 1)); + afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 2)); + afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 3)); + afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 4)); + afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 5)); + afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 6)); + afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 7)); + afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 8)); + afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 9)); + afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 10)); + afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 11)); + afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 12)); + afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 13)); + afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 14)); + afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 15)); #else - int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; - - afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); - afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1); - afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2); - afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3); - afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4); - afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5); - afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6); - afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7); - 
afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8); - afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9); - afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10); - afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11); - afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12); - afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); - afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); - afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0 * psc(cstep)); + afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1 * psc(cstep)); + afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2 * psc(cstep)); + afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3 * psc(cstep)); + afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4 * psc(cstep)); + afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5 * psc(cstep)); + afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6 * psc(cstep)); + afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7 * psc(cstep)); + afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8 * psc(cstep)); + afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9 * psc(cstep)); + afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10 * psc(cstep)); + afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11 * psc(cstep)); + afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12 * psc(cstep)); + afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13 * psc(cstep)); + afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14 * psc(cstep)); + afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15 * psc(cstep)); #endif // const float itm[2][4] = { diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_gemm.comp 
b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_gemm.comp deleted file mode 100644 index a269c17f5..000000000 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_gemm.comp +++ /dev/null @@ -1,139 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. -// -// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#define shape_constant_id_offset 0 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -#if NCNN_image_shader -layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; -layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; -layout (binding = 2) uniform unfp sampler3D weight_tm_blob; -#else -layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; -#if NCNN_fp16_packed || 
(NCNN_fp16_storage && !NCNN_fp16_arithmetic) -// GL_EXT_shader_16bit_storage does not define f16mat4 type :( -layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; -#else -layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; }; -#endif -#endif - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outh; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y) * 4; - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 36 || gy >= psc(outh) || gz >= psc(outc)) - return; - - afpvec4 sum0 = afpvec4(0.f); - afpvec4 sum1 = afpvec4(0.f); - afpvec4 sum2 = afpvec4(0.f); - afpvec4 sum3 = afpvec4(0.f); - -#if NCNN_image_shader - int wx = gx * 4; - - for (int z = 0; z < psc(c); z++) - { - afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z)); - afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z)); - afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z)); - afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z)); - - afpmat4 k = afpmat4( - image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)), - image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz)) - ); - - sum0 += v0 * k; - sum1 += v1 * k; - sum2 += v2 * k; - sum3 += v3 * k; - } -#else - int v_offset = gy * 36 + gx; - int w_offset = gz * psc(c) * 36 + gx; - - for (int z = 0; z < psc(c); z++) - { - afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0); - afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 36); - afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 72); - afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 108); - -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - buffer_ld4(weight_tm_data, w_offset * 4 + 
0), - buffer_ld4(weight_tm_data, w_offset * 4 + 1), - buffer_ld4(weight_tm_data, w_offset * 4 + 2), - buffer_ld4(weight_tm_data, w_offset * 4 + 3) - ); -#else - afpmat4 k = sfpmat4(weight_tm_data[w_offset]); -#endif - - sum0 += v0 * k; - sum1 += v1 * k; - sum2 += v2 * k; - sum3 += v3 * k; - - v_offset += psc(cstep); - w_offset += 36; - } -#endif - -#if NCNN_image_shader - image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); - image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); - image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); - image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); -#else - int gi = gz * psc(outcstep) + gy * 36 + gx; - - buffer_st4(top_tm_blob_data, gi + 0, sum0); - if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 36, sum1); - if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 72, sum2); - if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 108, sum3); -#endif -} diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp index d64329022..ae5ce9881 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp @@ -59,7 +59,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 6x6 @@ -259,82 +259,82 @@ void main() // store 36 #if NCNN_image_shader - int y = gy * p.block_x + gx; - - image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00); - image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01); - image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02); - image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03); - image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v04); - image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v05); - 
image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v10); - image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v11); - image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v12); - image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v13); - image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v14); - image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v15); - image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v20); - image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v21); - image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v22); - image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v23); - image3d_st4(bottom_tm_blob, ivec3(16, y, gz), v24); - image3d_st4(bottom_tm_blob, ivec3(17, y, gz), v25); - image3d_st4(bottom_tm_blob, ivec3(18, y, gz), v30); - image3d_st4(bottom_tm_blob, ivec3(19, y, gz), v31); - image3d_st4(bottom_tm_blob, ivec3(20, y, gz), v32); - image3d_st4(bottom_tm_blob, ivec3(21, y, gz), v33); - image3d_st4(bottom_tm_blob, ivec3(22, y, gz), v34); - image3d_st4(bottom_tm_blob, ivec3(23, y, gz), v35); - image3d_st4(bottom_tm_blob, ivec3(24, y, gz), v40); - image3d_st4(bottom_tm_blob, ivec3(25, y, gz), v41); - image3d_st4(bottom_tm_blob, ivec3(26, y, gz), v42); - image3d_st4(bottom_tm_blob, ivec3(27, y, gz), v43); - image3d_st4(bottom_tm_blob, ivec3(28, y, gz), v44); - image3d_st4(bottom_tm_blob, ivec3(29, y, gz), v45); - image3d_st4(bottom_tm_blob, ivec3(30, y, gz), v50); - image3d_st4(bottom_tm_blob, ivec3(31, y, gz), v51); - image3d_st4(bottom_tm_blob, ivec3(32, y, gz), v52); - image3d_st4(bottom_tm_blob, ivec3(33, y, gz), v53); - image3d_st4(bottom_tm_blob, ivec3(34, y, gz), v54); - image3d_st4(bottom_tm_blob, ivec3(35, y, gz), v55); + int x = gy * psc(block_x) + gx; + + image3d_st4(bottom_tm_blob, ivec3(x, gz, 0), v00); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 1), v01); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 2), v02); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 3), v03); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 4), v04); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 5), v05); + image3d_st4(bottom_tm_blob, 
ivec3(x, gz, 6), v10); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 7), v11); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 8), v12); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 9), v13); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 10), v14); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 11), v15); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 12), v20); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 13), v21); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 14), v22); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 15), v23); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 16), v24); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 17), v25); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 18), v30); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 19), v31); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 20), v32); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 21), v33); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 22), v34); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 23), v35); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 24), v40); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 25), v41); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 26), v42); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 27), v43); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 28), v44); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 29), v45); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 30), v50); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 31), v51); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 32), v52); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 33), v53); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 34), v54); + image3d_st4(bottom_tm_blob, ivec3(x, gz, 35), v55); #else - int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 36; - - buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v04); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, 
v05); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v10); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v11); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v12); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v13); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v14); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v15); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v20); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v21); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v22); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v23); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 16, v24); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 17, v25); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 18, v30); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 19, v31); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 20, v32); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 21, v33); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 22, v34); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 23, v35); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 24, v40); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 25, v41); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 26, v42); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 27, v43); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 28, v44); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 29, v45); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 30, v50); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 31, v51); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 32, v52); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 33, v53); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 34, v54); - buffer_st4(bottom_tm_blob_data, v_tm_offset + 35, v55); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01); + 
buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v04); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v05); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v10); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v11); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v12); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v13); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v14); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v15); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v20); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v21); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v22); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v23); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 16 * psc(outcstep), v24); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 17 * psc(outcstep), v25); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 18 * psc(outcstep), v30); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 19 * psc(outcstep), v31); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 20 * psc(outcstep), v32); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 21 * psc(outcstep), v33); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 22 * psc(outcstep), v34); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 23 * psc(outcstep), v35); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 24 * psc(outcstep), v40); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 25 * psc(outcstep), v41); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 26 * psc(outcstep), v42); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 27 * psc(outcstep), v43); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 28 * psc(outcstep), v44); + 
buffer_st4(bottom_tm_blob_data, v_tm_offset + 29 * psc(outcstep), v45); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 30 * psc(outcstep), v50); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 31 * psc(outcstep), v51); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 32 * psc(outcstep), v52); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 33 * psc(outcstep), v53); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 34 * psc(outcstep), v54); + buffer_st4(bottom_tm_blob_data, v_tm_offset + 35 * psc(outcstep), v55); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp index 7161b5c98..295b9a56e 100644 --- a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp @@ -66,88 +66,88 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 36 #if NCNN_image_shader - int sy = gy * p.block_x + gx; + int sx = gy * psc(block_x) + gx; - afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz)); - afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz)); - afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz)); - afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz)); - afpvec4 v04 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz)); - afpvec4 v05 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz)); - afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz)); - afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz)); - afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz)); - afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz)); - afpvec4 v14 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz)); - afpvec4 v15 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz)); - afpvec4 v20 = 
image3d_ld4(top_tm_blob, ivec3(12, sy, gz)); - afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz)); - afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz)); - afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz)); - afpvec4 v24 = image3d_ld4(top_tm_blob, ivec3(16, sy, gz)); - afpvec4 v25 = image3d_ld4(top_tm_blob, ivec3(17, sy, gz)); - afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(18, sy, gz)); - afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(19, sy, gz)); - afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(20, sy, gz)); - afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(21, sy, gz)); - afpvec4 v34 = image3d_ld4(top_tm_blob, ivec3(22, sy, gz)); - afpvec4 v35 = image3d_ld4(top_tm_blob, ivec3(23, sy, gz)); - afpvec4 v40 = image3d_ld4(top_tm_blob, ivec3(24, sy, gz)); - afpvec4 v41 = image3d_ld4(top_tm_blob, ivec3(25, sy, gz)); - afpvec4 v42 = image3d_ld4(top_tm_blob, ivec3(26, sy, gz)); - afpvec4 v43 = image3d_ld4(top_tm_blob, ivec3(27, sy, gz)); - afpvec4 v44 = image3d_ld4(top_tm_blob, ivec3(28, sy, gz)); - afpvec4 v45 = image3d_ld4(top_tm_blob, ivec3(29, sy, gz)); - afpvec4 v50 = image3d_ld4(top_tm_blob, ivec3(30, sy, gz)); - afpvec4 v51 = image3d_ld4(top_tm_blob, ivec3(31, sy, gz)); - afpvec4 v52 = image3d_ld4(top_tm_blob, ivec3(32, sy, gz)); - afpvec4 v53 = image3d_ld4(top_tm_blob, ivec3(33, sy, gz)); - afpvec4 v54 = image3d_ld4(top_tm_blob, ivec3(34, sy, gz)); - afpvec4 v55 = image3d_ld4(top_tm_blob, ivec3(35, sy, gz)); + afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 0)); + afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 1)); + afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 2)); + afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 3)); + afpvec4 v04 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 4)); + afpvec4 v05 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 5)); + afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 6)); + afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 7)); + afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 8)); + afpvec4 
v13 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 9)); + afpvec4 v14 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 10)); + afpvec4 v15 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 11)); + afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 12)); + afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 13)); + afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 14)); + afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 15)); + afpvec4 v24 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 16)); + afpvec4 v25 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 17)); + afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 18)); + afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 19)); + afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 20)); + afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 21)); + afpvec4 v34 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 22)); + afpvec4 v35 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 23)); + afpvec4 v40 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 24)); + afpvec4 v41 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 25)); + afpvec4 v42 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 26)); + afpvec4 v43 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 27)); + afpvec4 v44 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 28)); + afpvec4 v45 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 29)); + afpvec4 v50 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 30)); + afpvec4 v51 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 31)); + afpvec4 v52 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 32)); + afpvec4 v53 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 33)); + afpvec4 v54 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 34)); + afpvec4 v55 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 35)); #else - int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 36; + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; - afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0); - afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1); - afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2); - afpvec4 v03 = 
buffer_ld4(top_tm_blob_data, v_tm_offset + 3); - afpvec4 v04 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4); - afpvec4 v05 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5); - afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6); - afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7); - afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8); - afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9); - afpvec4 v14 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10); - afpvec4 v15 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11); - afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12); - afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13); - afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14); - afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15); - afpvec4 v24 = buffer_ld4(top_tm_blob_data, v_tm_offset + 16); - afpvec4 v25 = buffer_ld4(top_tm_blob_data, v_tm_offset + 17); - afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 18); - afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 19); - afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 20); - afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 21); - afpvec4 v34 = buffer_ld4(top_tm_blob_data, v_tm_offset + 22); - afpvec4 v35 = buffer_ld4(top_tm_blob_data, v_tm_offset + 23); - afpvec4 v40 = buffer_ld4(top_tm_blob_data, v_tm_offset + 24); - afpvec4 v41 = buffer_ld4(top_tm_blob_data, v_tm_offset + 25); - afpvec4 v42 = buffer_ld4(top_tm_blob_data, v_tm_offset + 26); - afpvec4 v43 = buffer_ld4(top_tm_blob_data, v_tm_offset + 27); - afpvec4 v44 = buffer_ld4(top_tm_blob_data, v_tm_offset + 28); - afpvec4 v45 = buffer_ld4(top_tm_blob_data, v_tm_offset + 29); - afpvec4 v50 = buffer_ld4(top_tm_blob_data, v_tm_offset + 30); - afpvec4 v51 = buffer_ld4(top_tm_blob_data, v_tm_offset + 31); - afpvec4 v52 = buffer_ld4(top_tm_blob_data, v_tm_offset + 32); - afpvec4 v53 = buffer_ld4(top_tm_blob_data, v_tm_offset + 33); - afpvec4 v54 = 
buffer_ld4(top_tm_blob_data, v_tm_offset + 34); - afpvec4 v55 = buffer_ld4(top_tm_blob_data, v_tm_offset + 35); + afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0 * psc(cstep)); + afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1 * psc(cstep)); + afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2 * psc(cstep)); + afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3 * psc(cstep)); + afpvec4 v04 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4 * psc(cstep)); + afpvec4 v05 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5 * psc(cstep)); + afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6 * psc(cstep)); + afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7 * psc(cstep)); + afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8 * psc(cstep)); + afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9 * psc(cstep)); + afpvec4 v14 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10 * psc(cstep)); + afpvec4 v15 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11 * psc(cstep)); + afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12 * psc(cstep)); + afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13 * psc(cstep)); + afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14 * psc(cstep)); + afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15 * psc(cstep)); + afpvec4 v24 = buffer_ld4(top_tm_blob_data, v_tm_offset + 16 * psc(cstep)); + afpvec4 v25 = buffer_ld4(top_tm_blob_data, v_tm_offset + 17 * psc(cstep)); + afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 18 * psc(cstep)); + afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 19 * psc(cstep)); + afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 20 * psc(cstep)); + afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 21 * psc(cstep)); + afpvec4 v34 = buffer_ld4(top_tm_blob_data, v_tm_offset + 22 * psc(cstep)); + afpvec4 v35 = buffer_ld4(top_tm_blob_data, v_tm_offset + 23 * psc(cstep)); + afpvec4 v40 = buffer_ld4(top_tm_blob_data, 
v_tm_offset + 24 * psc(cstep)); + afpvec4 v41 = buffer_ld4(top_tm_blob_data, v_tm_offset + 25 * psc(cstep)); + afpvec4 v42 = buffer_ld4(top_tm_blob_data, v_tm_offset + 26 * psc(cstep)); + afpvec4 v43 = buffer_ld4(top_tm_blob_data, v_tm_offset + 27 * psc(cstep)); + afpvec4 v44 = buffer_ld4(top_tm_blob_data, v_tm_offset + 28 * psc(cstep)); + afpvec4 v45 = buffer_ld4(top_tm_blob_data, v_tm_offset + 29 * psc(cstep)); + afpvec4 v50 = buffer_ld4(top_tm_blob_data, v_tm_offset + 30 * psc(cstep)); + afpvec4 v51 = buffer_ld4(top_tm_blob_data, v_tm_offset + 31 * psc(cstep)); + afpvec4 v52 = buffer_ld4(top_tm_blob_data, v_tm_offset + 32 * psc(cstep)); + afpvec4 v53 = buffer_ld4(top_tm_blob_data, v_tm_offset + 33 * psc(cstep)); + afpvec4 v54 = buffer_ld4(top_tm_blob_data, v_tm_offset + 34 * psc(cstep)); + afpvec4 v55 = buffer_ld4(top_tm_blob_data, v_tm_offset + 35 * psc(cstep)); #endif // const float otm[4][6] = { diff --git a/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp new file mode 100644 index 000000000..1f70b2fdb --- /dev/null +++ b/src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp @@ -0,0 +1,238 @@ +// Tencent is pleased to support the open source community by making ncnn available. +// +// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved. +// +// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except +// in compliance with the License. You may obtain a copy of the License at +// +// https://opensource.org/licenses/BSD-3-Clause +// +// Unless required by applicable law or agreed to in writing, software distributed +// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +// CONDITIONS OF ANY KIND, either express or implied. See the License for the +// specific language governing permissions and limitations under the License. 
+ +#version 450 + +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic +#extension GL_EXT_shader_explicit_arithmetic_types_float16: require +#endif + +#define LOCAL_MEMORY_UNROLL_INCH 8 + +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 +layout (constant_id = shape_constant_id_offset + 0) const int c = 0; +layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; + +#if NCNN_image_shader +layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; +layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; +layout (binding = 2) uniform unfp sampler3D weight_tm_blob; +#else +layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; }; +layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; }; +layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; }; +#endif + +layout (push_constant) uniform parameter +{ + int c; + int cstep; + + int outw; + int outc; + int outcstep; +} p; + +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4]; +#endif + +void main() +{ + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); + int gz = int(gl_GlobalInvocationID.z); + +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; +#endif + + afpvec4 sum0 = afpvec4(0.f); + afpvec4 sum1 = afpvec4(0.f); + afpvec4 sum2 = afpvec4(0.f); + afpvec4 sum3 = afpvec4(0.f); + +#if NCNN_image_shader + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx + 0, z, gz)); + afpvec4 v1 = 
image3d_ld4(bottom_tm_blob, ivec3(gx + 1, z, gz)); + afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx + 2, z, gz)); + afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx + 3, z, gz)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_tm_blob, ivec3(z * 4 + 0, gy, gz)), + image3d_ld4(weight_tm_blob, ivec3(z * 4 + 1, gy, gz)), + image3d_ld4(weight_tm_blob, ivec3(z * 4 + 2, gy, gz)), + image3d_ld4(weight_tm_blob, ivec3(z * 4 + 3, gy, gz)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gz * psc(cstep) + gx; + int w_offset = (gz * psc(c) * psc(outc) + gy * psc(c)) * 4; + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_tm_blob_data[v_offset + z4 * psc(outw) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_tm_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(outw); + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + 
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_tm_blob_data[v_offset + z4 * psc(outw) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_tm_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + } +#else + for (int z = 0; z < psc(c); z++) + { + afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 3); + + afpmat4 k = afpmat4( + buffer_ld4(weight_tm_data, w_offset + 0), + buffer_ld4(weight_tm_data, w_offset + 1), + buffer_ld4(weight_tm_data, w_offset + 2), + buffer_ld4(weight_tm_data, w_offset + 3) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(outw); + w_offset += 4; + } +#endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) + return; +#endif + +#if NCNN_image_shader + image3d_st4(top_tm_blob, ivec3(gx + 0, gy, gz), sum0); + image3d_st4(top_tm_blob, ivec3(gx + 1, gy, gz), sum1); + image3d_st4(top_tm_blob, ivec3(gx + 2, gy, gz), sum2); + image3d_st4(top_tm_blob, ivec3(gx + 3, gy, gz), sum3); +#else + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + + buffer_st4(top_tm_blob_data, gi + 0, sum0); + if (gx + 1 < psc(outw)) buffer_st4(top_tm_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outw)) 
buffer_st4(top_tm_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outw)) buffer_st4(top_tm_blob_data, gi + 3, sum3); +#endif +} diff --git a/src/layer/vulkan/shader/convolution_pack4_gemm.comp b/src/layer/vulkan/shader/convolution_pack4_gemm.comp index 83f23e4b2..645302042 100644 --- a/src/layer/vulkan/shader/convolution_pack4_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack4_gemm.comp @@ -21,6 +21,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif +#define LOCAL_MEMORY_UNROLL_INCH 8 + layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; layout (constant_id = 2) const int bias_term = 0; @@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) 
const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -49,38 +47,35 @@ layout (binding = 3) uniform unfp sampler3D bias_blob; #else layout (binding = 0) readonly buffer col_blob { sfpvec4 col_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) -// GL_EXT_shader_16bit_storage does not define f16mat4 type :( layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; -#else -layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; -#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; #endif layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; int outcstep; } p; +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) return; +#endif afpvec4 sum0; afpvec4 sum1; @@ -90,9 +85,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld4(bias_data, gz); + sum0 = buffer_ld4(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -111,63 +106,151 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) + { + afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0)); + afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0)); + afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0)); + afpvec4 v3 = 
image3d_ld4(col_blob, ivec3(gx4.a, z, 0)); + + afpmat4 k = afpmat4( + image3d_ld4(weight_blob, ivec3(z * 4 + 0, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 1, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 2, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 3, gy, 0)) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } +#else + int v_offset = gx; + int w_offset = gy * psc(h) * 4; + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH) { - for (int kk = 0; kk < maxk; kk++) + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) { - afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z)); - afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z)); - afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z)); - afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z)); - - afpmat4 k = afpmat4( - image3d_ld4(weight_blob, ivec3(kk * 4 + 0, z, gz)), - image3d_ld4(weight_blob, ivec3(kk * 4 + 1, z, gz)), - image3d_ld4(weight_blob, ivec3(kk * 4 + 2, z, gz)), - image3d_ld4(weight_blob, ivec3(kk * 4 + 3, z, gz)) - ); + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); sum0 += v0 * k; sum1 
+= v1 * k; sum2 += v2 * k; sum3 += v3 * k; } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w); + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); } -#else - int w_offset = gz * psc(c) * maxk; - for (int z = 0; z < psc(c); z++) + if (z < psc(h)) { - int v_offset = gx + z * psc(cstep); + const int remain = psc(h) - z; - for (int kk = 0; kk < maxk; kk++) + if (ly < 4) { - afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); - afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1); - afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); - afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); - -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( - afpmat4 k = afpmat4( - buffer_ld4(weight_data, w_offset * 4 + 0), - buffer_ld4(weight_data, w_offset * 4 + 1), - buffer_ld4(weight_data, w_offset * 4 + 2), - buffer_ld4(weight_data, w_offset * 4 + 3) - ); -#else - afpmat4 k = afpmat4(weight_data[w_offset]); -#endif + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); sum0 += v0 * k; sum1 += v1 * k; sum2 += v2 * k; sum3 += v3 * k; - - v_offset += psc(outw) * psc(outh); - w_offset += 1; } } +#else + for (int z = 0; z < psc(h); z++) + { + afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); + afpvec4 v1 = 
buffer_ld4(col_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); + + afpmat4 k = afpmat4( + buffer_ld4(weight_data, w_offset + 0), + buffer_ld4(weight_data, w_offset + 1), + buffer_ld4(weight_data, w_offset + 2), + buffer_ld4(weight_data, w_offset + 3) + ); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + + v_offset += psc(w); + w_offset += 4; + } +#endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) + return; #endif if (activation_type == 1) @@ -222,16 +305,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack4_im2col.comp b/src/layer/vulkan/shader/convolution_pack4_im2col.comp index 7d27c0c9e..342d90fcc 100644 --- a/src/layer/vulkan/shader/convolution_pack4_im2col.comp +++ b/src/layer/vulkan/shader/convolution_pack4_im2col.comp @@ -29,17 +29,13 @@ 
layout (constant_id = 4) const int stride_w = 1; layout (constant_id = 5) const int stride_h = 1; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -51,42 +47,40 @@ layout (binding = 1) writeonly buffer col_blob { sfpvec4 col_blob_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); const int maxk = kernel_w * kernel_h; - if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c)) return; - int sy = gx / psc(outw); - int sx = gx % psc(outw); + const int 
sy = gx / psc(outw); + const int sx = gx % psc(outw); + + const int sz = gy / maxk; + const int k = gy % maxk; - int ky = gy / kernel_w; - int kx = gy % kernel_w; + const int ky = k / kernel_w; + const int kx = k % kernel_w; #if NCNN_image_shader - image3d_cp4(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz)); + image3d_cp4(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz)); #else - const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; - const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx; + const int gi = gy * psc(outw) * psc(outh) + gx; buffer_cp4(col_blob_data, gi, bottom_blob_data, v_offset); #endif diff --git a/src/layer/vulkan/shader/convolution_pack4to1_gemm.comp b/src/layer/vulkan/shader/convolution_pack4to1_gemm.comp index d14034ba2..5314cc186 100644 --- a/src/layer/vulkan/shader/convolution_pack4to1_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack4to1_gemm.comp @@ -21,6 +21,8 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif +#define LOCAL_MEMORY_UNROLL_INCH 8 + layout (constant_id = 0) const int kernel_w = 1; layout (constant_id = 1) const int kernel_h = 1; layout (constant_id = 2) const int bias_term = 0; @@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = 
shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; int outcstep; } p; +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) return; +#endif afp sum0; afp sum1; @@ -85,9 +85,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld1(bias_data, gz); + sum0 = buffer_ld1(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -106,48 +106,131 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + 
ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) + { + afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0)); + afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0)); + afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0)); + afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, z, 0)); + + afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0)); + + sum0 += dot(v0, k); + sum1 += dot(v1, k); + sum2 += dot(v2, k); + sum3 += dot(v3, k); + } +#else + int v_offset = gx; + int w_offset = gy * psc(h); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH) { - for (int kk = 0; kk < maxk; kk++) + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } + + if (lx == 0) { - afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z)); - afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z)); - afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z)); - afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); - afpvec4 k = image3d_ld4(weight_blob, ivec3(kk, z, gz)); + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += dot(v0, k); sum1 += dot(v1, k); sum2 += dot(v2, k); sum3 += dot(v3, k); } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); } -#else - int w_offset = gz * psc(c) * maxk; - for (int z = 0; z < psc(c); z++) + if (z < psc(h)) { - 
int v_offset = gx + z * psc(cstep); + const int remain = psc(h) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]); + } + } - for (int kk = 0; kk < maxk; kk++) + if (lx == 0) { - afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); - afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1); - afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); - afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); - afpvec4 k = buffer_ld4(weight_data, w_offset); + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); sum0 += dot(v0, k); sum1 += dot(v1, k); sum2 += dot(v2, k); sum3 += dot(v3, k); - - v_offset += psc(outw) * psc(outh); - w_offset += 1; } } +#else + for (int z = 0; z < psc(h); z++) + { + afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); + + afpvec4 k = buffer_ld4(weight_data, w_offset); + + sum0 += dot(v0, k); + sum1 += dot(v1, k); + sum2 += dot(v2, k); + sum3 += dot(v3, k); + + v_offset += psc(w); + w_offset += 1; + } +#endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(w) || gy >= psc(outc)) + return; #endif if (activation_type == 1) @@ -202,16 +285,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + 
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st1(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack4to8_gemm.comp b/src/layer/vulkan/shader/convolution_pack4to8_gemm.comp index 15f6759fa..32f69f3b3 100644 --- a/src/layer/vulkan/shader/convolution_pack4to8_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack4to8_gemm.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = 
shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -73,9 +65,8 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(w) || gy >= psc(outc)) return; afpvec8 sum0; @@ -86,9 +77,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld8(bias_data, gz); + sum0 = buffer_ld8(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -107,125 +98,118 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - for (int kk = 0; kk < maxk; kk++) - { - afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z)); - afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z)); - afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z)); - afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z)); - - afpvec4 k0 = image3d_ld4(weight_blob, ivec3(kk * 8 + 0, z, gz)); - afpvec4 k1 = image3d_ld4(weight_blob, ivec3(kk * 8 + 1, z, gz)); - afpvec4 k2 = image3d_ld4(weight_blob, ivec3(kk * 8 + 2, z, gz)); - afpvec4 k3 = image3d_ld4(weight_blob, ivec3(kk * 8 + 3, z, 
gz)); - afpvec4 k4 = image3d_ld4(weight_blob, ivec3(kk * 8 + 4, z, gz)); - afpvec4 k5 = image3d_ld4(weight_blob, ivec3(kk * 8 + 5, z, gz)); - afpvec4 k6 = image3d_ld4(weight_blob, ivec3(kk * 8 + 6, z, gz)); - afpvec4 k7 = image3d_ld4(weight_blob, ivec3(kk * 8 + 7, z, gz)); - - // sum += v * k; - sum0[0].r += dot(v0, k0); - sum0[0].g += dot(v0, k1); - sum0[0].b += dot(v0, k2); - sum0[0].a += dot(v0, k3); - sum0[1].r += dot(v0, k4); - sum0[1].g += dot(v0, k5); - sum0[1].b += dot(v0, k6); - sum0[1].a += dot(v0, k7); - - sum1[0].r += dot(v1, k0); - sum1[0].g += dot(v1, k1); - sum1[0].b += dot(v1, k2); - sum1[0].a += dot(v1, k3); - sum1[1].r += dot(v1, k4); - sum1[1].g += dot(v1, k5); - sum1[1].b += dot(v1, k6); - sum1[1].a += dot(v1, k7); - - sum2[0].r += dot(v2, k0); - sum2[0].g += dot(v2, k1); - sum2[0].b += dot(v2, k2); - sum2[0].a += dot(v2, k3); - sum2[1].r += dot(v2, k4); - sum2[1].g += dot(v2, k5); - sum2[1].b += dot(v2, k6); - sum2[1].a += dot(v2, k7); - - sum3[0].r += dot(v3, k0); - sum3[0].g += dot(v3, k1); - sum3[0].b += dot(v3, k2); - sum3[0].a += dot(v3, k3); - sum3[1].r += dot(v3, k4); - sum3[1].g += dot(v3, k5); - sum3[1].b += dot(v3, k6); - sum3[1].a += dot(v3, k7); - } + afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0)); + afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0)); + afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0)); + afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, z, 0)); + + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(z * 8 + 0, gy, 0)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(z * 8 + 1, gy, 0)); + afpvec4 k2 = image3d_ld4(weight_blob, ivec3(z * 8 + 2, gy, 0)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(z * 8 + 3, gy, 0)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(z * 8 + 4, gy, 0)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(z * 8 + 5, gy, 0)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(z * 8 + 6, gy, 0)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(z * 8 + 7, gy, 0)); + + // sum += v * k; + sum0[0].r += 
dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v2, k0); + sum2[0].g += dot(v2, k1); + sum2[0].b += dot(v2, k2); + sum2[0].a += dot(v2, k3); + sum2[1].r += dot(v2, k4); + sum2[1].g += dot(v2, k5); + sum2[1].b += dot(v2, k6); + sum2[1].a += dot(v2, k7); + + sum3[0].r += dot(v3, k0); + sum3[0].g += dot(v3, k1); + sum3[0].b += dot(v3, k2); + sum3[0].a += dot(v3, k3); + sum3[1].r += dot(v3, k4); + sum3[1].g += dot(v3, k5); + sum3[1].b += dot(v3, k6); + sum3[1].a += dot(v3, k7); } #else - int w_offset = gz * psc(c) * maxk; + int v_offset = gx; + int w_offset = gy * psc(h) * 8; - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - int v_offset = gx + z * psc(cstep); - - for (int kk = 0; kk < maxk; kk++) - { - afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); - afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1); - afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); - afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); - - afpvec4 k0 = buffer_ld4(weight_data, w_offset * 8 + 0); - afpvec4 k1 = buffer_ld4(weight_data, w_offset * 8 + 1); - afpvec4 k2 = buffer_ld4(weight_data, w_offset * 8 + 2); - afpvec4 k3 = buffer_ld4(weight_data, w_offset * 8 + 3); - afpvec4 k4 = buffer_ld4(weight_data, w_offset * 8 + 4); - afpvec4 k5 = buffer_ld4(weight_data, w_offset * 8 + 5); - afpvec4 k6 = buffer_ld4(weight_data, w_offset * 8 + 6); - afpvec4 k7 = buffer_ld4(weight_data, w_offset * 8 + 7); - - // sum += v * k; - sum0[0].r += dot(v0, k0); - sum0[0].g += dot(v0, k1); - sum0[0].b += dot(v0, k2); - sum0[0].a += dot(v0, k3); - sum0[1].r += dot(v0, k4); - 
sum0[1].g += dot(v0, k5); - sum0[1].b += dot(v0, k6); - sum0[1].a += dot(v0, k7); - - sum1[0].r += dot(v1, k0); - sum1[0].g += dot(v1, k1); - sum1[0].b += dot(v1, k2); - sum1[0].a += dot(v1, k3); - sum1[1].r += dot(v1, k4); - sum1[1].g += dot(v1, k5); - sum1[1].b += dot(v1, k6); - sum1[1].a += dot(v1, k7); - - sum2[0].r += dot(v2, k0); - sum2[0].g += dot(v2, k1); - sum2[0].b += dot(v2, k2); - sum2[0].a += dot(v2, k3); - sum2[1].r += dot(v2, k4); - sum2[1].g += dot(v2, k5); - sum2[1].b += dot(v2, k6); - sum2[1].a += dot(v2, k7); - - sum3[0].r += dot(v3, k0); - sum3[0].g += dot(v3, k1); - sum3[0].b += dot(v3, k2); - sum3[0].a += dot(v3, k3); - sum3[1].r += dot(v3, k4); - sum3[1].g += dot(v3, k5); - sum3[1].b += dot(v3, k6); - sum3[1].a += dot(v3, k7); - - v_offset += psc(outw) * psc(outh); - w_offset += 1; - } + afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0); + afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1); + afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2); + afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3); + + afpvec4 k0 = buffer_ld4(weight_data, w_offset + 0); + afpvec4 k1 = buffer_ld4(weight_data, w_offset + 1); + afpvec4 k2 = buffer_ld4(weight_data, w_offset + 2); + afpvec4 k3 = buffer_ld4(weight_data, w_offset + 3); + afpvec4 k4 = buffer_ld4(weight_data, w_offset + 4); + afpvec4 k5 = buffer_ld4(weight_data, w_offset + 5); + afpvec4 k6 = buffer_ld4(weight_data, w_offset + 6); + afpvec4 k7 = buffer_ld4(weight_data, w_offset + 7); + + // sum += v * k; + sum0[0].r += dot(v0, k0); + sum0[0].g += dot(v0, k1); + sum0[0].b += dot(v0, k2); + sum0[0].a += dot(v0, k3); + sum0[1].r += dot(v0, k4); + sum0[1].g += dot(v0, k5); + sum0[1].b += dot(v0, k6); + sum0[1].a += dot(v0, k7); + + sum1[0].r += dot(v1, k0); + sum1[0].g += dot(v1, k1); + sum1[0].b += dot(v1, k2); + sum1[0].a += dot(v1, k3); + sum1[1].r += dot(v1, k4); + sum1[1].g += dot(v1, k5); + sum1[1].b += dot(v1, k6); + sum1[1].a += dot(v1, k7); + + sum2[0].r += dot(v2, k0); + sum2[0].g += 
dot(v2, k1); + sum2[0].b += dot(v2, k2); + sum2[0].a += dot(v2, k3); + sum2[1].r += dot(v2, k4); + sum2[1].g += dot(v2, k5); + sum2[1].b += dot(v2, k6); + sum2[1].a += dot(v2, k7); + + sum3[0].r += dot(v3, k0); + sum3[0].g += dot(v3, k1); + sum3[0].b += dot(v3, k2); + sum3[0].a += dot(v3, k3); + sum3[1].r += dot(v3, k4); + sum3[1].g += dot(v3, k5); + sum3[1].b += dot(v3, k6); + sum3[1].a += dot(v3, k7); + + v_offset += psc(w); + w_offset += 8; } #endif @@ -305,16 +289,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp deleted file mode 100644 index e5f619fd3..000000000 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp +++ /dev/null @@ -1,198 +0,0 @@ -// Tencent is pleased to support the open source community by making ncnn available. 
-// -// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved. -// -// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except -// in compliance with the License. You may obtain a copy of the License at -// -// https://opensource.org/licenses/BSD-3-Clause -// -// Unless required by applicable law or agreed to in writing, software distributed -// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR -// CONDITIONS OF ANY KIND, either express or implied. See the License for the -// specific language governing permissions and limitations under the License. - -#version 450 - -#if NCNN_fp16_storage -#extension GL_EXT_shader_16bit_storage: require -struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; -#endif -#if NCNN_fp16_arithmetic -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#endif - -#define shape_constant_id_offset 0 -layout (constant_id = shape_constant_id_offset + 0) const int c = 0; -layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; - -#if NCNN_image_shader -layout (binding = 0) uniform unfp sampler3D bottom_tm_blob; -layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob; -layout (binding = 2) uniform unfp sampler3D weight_tm_blob; -#else -layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; }; -layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; }; -layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; }; -#endif - -layout (push_constant) uniform parameter -{ - int c; - int cstep; - - int outh; - int outc; - int outcstep; -} p; - -void main() -{ - int gx = int(gl_GlobalInvocationID.x); - int gy = 
int(gl_GlobalInvocationID.y) * 4; - int gz = int(gl_GlobalInvocationID.z); - - if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc)) - return; - - afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); - afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); - afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); - afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); - -#if NCNN_image_shader - int wx = gx * 8; - - for (int z = 0; z < psc(c); z++) - { - afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); - afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); - afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); - afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); - - afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); - afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); - afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); - afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); - afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); - afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); - afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); - - // sum += v * k - sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); - sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); - sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); - sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); - sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); - - sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); - sum1[1].g += dot(v1[0], k5[0]) + 
dot(v1[1], k5[1]); - sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); - sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); - - sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); - sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); - sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); - sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); - sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); - - sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); - sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); - sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); - sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); - } -#else - int v_offset = gy * 16 + gx; - int w_offset = (gz * psc(c) * 16 + gx) * 8; - - for (int z = 0; z < psc(c); z++) - { - afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0); - afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 16); - afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 32); - afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 48); - - afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0); - afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1); - afpvec8 k2 = buffer_ld8(weight_tm_data, w_offset + 2); - afpvec8 k3 = buffer_ld8(weight_tm_data, w_offset + 3); - afpvec8 k4 = buffer_ld8(weight_tm_data, w_offset + 4); - afpvec8 k5 = buffer_ld8(weight_tm_data, w_offset + 5); - afpvec8 k6 = buffer_ld8(weight_tm_data, w_offset + 6); - afpvec8 k7 = buffer_ld8(weight_tm_data, w_offset + 7); - - // sum += v * k - sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0[0].b += dot(v0[0], k2[0]) + 
dot(v0[1], k2[1]); - sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); - sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); - sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); - sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); - - sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); - sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); - sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); - sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); - - sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); - sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); - sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); - sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); - sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); - - sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); - sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); - sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); - sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); - - v_offset += psc(cstep); - w_offset += 16 * 8; - } -#endif - -#if NCNN_image_shader - image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); - image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); - image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); - image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); -#else - int gi = gz * psc(outcstep) + gy * 16 + gx; - - buffer_st8(top_tm_blob_data, gi + 0, 
sum0); - if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1); - if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2); - if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3); -#endif -} diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp index 23b89c572..cf47d370c 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp @@ -60,7 +60,7 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 4x4 @@ -162,42 +162,42 @@ void main() // store 16 #if NCNN_image_shader - int y = gy * p.block_x + gx; - - image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); - image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); - image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); - image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); - image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10); - image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11); - image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12); - image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13); - image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20); - image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21); - image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22); - image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23); - image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30); - image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31); - image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32); - image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33); + int x = gy * psc(block_x) + gx; + + image3d_st8(bottom_tm_blob, ivec3(x, gz, 0), v00); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 1), v01); + image3d_st8(bottom_tm_blob, 
ivec3(x, gz, 2), v02); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 3), v03); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 4), v10); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 5), v11); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 6), v12); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 7), v13); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 8), v20); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 9), v21); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 10), v22); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 11), v23); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 12), v30); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 13), v31); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 14), v32); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 15), v33); #else - int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16; - - buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v10); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v11); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v12); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v13); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v20); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v21); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v22); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v23); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v30); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + buffer_st8(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 2 * 
psc(outcstep), v02); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v10); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v11); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v12); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v13); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v20); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v21); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v22); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v23); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v30); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v31); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v32); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v33); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp index b01c15a4f..12be96f3f 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp @@ -67,48 +67,48 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c)) + if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c)) return; // load 16 #if NCNN_image_shader - int sy = gy * p.block_x + gx; - - afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); - afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); - afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz)); - afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); - afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); - afpvec8 v11 
= image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); - afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); - afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); - afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); - afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); - afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); - afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); - afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); - afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); - afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); - afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); + int sx = gy * psc(block_x) + gx; + + afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 0)); + afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 1)); + afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 2)); + afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 3)); + afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 4)); + afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 5)); + afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 6)); + afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 7)); + afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 8)); + afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 9)); + afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 10)); + afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 11)); + afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 12)); + afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 13)); + afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 14)); + afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 15)); #else - int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16; - - afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); - afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1); - afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2); - afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3); - afpvec8 v10 = 
buffer_ld8(top_tm_blob_data, v_tm_offset + 4); - afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5); - afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6); - afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7); - afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8); - afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9); - afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10); - afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11); - afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12); - afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); - afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); - afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0 * psc(cstep)); + afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1 * psc(cstep)); + afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2 * psc(cstep)); + afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3 * psc(cstep)); + afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4 * psc(cstep)); + afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5 * psc(cstep)); + afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6 * psc(cstep)); + afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7 * psc(cstep)); + afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8 * psc(cstep)); + afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9 * psc(cstep)); + afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10 * psc(cstep)); + afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11 * psc(cstep)); + afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12 * psc(cstep)); + afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13 * psc(cstep)); + afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14 * psc(cstep)); + afpvec8 v33 = 
buffer_ld8(top_tm_blob_data, v_tm_offset + 15 * psc(cstep)); #endif // const float itm[2][4] = { diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp index 728583702..0898f0c63 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp @@ -260,82 +260,82 @@ void main() // store 36 #if NCNN_image_shader - int y = gy * p.block_x + gx; - - image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00); - image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01); - image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02); - image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03); - image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v04); - image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v05); - image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v10); - image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v11); - image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v12); - image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v13); - image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v14); - image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v15); - image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v20); - image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v21); - image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v22); - image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v23); - image3d_st8(bottom_tm_blob, ivec3(16, y, gz), v24); - image3d_st8(bottom_tm_blob, ivec3(17, y, gz), v25); - image3d_st8(bottom_tm_blob, ivec3(18, y, gz), v30); - image3d_st8(bottom_tm_blob, ivec3(19, y, gz), v31); - image3d_st8(bottom_tm_blob, ivec3(20, y, gz), v32); - image3d_st8(bottom_tm_blob, ivec3(21, y, gz), v33); - image3d_st8(bottom_tm_blob, ivec3(22, y, gz), v34); - image3d_st8(bottom_tm_blob, ivec3(23, y, gz), v35); - image3d_st8(bottom_tm_blob, ivec3(24, y, gz), v40); - image3d_st8(bottom_tm_blob, ivec3(25, y, gz), v41); - 
image3d_st8(bottom_tm_blob, ivec3(26, y, gz), v42); - image3d_st8(bottom_tm_blob, ivec3(27, y, gz), v43); - image3d_st8(bottom_tm_blob, ivec3(28, y, gz), v44); - image3d_st8(bottom_tm_blob, ivec3(29, y, gz), v45); - image3d_st8(bottom_tm_blob, ivec3(30, y, gz), v50); - image3d_st8(bottom_tm_blob, ivec3(31, y, gz), v51); - image3d_st8(bottom_tm_blob, ivec3(32, y, gz), v52); - image3d_st8(bottom_tm_blob, ivec3(33, y, gz), v53); - image3d_st8(bottom_tm_blob, ivec3(34, y, gz), v54); - image3d_st8(bottom_tm_blob, ivec3(35, y, gz), v55); + int x = gy * psc(block_x) + gx; + + image3d_st8(bottom_tm_blob, ivec3(x, gz, 0), v00); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 1), v01); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 2), v02); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 3), v03); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 4), v04); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 5), v05); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 6), v10); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 7), v11); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 8), v12); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 9), v13); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 10), v14); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 11), v15); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 12), v20); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 13), v21); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 14), v22); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 15), v23); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 16), v24); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 17), v25); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 18), v30); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 19), v31); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 20), v32); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 21), v33); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 22), v34); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 23), v35); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 24), v40); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 25), v41); + image3d_st8(bottom_tm_blob, 
ivec3(x, gz, 26), v42); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 27), v43); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 28), v44); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 29), v45); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 30), v50); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 31), v51); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 32), v52); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 33), v53); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 34), v54); + image3d_st8(bottom_tm_blob, ivec3(x, gz, 35), v55); #else - int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 36; - - buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v04); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v05); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v10); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v11); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v12); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v13); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v14); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v15); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v20); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v21); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v22); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v23); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 16, v24); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 17, v25); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 18, v30); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 19, v31); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 20, v32); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 21, v33); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 22, v34); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 23, v35); - 
buffer_st8(bottom_tm_blob_data, v_tm_offset + 24, v40); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 25, v41); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 26, v42); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 27, v43); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 28, v44); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 29, v45); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 30, v50); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 31, v51); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 32, v52); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 33, v53); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 34, v54); - buffer_st8(bottom_tm_blob_data, v_tm_offset + 35, v55); + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; + + buffer_st8(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v04); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v05); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v10); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v11); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v12); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v13); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v14); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v15); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v20); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v21); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v22); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v23); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 16 
* psc(outcstep), v24); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 17 * psc(outcstep), v25); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 18 * psc(outcstep), v30); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 19 * psc(outcstep), v31); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 20 * psc(outcstep), v32); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 21 * psc(outcstep), v33); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 22 * psc(outcstep), v34); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 23 * psc(outcstep), v35); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 24 * psc(outcstep), v40); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 25 * psc(outcstep), v41); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 26 * psc(outcstep), v42); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 27 * psc(outcstep), v43); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 28 * psc(outcstep), v44); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 29 * psc(outcstep), v45); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 30 * psc(outcstep), v50); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 31 * psc(outcstep), v51); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 32 * psc(outcstep), v52); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 33 * psc(outcstep), v53); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 34 * psc(outcstep), v54); + buffer_st8(bottom_tm_blob_data, v_tm_offset + 35 * psc(outcstep), v55); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp index d96d51e83..e12f03ade 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp @@ -72,83 +72,83 @@ void main() // load 36 #if NCNN_image_shader - int sy = gy * p.block_x + gx; + int sx = gy * psc(block_x) + gx; - afpvec8 v00 = 
image3d_ld8(top_tm_blob, ivec3(0, sy, gz)); - afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz)); - afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz)); - afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz)); - afpvec8 v04 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz)); - afpvec8 v05 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz)); - afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz)); - afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz)); - afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz)); - afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz)); - afpvec8 v14 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz)); - afpvec8 v15 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz)); - afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz)); - afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz)); - afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz)); - afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz)); - afpvec8 v24 = image3d_ld8(top_tm_blob, ivec3(16, sy, gz)); - afpvec8 v25 = image3d_ld8(top_tm_blob, ivec3(17, sy, gz)); - afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(18, sy, gz)); - afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(19, sy, gz)); - afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(20, sy, gz)); - afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(21, sy, gz)); - afpvec8 v34 = image3d_ld8(top_tm_blob, ivec3(22, sy, gz)); - afpvec8 v35 = image3d_ld8(top_tm_blob, ivec3(23, sy, gz)); - afpvec8 v40 = image3d_ld8(top_tm_blob, ivec3(24, sy, gz)); - afpvec8 v41 = image3d_ld8(top_tm_blob, ivec3(25, sy, gz)); - afpvec8 v42 = image3d_ld8(top_tm_blob, ivec3(26, sy, gz)); - afpvec8 v43 = image3d_ld8(top_tm_blob, ivec3(27, sy, gz)); - afpvec8 v44 = image3d_ld8(top_tm_blob, ivec3(28, sy, gz)); - afpvec8 v45 = image3d_ld8(top_tm_blob, ivec3(29, sy, gz)); - afpvec8 v50 = image3d_ld8(top_tm_blob, ivec3(30, sy, gz)); - afpvec8 v51 = image3d_ld8(top_tm_blob, ivec3(31, sy, gz)); - afpvec8 v52 = image3d_ld8(top_tm_blob, ivec3(32, sy, gz)); - afpvec8 
v53 = image3d_ld8(top_tm_blob, ivec3(33, sy, gz)); - afpvec8 v54 = image3d_ld8(top_tm_blob, ivec3(34, sy, gz)); - afpvec8 v55 = image3d_ld8(top_tm_blob, ivec3(35, sy, gz)); + afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 0)); + afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 1)); + afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 2)); + afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 3)); + afpvec8 v04 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 4)); + afpvec8 v05 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 5)); + afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 6)); + afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 7)); + afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 8)); + afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 9)); + afpvec8 v14 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 10)); + afpvec8 v15 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 11)); + afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 12)); + afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 13)); + afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 14)); + afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 15)); + afpvec8 v24 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 16)); + afpvec8 v25 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 17)); + afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 18)); + afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 19)); + afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 20)); + afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 21)); + afpvec8 v34 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 22)); + afpvec8 v35 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 23)); + afpvec8 v40 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 24)); + afpvec8 v41 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 25)); + afpvec8 v42 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 26)); + afpvec8 v43 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 27)); + afpvec8 v44 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 28)); + afpvec8 v45 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 29)); + 
afpvec8 v50 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 30)); + afpvec8 v51 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 31)); + afpvec8 v52 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 32)); + afpvec8 v53 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 33)); + afpvec8 v54 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 34)); + afpvec8 v55 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 35)); #else - int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 36; + int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx; - afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0); - afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1); - afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2); - afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3); - afpvec8 v04 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4); - afpvec8 v05 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5); - afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6); - afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7); - afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8); - afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9); - afpvec8 v14 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10); - afpvec8 v15 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11); - afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12); - afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13); - afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14); - afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15); - afpvec8 v24 = buffer_ld8(top_tm_blob_data, v_tm_offset + 16); - afpvec8 v25 = buffer_ld8(top_tm_blob_data, v_tm_offset + 17); - afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 18); - afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 19); - afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 20); - afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 21); - afpvec8 v34 = buffer_ld8(top_tm_blob_data, v_tm_offset + 22); - afpvec8 v35 = 
buffer_ld8(top_tm_blob_data, v_tm_offset + 23); - afpvec8 v40 = buffer_ld8(top_tm_blob_data, v_tm_offset + 24); - afpvec8 v41 = buffer_ld8(top_tm_blob_data, v_tm_offset + 25); - afpvec8 v42 = buffer_ld8(top_tm_blob_data, v_tm_offset + 26); - afpvec8 v43 = buffer_ld8(top_tm_blob_data, v_tm_offset + 27); - afpvec8 v44 = buffer_ld8(top_tm_blob_data, v_tm_offset + 28); - afpvec8 v45 = buffer_ld8(top_tm_blob_data, v_tm_offset + 29); - afpvec8 v50 = buffer_ld8(top_tm_blob_data, v_tm_offset + 30); - afpvec8 v51 = buffer_ld8(top_tm_blob_data, v_tm_offset + 31); - afpvec8 v52 = buffer_ld8(top_tm_blob_data, v_tm_offset + 32); - afpvec8 v53 = buffer_ld8(top_tm_blob_data, v_tm_offset + 33); - afpvec8 v54 = buffer_ld8(top_tm_blob_data, v_tm_offset + 34); - afpvec8 v55 = buffer_ld8(top_tm_blob_data, v_tm_offset + 35); + afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0 * psc(cstep)); + afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1 * psc(cstep)); + afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2 * psc(cstep)); + afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3 * psc(cstep)); + afpvec8 v04 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4 * psc(cstep)); + afpvec8 v05 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5 * psc(cstep)); + afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6 * psc(cstep)); + afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7 * psc(cstep)); + afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8 * psc(cstep)); + afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9 * psc(cstep)); + afpvec8 v14 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10 * psc(cstep)); + afpvec8 v15 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11 * psc(cstep)); + afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12 * psc(cstep)); + afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13 * psc(cstep)); + afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14 * psc(cstep)); + afpvec8 v23 = 
buffer_ld8(top_tm_blob_data, v_tm_offset + 15 * psc(cstep)); + afpvec8 v24 = buffer_ld8(top_tm_blob_data, v_tm_offset + 16 * psc(cstep)); + afpvec8 v25 = buffer_ld8(top_tm_blob_data, v_tm_offset + 17 * psc(cstep)); + afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 18 * psc(cstep)); + afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 19 * psc(cstep)); + afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 20 * psc(cstep)); + afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 21 * psc(cstep)); + afpvec8 v34 = buffer_ld8(top_tm_blob_data, v_tm_offset + 22 * psc(cstep)); + afpvec8 v35 = buffer_ld8(top_tm_blob_data, v_tm_offset + 23 * psc(cstep)); + afpvec8 v40 = buffer_ld8(top_tm_blob_data, v_tm_offset + 24 * psc(cstep)); + afpvec8 v41 = buffer_ld8(top_tm_blob_data, v_tm_offset + 25 * psc(cstep)); + afpvec8 v42 = buffer_ld8(top_tm_blob_data, v_tm_offset + 26 * psc(cstep)); + afpvec8 v43 = buffer_ld8(top_tm_blob_data, v_tm_offset + 27 * psc(cstep)); + afpvec8 v44 = buffer_ld8(top_tm_blob_data, v_tm_offset + 28 * psc(cstep)); + afpvec8 v45 = buffer_ld8(top_tm_blob_data, v_tm_offset + 29 * psc(cstep)); + afpvec8 v50 = buffer_ld8(top_tm_blob_data, v_tm_offset + 30 * psc(cstep)); + afpvec8 v51 = buffer_ld8(top_tm_blob_data, v_tm_offset + 31 * psc(cstep)); + afpvec8 v52 = buffer_ld8(top_tm_blob_data, v_tm_offset + 32 * psc(cstep)); + afpvec8 v53 = buffer_ld8(top_tm_blob_data, v_tm_offset + 33 * psc(cstep)); + afpvec8 v54 = buffer_ld8(top_tm_blob_data, v_tm_offset + 34 * psc(cstep)); + afpvec8 v55 = buffer_ld8(top_tm_blob_data, v_tm_offset + 35 * psc(cstep)); #endif // const float otm[4][6] = { diff --git a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_gemm.comp b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd_gemm.comp similarity index 79% rename from src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_gemm.comp rename to src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd_gemm.comp index 
b55792a32..b13f33f31 100644 --- a/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd_gemm.comp @@ -22,11 +22,13 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -#define shape_constant_id_offset 0 +layout (constant_id = 0) const int batch = 1; + +#define shape_constant_id_offset 1 layout (constant_id = shape_constant_id_offset + 0) const int c = 0; layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0; -layout (constant_id = shape_constant_id_offset + 2) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; layout (constant_id = shape_constant_id_offset + 3) const int outc = 0; layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0; @@ -45,18 +47,18 @@ layout (push_constant) uniform parameter int c; int cstep; - int outh; + int outw; int outc; int outcstep; } p; void main() { - int gx = int(gl_GlobalInvocationID.x); - int gy = int(gl_GlobalInvocationID.y) * 4; + int gx = int(gl_GlobalInvocationID.x) * 4; + int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= 36 || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch) return; afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); @@ -65,23 +67,21 @@ void main() afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f)); #if NCNN_image_shader - int wx = gx * 8; - for (int z = 0; z < psc(c); z++) { - afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z)); - afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z)); - afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z)); - afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z)); - - afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz)); - afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz)); - afpvec8 k2 = 
image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz)); - afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz)); - afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz)); - afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz)); - afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz)); + afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx + 0, z, gz)); + afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx + 1, z, gz)); + afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx + 2, z, gz)); + afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx + 3, z, gz)); + + afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 0, gy, gz)); + afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 1, gy, gz)); + afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 2, gy, gz)); + afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 3, gy, gz)); + afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 4, gy, gz)); + afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 5, gy, gz)); + afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 6, gy, gz)); + afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 7, gy, gz)); // sum += v * k sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -121,15 +121,15 @@ void main() sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); } #else - int v_offset = gy * 36 + gx; - int w_offset = (gz * psc(c) * 36 + gx) * 8; + int v_offset = gz * psc(cstep) + gx; + int w_offset = (gz * psc(c) * psc(outc) + gy * psc(c)) * 8; for (int z = 0; z < psc(c); z++) { afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0); - afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 36); - afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 72); - afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 108); + afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 2); + afpvec8 v3 = 
buffer_ld8(bottom_tm_blob_data, v_offset + 3); afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0); afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1); @@ -177,22 +177,22 @@ void main() sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); - v_offset += psc(cstep); - w_offset += 36 * 8; + v_offset += psc(outw); + w_offset += 8; } #endif #if NCNN_image_shader - image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0); - image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1); - image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2); - image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3); + image3d_st8(top_tm_blob, ivec3(gx + 0, gy, gz), sum0); + image3d_st8(top_tm_blob, ivec3(gx + 1, gy, gz), sum1); + image3d_st8(top_tm_blob, ivec3(gx + 2, gy, gz), sum2); + image3d_st8(top_tm_blob, ivec3(gx + 3, gy, gz), sum3); #else - int gi = gz * psc(outcstep) + gy * 36 + gx; + int gi = gz * psc(outcstep) + gy * psc(outw) + gx; buffer_st8(top_tm_blob_data, gi + 0, sum0); - if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 36, sum1); - if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 72, sum2); - if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 108, sum3); + if (gx + 1 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 1, sum1); + if (gx + 2 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 2, sum2); + if (gx + 3 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_gemm.comp b/src/layer/vulkan/shader/convolution_pack8_gemm.comp index 48fa00f70..c2a0e8f03 100644 --- a/src/layer/vulkan/shader/convolution_pack8_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack8_gemm.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = 
shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -73,9 +65,8 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(w) || gy >= psc(outc)) return; afpvec8 sum0; @@ -86,9 +77,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld8(bias_data, gz); + sum0 = buffer_ld8(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -107,125 +98,118 @@ void main() #if NCNN_image_shader 
ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z)); - afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z)); - afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z)); - afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z)); - - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(kk * 8 + 0, z, gz)); - afpvec8 k1 = image3d_ld8(weight_blob, ivec3(kk * 8 + 1, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(kk * 8 + 2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_blob, ivec3(kk * 8 + 3, z, gz)); - afpvec8 k4 = image3d_ld8(weight_blob, ivec3(kk * 8 + 4, z, gz)); - afpvec8 k5 = image3d_ld8(weight_blob, ivec3(kk * 8 + 5, z, gz)); - afpvec8 k6 = image3d_ld8(weight_blob, ivec3(kk * 8 + 6, z, gz)); - afpvec8 k7 = image3d_ld8(weight_blob, ivec3(kk * 8 + 7, z, gz)); - - // sum += v * k; - sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); - sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); - sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); - sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); - sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); - - sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); - sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); - sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); - sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); - - sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2[0].a += dot(v2[0], 
k3[0]) + dot(v2[1], k3[1]); - sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); - sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); - sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); - sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); - - sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); - sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); - sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); - sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); - } + afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0)); + afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0)); + afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0)); + afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 8 + 0, gy, 0)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 8 + 1, gy, 0)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 8 + 2, gy, 0)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 8 + 3, gy, 0)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(z * 8 + 4, gy, 0)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(z * 8 + 5, gy, 0)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(z * 8 + 6, gy, 0)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(z * 8 + 7, gy, 0)); + + // sum += v * k; + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], 
k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); } #else - int w_offset = gz * psc(c) * maxk; + int v_offset = gx; + int w_offset = gy * psc(h) * 8; - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - int v_offset = gx + z * psc(cstep); - - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0); - afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); - afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); - afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); - - afpvec8 k0 = buffer_ld8(weight_data, w_offset * 8 + 0); - afpvec8 k1 = buffer_ld8(weight_data, w_offset * 8 + 1); - afpvec8 k2 = buffer_ld8(weight_data, w_offset * 8 + 2); - afpvec8 k3 = buffer_ld8(weight_data, w_offset * 8 + 3); - afpvec8 k4 = buffer_ld8(weight_data, w_offset * 8 + 4); - afpvec8 k5 = 
buffer_ld8(weight_data, w_offset * 8 + 5); - afpvec8 k6 = buffer_ld8(weight_data, w_offset * 8 + 6); - afpvec8 k7 = buffer_ld8(weight_data, w_offset * 8 + 7); - - // sum += v * k; - sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); - sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); - sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); - sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); - sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); - - sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); - sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); - sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); - sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); - - sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); - sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); - sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); - sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); - sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); - - sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); - sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); - sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); - sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); - - v_offset += psc(outw) * psc(outh); - w_offset += 1; - } + afpvec8 v0 = 
buffer_ld8(col_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); + afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7); + + // sum += v * k; + sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]); + sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]); + sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]); + sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]); + + sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]); + sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]); + sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]); + sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]); + + sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]); + sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]); + sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]); + sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]); + + sum3[0].r += dot(v3[0], k0[0]) 
+ dot(v3[1], k0[1]); + sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]); + sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]); + sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]); + sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); + + v_offset += psc(w); + w_offset += 8; } #endif @@ -305,16 +289,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st8(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8_im2col.comp b/src/layer/vulkan/shader/convolution_pack8_im2col.comp index 6d8fd4baf..1732361cd 100644 --- a/src/layer/vulkan/shader/convolution_pack8_im2col.comp +++ b/src/layer/vulkan/shader/convolution_pack8_im2col.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const int stride_w = 1; layout (constant_id = 5) const int stride_h = 1; #define shape_constant_id_offset 6 -layout 
(constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -52,42 +48,40 @@ layout (binding = 1) writeonly buffer col_blob { sfpvec8 col_blob_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); const int maxk = kernel_w * kernel_h; - if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc)) + if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c)) return; - int sy = gx / psc(outw); - int sx = gx % psc(outw); + const int sy = gx / psc(outw); + const int sx = gx % psc(outw); + + const int sz = gy / maxk; + const int k = gy % maxk; - int ky = gy / kernel_w; - int 
kx = gy % kernel_w; + const int ky = k / kernel_w; + const int kx = k % kernel_w; #if NCNN_image_shader - image3d_cp8(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz)); + image3d_cp8(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz)); #else - const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; + const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w; - const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx; + const int gi = gy * psc(outw) * psc(outh) + gx; buffer_cp8(col_blob_data, gi, bottom_blob_data, v_offset); #endif diff --git a/src/layer/vulkan/shader/convolution_pack8to1_gemm.comp b/src/layer/vulkan/shader/convolution_pack8to1_gemm.comp index 33dbc0a62..199d55620 100644 --- a/src/layer/vulkan/shader/convolution_pack8to1_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack8to1_gemm.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout 
(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -73,9 +65,8 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(w) || gy >= psc(outc)) return; afp sum0; @@ -86,9 +77,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld1(bias_data, gz); + sum0 = buffer_ld1(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -107,49 +98,42 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z)); - afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z)); - afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z)); - afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z)); - - afpvec8 k = image3d_ld8(weight_blob, ivec3(kk, z, gz)); - - // sum += dot(v, k); - sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); - sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]); - sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]); - sum3 += dot(v3[0], k[0]) + dot(v3[1], 
k[1]); - } + afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0)); + afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0)); + afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0)); + afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0)); + + afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0)); + + // sum += dot(v, k); + sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); + sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]); + sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]); + sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); } #else - int w_offset = gz * psc(c) * maxk; + int v_offset = gx; + int w_offset = gy * psc(h); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - int v_offset = gx + z * psc(cstep); - - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0); - afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); - afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); - afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); - - afpvec8 k = buffer_ld8(weight_data, w_offset); - - // sum += dot(v, k); - sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); - sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]); - sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]); - sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); - - v_offset += psc(outw) * psc(outh); - w_offset += 1; - } + afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); + afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); + + afpvec8 k = buffer_ld8(weight_data, w_offset); + + // sum += dot(v, k); + sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]); + sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]); + sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]); + sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); + + v_offset += psc(w); + w_offset += 1; } #endif @@ -205,16 +189,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - 
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st1(top_blob_data, gi, sum0); - if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/convolution_pack8to4_gemm.comp b/src/layer/vulkan/shader/convolution_pack8to4_gemm.comp index f7626c98b..7df5a8fdb 100644 --- a/src/layer/vulkan/shader/convolution_pack8to4_gemm.comp +++ b/src/layer/vulkan/shader/convolution_pack8to4_gemm.comp @@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0; layout (constant_id = 5) const float activation_param_1 = 0; #define shape_constant_id_offset 6 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout 
(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -73,9 +65,8 @@ void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc)) + if (gx >= psc(w) || gy >= psc(outc)) return; afpvec4 sum0; @@ -86,9 +77,9 @@ void main() if (bias_term == 1) { #if NCNN_image_shader - sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0)); + sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0)); #else - sum0 = buffer_ld4(bias_data, gz); + sum0 = buffer_ld4(bias_data, gy); #endif sum1 = sum0; sum2 = sum0; @@ -107,85 +98,78 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z)); - afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z)); - afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z)); - afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z)); - - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(kk * 4 + 0, z, gz)); - afpvec8 k1 = 
image3d_ld8(weight_blob, ivec3(kk * 4 + 1, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(kk * 4 + 2, z, gz)); - afpvec8 k3 = image3d_ld8(weight_blob, ivec3(kk * 4 + 3, z, gz)); - - // sum += v * k; - sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); - sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - - sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - - sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); - - sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - } + afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0)); + afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0)); + afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0)); + afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0)); + + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 4 + 0, gy, 0)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 4 + 1, gy, 0)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 4 + 2, gy, 0)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 4 + 3, gy, 0)); + + // sum += v * k; + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v2[0], k0[0]) + 
dot(v2[1], k0[1]); + sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + + sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); } #else - int w_offset = gz * psc(c) * maxk; + int v_offset = gx; + int w_offset = gy * psc(h) * 4; - for (int z = 0; z < psc(c); z++) + for (int z = 0; z < psc(h); z++) { - int v_offset = gx + z * psc(cstep); - - for (int kk = 0; kk < maxk; kk++) - { - afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0); - afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); - afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); - afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); - - afpvec8 k0 = buffer_ld8(weight_data, w_offset * 4 + 0); - afpvec8 k1 = buffer_ld8(weight_data, w_offset * 4 + 1); - afpvec8 k2 = buffer_ld8(weight_data, w_offset * 4 + 2); - afpvec8 k3 = buffer_ld8(weight_data, w_offset * 4 + 3); - - // sum += v * k; - sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); - sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); - sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); - sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); - - sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); - sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); - sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); - sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); - - sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); - sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); - sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); - sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); - - sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); - sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); - sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); - sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); - - v_offset += psc(outw) * psc(outh); - w_offset += 1; - } + 
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0); + afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1); + afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2); + afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3); + + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3); + + // sum += v * k; + sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); + sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]); + sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]); + sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]); + + sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]); + sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]); + sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]); + sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]); + + sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]); + sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]); + sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]); + sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]); + + sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]); + sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]); + sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]); + sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); + + v_offset += psc(w); + w_offset += 4; } #endif @@ -241,16 +225,16 @@ void main() ivec4 sy4 = gx4 / psc(outw); ivec4 sx4 = gx4 % psc(outw); - image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0); - image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1); - image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2); - image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3); + image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0); + image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1); + image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2); + image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3); #else - const int gi = gz * psc(outcstep) + gx; + const int gi = gy * psc(outcstep) + gx; buffer_st4(top_blob_data, gi, sum0); 
- if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1); - if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2); - if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3); + if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1); + if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2); + if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3); #endif } diff --git a/src/layer/vulkan/shader/deconvolution_col2im.comp b/src/layer/vulkan/shader/deconvolution_col2im.comp index 525bcd99b..d65a5dc0e 100644 --- a/src/layer/vulkan/shader/deconvolution_col2im.comp +++ b/src/layer/vulkan/shader/deconvolution_col2im.comp @@ -33,17 +33,13 @@ layout (constant_id = 8) const float activation_param_0 = 0; layout (constant_id = 9) const float activation_param_1 = 0; #define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = 
shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -57,13 +53,9 @@ layout (binding = 2) readonly buffer bias_blob { sfp bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -94,6 +86,8 @@ void main() sum = afp(0.f); } + const int maxk = kernel_w * kernel_h; + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; @@ -116,9 +110,9 @@ void main() w_k /= dilation_w; #if NCNN_image_shader - sum += image3d_ld1(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz)); + sum += image3d_ld1(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0)); #else - const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; + const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; sum += buffer_ld1(col_blob_data, gi); #endif diff --git a/src/layer/vulkan/shader/deconvolution_gemm.comp b/src/layer/vulkan/shader/deconvolution_gemm.comp index 8d19d63a7..994f7b59e 100644 --- a/src/layer/vulkan/shader/deconvolution_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_gemm.comp @@ -21,21 +21,16 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = 
shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define LOCAL_MEMORY_UNROLL_INCH 8 + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; +#if NCNN_shader_local_memory +shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfp tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) return; - - const int maxk = kernel_w * kernel_h; +#endif afp sum0 = afp(0.f); afp sum1 = afp(0.f); @@ -91,7 +86,7 @@ void main() afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afp k = image3d_ld1(weight_blob, ivec3(gy, z, gz)); + afp k = image3d_ld1(weight_blob, ivec3(z, gy, 0)); sum0 += v0 * k; sum1 += v1 * k; @@ -100,8 +95,92 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + 
gy; + int w_offset = gy * psc(c); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afp k = lfp2afp(tmp_k[ly][z4]); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afp k = lfp2afp(tmp_k[ly][z4]); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + } +#else for (int z = 0; z < psc(c); z++) { afp v0 = buffer_ld1(bottom_blob_data, v_offset + 0); @@ -117,17 +196,23 @@ void main() sum3 += v3 * k; v_offset += psc(cstep); - w_offset += maxk; + w_offset += 1; } #endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) + return; 
+#endif #if NCNN_image_shader - image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st1(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st1(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp index d5849c0d4..0e8d4e9c6 100644 --- a/src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp @@ -21,21 +21,16 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define LOCAL_MEMORY_UNROLL_INCH 8 + +#define shape_constant_id_offset 0 +layout 
(constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; +#if NCNN_shader_local_memory +shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) return; - - const int maxk = kernel_w * kernel_h; +#endif afpvec4 sum0 = afpvec4(0.f); afpvec4 sum1 = afpvec4(0.f); @@ -91,7 +86,7 @@ void main() afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k = image3d_ld4(weight_blob, ivec3(gy, z, gz)); + afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0)); sum0 += v0 * k; sum1 += v1 * k; @@ -100,8 +95,92 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < 4) + { + for (int z4 = 0; z4 < 
LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afp v0 = lfp2afp(tmp_v[lx][z4][0]); + afp v1 = lfp2afp(tmp_v[lx][z4][1]); + afp v2 = lfp2afp(tmp_v[lx][z4][2]); + afp v3 = lfp2afp(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + } +#else for (int z = 0; z < psc(c); z++) { afp v0 = buffer_ld1(bottom_blob_data, v_offset + 0); @@ -117,17 +196,23 @@ void main() sum3 += v3 * k; v_offset += psc(cstep); - w_offset += maxk; + w_offset += 1; } #endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) + return; +#endif #if NCNN_image_shader - image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st4(col_blob, 
ivec3(gx4.r, gy, 0), sum0); + image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st4(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp index 02bff4151..3e35198af 100644 --- a/src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp @@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int 
cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outh)) return; - const int maxk = kernel_w * kernel_h; - afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); @@ -92,7 +78,7 @@ void main() afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z)); afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k = image3d_ld8(weight_blob, ivec3(gy, z, gz)); + afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0)); // sum += v * k; sum0[0] += v0 * k[0]; @@ -109,7 +95,7 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c); for (int z = 0; z < psc(c); z++) { @@ -134,17 +120,17 @@ void main() sum3[1] += v3 * k[1]; v_offset += psc(cstep); - w_offset += maxk; + w_offset += 1; } #endif #if NCNN_image_shader - image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * 
psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st8(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack4_col2im.comp b/src/layer/vulkan/shader/deconvolution_pack4_col2im.comp index f7c985c78..8ef5e1678 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4_col2im.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4_col2im.comp @@ -33,17 +33,13 @@ layout (constant_id = 8) const float activation_param_0 = 0; layout (constant_id = 9) const float activation_param_1 = 0; #define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -57,13 +53,9 @@ layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) 
uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -94,6 +86,8 @@ void main() sum = afpvec4(0.f); } + const int maxk = kernel_w * kernel_h; + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; @@ -116,9 +110,9 @@ void main() w_k /= dilation_w; #if NCNN_image_shader - sum += image3d_ld4(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz)); + sum += image3d_ld4(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0)); #else - const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; + const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; sum += buffer_ld4(col_blob_data, gi); #endif diff --git a/src/layer/vulkan/shader/deconvolution_pack4_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack4_gemm.comp index 7beb49b6c..0a2f545d1 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4_gemm.comp @@ -21,21 +21,16 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout 
(constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define LOCAL_MEMORY_UNROLL_INCH 8 + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -44,39 +39,34 @@ layout (binding = 2) uniform unfp sampler3D weight_blob; #else layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer col_blob { sfpvec4 col_blob_data[]; }; -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) -// GL_EXT_shader_16bit_storage does not define f16mat4 type :( layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; -#else -layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; -#endif #endif layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) return; - - const int maxk = kernel_w * kernel_h; +#endif afpvec4 sum0 = afpvec4(0.f); afpvec4 sum1 = afpvec4(0.f); @@ -85,7 +75,6 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - ivec4 gy4 
= gy * 4 + ivec4(0, 1, 2, 3); ivec4 sy4 = gx4 / psc(w); ivec4 sx4 = gx4 % psc(w); @@ -98,10 +87,10 @@ void main() afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); afpmat4 k = afpmat4( - image3d_ld4(weight_blob, ivec3(gy4.r, z, gz)), - image3d_ld4(weight_blob, ivec3(gy4.g, z, gz)), - image3d_ld4(weight_blob, ivec3(gy4.b, z, gz)), - image3d_ld4(weight_blob, ivec3(gy4.a, z, gz)) + image3d_ld4(weight_blob, ivec3(z * 4 + 0, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 1, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 2, gy, 0)), + image3d_ld4(weight_blob, ivec3(z * 4 + 3, gy, 0)) ); sum0 += v0 * k; @@ -111,8 +100,102 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c) * 4; + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep); + w_offset += LOCAL_MEMORY_UNROLL_INCH * 4; + + barrier(); + 
} + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]); + afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]); + afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]); + afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]); + + afpmat4 k = afpmat4(k0, k1, k2, k3); + + sum0 += v0 * k; + sum1 += v1 * k; + sum2 += v2 * k; + sum3 += v3 * k; + } + } +#else for (int z = 0; z < psc(c); z++) { afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0); @@ -120,17 +203,12 @@ void main() afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2); afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3); -#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic) - // GL_EXT_shader_16bit_storage does not define f16mat4 type :( afpmat4 k = afpmat4( - buffer_ld4(weight_data, w_offset * 4 + 0), - buffer_ld4(weight_data, w_offset * 4 + 1), - buffer_ld4(weight_data, w_offset * 4 + 2), - buffer_ld4(weight_data, w_offset * 4 + 3) + buffer_ld4(weight_data, w_offset + 0), + buffer_ld4(weight_data, w_offset + 1), + buffer_ld4(weight_data, w_offset + 2), + buffer_ld4(weight_data, w_offset + 3) ); -#else - afpmat4 k = afpmat4(weight_data[w_offset]); -#endif sum0 += v0 * k; sum1 += v1 * k; @@ -138,17 +216,23 @@ void main() sum3 += v3 * k; v_offset += psc(cstep); - w_offset += maxk; + w_offset += 4; } #endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) + return; +#endif #if NCNN_image_shader - 
image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st4(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st4(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp index 99444f1a6..ee9ac4642 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp @@ -21,21 +21,16 @@ #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define LOCAL_MEMORY_UNROLL_INCH 8 + +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 
0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; +#if NCNN_shader_local_memory +shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4]; +shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH]; +#endif + void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) +#if !NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) return; - - const int maxk = kernel_w * kernel_h; +#endif afp sum0 = afp(0.f); afp sum1 = afp(0.f); @@ -91,7 +86,7 @@ void main() afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k = image3d_ld4(weight_blob, ivec3(gy, z, gz)); + afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0)); sum0 += dot(v0, k); sum1 += dot(v1, k); @@ -100,8 +95,92 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c); + +#if NCNN_shader_local_memory + const int lx = int(gl_LocalInvocationID.x); + const int ly = int(gl_LocalInvocationID.y); + int z = 0; + for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH) + { + if (ly < 4) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_v[lx][z4][ly] = 
sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); + + sum0 += dot(v0, k); + sum1 += dot(v1, k); + sum2 += dot(v2, k); + sum3 += dot(v3, k); + } + + v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep); + w_offset += LOCAL_MEMORY_UNROLL_INCH; + + barrier(); + } + + if (z < psc(c)) + { + const int remain = psc(c) - z; + + if (ly < 4) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]); + } + } + + if (lx == 0) + { + for (int z4 = 0; z4 < remain; z4++) + { + tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]); + } + } + + barrier(); + + for (int z4 = 0; z4 < remain; z4++) + { + afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]); + afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]); + afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]); + afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]); + + afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]); + + sum0 += dot(v0, k); + sum1 += dot(v1, k); + sum2 += dot(v2, k); + sum3 += dot(v3, k); + } + } +#else for (int z = 0; z < psc(c); z++) { afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0); @@ -117,17 +196,23 @@ void main() sum3 += dot(v3, k); v_offset += psc(cstep); - w_offset += maxk; + w_offset += 1; } #endif +#endif + +#if NCNN_shader_local_memory + if (gx >= psc(outw) || gy >= psc(outh)) + return; +#endif #if NCNN_image_shader - image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st1(col_blob, 
ivec3(gx4.a, gy, gz), sum3); + image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st1(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp index d5d43fd85..493d90f0d 100644 --- a/src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp @@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout 
(constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outh)) return; - const int maxk = kernel_w * kernel_h; - afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); @@ -81,8 +67,6 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - ivec4 gy4 = gy * 8 + ivec4(0, 1, 2, 3); - ivec4 gyy4 = gy4 + 4; ivec4 sy4 = gx4 / psc(w); ivec4 sx4 = gx4 % psc(w); @@ -94,14 +78,14 @@ void main() afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec4 k0 = image3d_ld4(weight_blob, ivec3(gy4.r, z, gz)); - afpvec4 k1 = image3d_ld4(weight_blob, ivec3(gy4.g, z, gz)); - afpvec4 k2 = image3d_ld4(weight_blob, ivec3(gy4.b, z, gz)); - afpvec4 k3 = image3d_ld4(weight_blob, ivec3(gy4.a, z, gz)); - afpvec4 k4 = image3d_ld4(weight_blob, ivec3(gyy4.r, z, gz)); - afpvec4 k5 = image3d_ld4(weight_blob, ivec3(gyy4.g, z, gz)); - afpvec4 k6 = image3d_ld4(weight_blob, ivec3(gyy4.b, z, gz)); - afpvec4 k7 = image3d_ld4(weight_blob, ivec3(gyy4.a, z, gz)); + afpvec4 k0 = image3d_ld4(weight_blob, ivec3(z * 8 + 0, gy, 0)); + afpvec4 k1 = image3d_ld4(weight_blob, ivec3(z * 8 + 1, gy, 0)); + afpvec4 k2 
= image3d_ld4(weight_blob, ivec3(z * 8 + 2, gy, 0)); + afpvec4 k3 = image3d_ld4(weight_blob, ivec3(z * 8 + 3, gy, 0)); + afpvec4 k4 = image3d_ld4(weight_blob, ivec3(z * 8 + 4, gy, 0)); + afpvec4 k5 = image3d_ld4(weight_blob, ivec3(z * 8 + 5, gy, 0)); + afpvec4 k6 = image3d_ld4(weight_blob, ivec3(z * 8 + 6, gy, 0)); + afpvec4 k7 = image3d_ld4(weight_blob, ivec3(z * 8 + 7, gy, 0)); // sum += v * k; sum0[0].r += dot(v0, k0); @@ -142,7 +126,7 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c) * 8; for (int z = 0; z < psc(c); z++) { @@ -151,14 +135,14 @@ void main() afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2); afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3); - afpvec4 k0 = buffer_ld4(weight_data, w_offset * 8 + 0); - afpvec4 k1 = buffer_ld4(weight_data, w_offset * 8 + 1); - afpvec4 k2 = buffer_ld4(weight_data, w_offset * 8 + 2); - afpvec4 k3 = buffer_ld4(weight_data, w_offset * 8 + 3); - afpvec4 k4 = buffer_ld4(weight_data, w_offset * 8 + 4); - afpvec4 k5 = buffer_ld4(weight_data, w_offset * 8 + 5); - afpvec4 k6 = buffer_ld4(weight_data, w_offset * 8 + 6); - afpvec4 k7 = buffer_ld4(weight_data, w_offset * 8 + 7); + afpvec4 k0 = buffer_ld4(weight_data, w_offset + 0); + afpvec4 k1 = buffer_ld4(weight_data, w_offset + 1); + afpvec4 k2 = buffer_ld4(weight_data, w_offset + 2); + afpvec4 k3 = buffer_ld4(weight_data, w_offset + 3); + afpvec4 k4 = buffer_ld4(weight_data, w_offset + 4); + afpvec4 k5 = buffer_ld4(weight_data, w_offset + 5); + afpvec4 k6 = buffer_ld4(weight_data, w_offset + 6); + afpvec4 k7 = buffer_ld4(weight_data, w_offset + 7); // sum += v * k; sum0[0].r += dot(v0, k0); @@ -198,17 +182,17 @@ void main() sum3[1].a += dot(v3, k7); v_offset += psc(cstep); - w_offset += maxk; + w_offset += 8; } #endif #if NCNN_image_shader - image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2); - 
image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st8(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack8_col2im.comp b/src/layer/vulkan/shader/deconvolution_pack8_col2im.comp index 1a9d5bf48..f395f81ec 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8_col2im.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8_col2im.comp @@ -34,17 +34,13 @@ layout (constant_id = 8) const float activation_param_0 = 0; layout (constant_id = 9) const float activation_param_1 = 0; #define shape_constant_id_offset 10 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; + +layout (constant_id = shape_constant_id_offset + 2) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 3) const int outh = 0; +layout (constant_id = shape_constant_id_offset + 4) 
const int outc = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D col_blob; @@ -58,13 +54,9 @@ layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; - int c; - int cstep; - int outdims; int outw; int outh; int outc; @@ -95,6 +87,8 @@ void main() sum = afpvec8(afpvec4(0.f), afpvec4(0.f)); } + const int maxk = kernel_w * kernel_h; + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; @@ -117,9 +111,9 @@ void main() w_k /= dilation_w; #if NCNN_image_shader - sum += image3d_ld8(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz)); + sum += image3d_ld8(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0)); #else - const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; + const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx; sum += buffer_ld8(col_blob_data, gi); #endif diff --git a/src/layer/vulkan/shader/deconvolution_pack8_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack8_gemm.comp index 2ce5f0413..e8b6bd0d4 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8_gemm.comp @@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = 
shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outh)) return; - const int maxk = kernel_w * kernel_h; - afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f)); afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f)); @@ -81,8 +67,6 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - ivec4 gy4 = gy * 8 + ivec4(0, 1, 2, 3); - ivec4 gyy4 = gy4 + 4; ivec4 sy4 = gx4 / psc(w); ivec4 sx4 = gx4 % psc(w); @@ -94,14 +78,14 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = 
image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(gy4.r, z, gz)); - afpvec8 k1 = image3d_ld8(weight_blob, ivec3(gy4.g, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(gy4.b, z, gz)); - afpvec8 k3 = image3d_ld8(weight_blob, ivec3(gy4.a, z, gz)); - afpvec8 k4 = image3d_ld8(weight_blob, ivec3(gyy4.r, z, gz)); - afpvec8 k5 = image3d_ld8(weight_blob, ivec3(gyy4.g, z, gz)); - afpvec8 k6 = image3d_ld8(weight_blob, ivec3(gyy4.b, z, gz)); - afpvec8 k7 = image3d_ld8(weight_blob, ivec3(gyy4.a, z, gz)); + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 8 + 0, gy, 0)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 8 + 1, gy, 0)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 8 + 2, gy, 0)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 8 + 3, gy, 0)); + afpvec8 k4 = image3d_ld8(weight_blob, ivec3(z * 8 + 4, gy, 0)); + afpvec8 k5 = image3d_ld8(weight_blob, ivec3(z * 8 + 5, gy, 0)); + afpvec8 k6 = image3d_ld8(weight_blob, ivec3(z * 8 + 6, gy, 0)); + afpvec8 k7 = image3d_ld8(weight_blob, ivec3(z * 8 + 7, gy, 0)); // sum += v * k sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -142,7 +126,7 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c) * 8; for (int z = 0; z < psc(c); z++) { @@ -151,14 +135,14 @@ void main() afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2); afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3); - afpvec8 k0 = buffer_ld8(weight_data, w_offset * 8 + 0); - afpvec8 k1 = buffer_ld8(weight_data, w_offset * 8 + 1); - afpvec8 k2 = buffer_ld8(weight_data, w_offset * 8 + 2); - afpvec8 k3 = buffer_ld8(weight_data, w_offset * 8 + 3); - afpvec8 k4 = buffer_ld8(weight_data, w_offset * 8 + 4); - afpvec8 k5 = buffer_ld8(weight_data, w_offset * 8 + 5); - afpvec8 k6 = buffer_ld8(weight_data, w_offset * 8 + 6); - afpvec8 k7 = buffer_ld8(weight_data, w_offset * 8 + 7); + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = 
buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3); + afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4); + afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5); + afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6); + afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7); // sum += v * k sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -198,17 +182,17 @@ void main() sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]); v_offset += psc(cstep); - w_offset += maxk; + w_offset += 8; } #endif #if NCNN_image_shader - image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st8(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp index bb2444c6b..9c5855ca4 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp @@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) 
const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout (constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outh)) return; - const int maxk = kernel_w * kernel_h; - afp sum0 = afp(0.f); afp sum1 = afp(0.f); afp sum2 = afp(0.f); @@ -92,7 +78,7 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k = image3d_ld8(weight_blob, ivec3(gy, z, gz)); + afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0)); // sum += dot(v, k); sum0 += dot(v0[0], 
k[0]) + dot(v0[1], k[1]); @@ -102,7 +88,7 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c); for (int z = 0; z < psc(c); z++) { @@ -120,17 +106,17 @@ void main() sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]); v_offset += psc(cstep); - w_offset += maxk; + w_offset += 1; } #endif #if NCNN_image_shader - image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st1(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st1(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1); diff --git a/src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp b/src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp index 542cd7b86..061c2a54b 100644 --- a/src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp +++ b/src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp @@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; }; #extension GL_EXT_shader_explicit_arithmetic_types_float16: require #endif -layout (constant_id = 0) const int kernel_w = 1; -layout (constant_id = 1) const int kernel_h = 1; - -#define shape_constant_id_offset 2 -layout (constant_id = shape_constant_id_offset + 0) const int dims = 0; -layout (constant_id = shape_constant_id_offset + 1) const int w = 0; -layout (constant_id = shape_constant_id_offset + 2) const int h = 0; -layout (constant_id = shape_constant_id_offset + 3) const int c = 0; -layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0; - -layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0; -layout 
(constant_id = shape_constant_id_offset + 6) const int outw = 0; -layout (constant_id = shape_constant_id_offset + 7) const int outh = 0; -layout (constant_id = shape_constant_id_offset + 8) const int outc = 0; -layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0; +#define shape_constant_id_offset 0 +layout (constant_id = shape_constant_id_offset + 0) const int w = 0; +layout (constant_id = shape_constant_id_offset + 1) const int h = 0; +layout (constant_id = shape_constant_id_offset + 2) const int c = 0; +layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0; + +layout (constant_id = shape_constant_id_offset + 4) const int outw = 0; +layout (constant_id = shape_constant_id_offset + 5) const int outh = 0; #if NCNN_image_shader layout (binding = 0) uniform unfp sampler3D bottom_blob; @@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; }; layout (push_constant) uniform parameter { - int dims; int w; int h; int c; int cstep; - int outdims; int outw; int outh; - int outc; - int outcstep; } p; void main() { int gx = int(gl_GlobalInvocationID.x) * 4; int gy = int(gl_GlobalInvocationID.y); - int gz = int(gl_GlobalInvocationID.z); - if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc)) + if (gx >= psc(outw) || gy >= psc(outh)) return; - const int maxk = kernel_w * kernel_h; - afpvec4 sum0 = afpvec4(0.f); afpvec4 sum1 = afpvec4(0.f); afpvec4 sum2 = afpvec4(0.f); @@ -81,7 +67,6 @@ void main() #if NCNN_image_shader ivec4 gx4 = gx + ivec4(0, 1, 2, 3); - ivec4 gy4 = gy * 4 + ivec4(0, 1, 2, 3); ivec4 sy4 = gx4 / psc(w); ivec4 sx4 = gx4 % psc(w); @@ -93,10 +78,10 @@ void main() afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z)); afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z)); - afpvec8 k0 = image3d_ld8(weight_blob, ivec3(gy4.r, z, gz)); - afpvec8 k1 = image3d_ld8(weight_blob, ivec3(gy4.g, z, gz)); - afpvec8 k2 = image3d_ld8(weight_blob, ivec3(gy4.b, z, gz)); - afpvec8 
k3 = image3d_ld8(weight_blob, ivec3(gy4.a, z, gz)); + afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 4 + 0, gy, 0)); + afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 4 + 1, gy, 0)); + afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 4 + 2, gy, 0)); + afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 4 + 3, gy, 0)); // sum += v * k sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -121,7 +106,7 @@ void main() } #else int v_offset = gx; - int w_offset = gz * psc(c) * maxk + gy; + int w_offset = gy * psc(c) * 4; for (int z = 0; z < psc(c); z++) { @@ -130,10 +115,10 @@ void main() afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2); afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3); - afpvec8 k0 = buffer_ld8(weight_data, w_offset * 4 + 0); - afpvec8 k1 = buffer_ld8(weight_data, w_offset * 4 + 1); - afpvec8 k2 = buffer_ld8(weight_data, w_offset * 4 + 2); - afpvec8 k3 = buffer_ld8(weight_data, w_offset * 4 + 3); + afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0); + afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1); + afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2); + afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3); // sum += v * k sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]); @@ -157,17 +142,17 @@ void main() sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]); v_offset += psc(cstep); - w_offset += maxk; + w_offset += 4; } #endif #if NCNN_image_shader - image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0); - image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1); - image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2); - image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3); + image3d_st4(col_blob, ivec3(gx4.r, gy, 0), sum0); + image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1); + image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2); + image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3); #else - const int gi = gz * psc(outcstep) + gy * psc(outw) + gx; + const int gi = gy * psc(outw) + gx; buffer_st4(col_blob_data, gi, sum0); if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi 
+ 1, sum1); diff --git a/src/net.cpp b/src/net.cpp index 0fb50f78f..75fa13b7a 100644 --- a/src/net.cpp +++ b/src/net.cpp @@ -1416,6 +1416,9 @@ int Net::load_param(const DataReader& dr) if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false; + // enable local memory optimization on discrete gpu only + if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false; + // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; } @@ -1629,6 +1632,9 @@ int Net::load_param_bin(const DataReader& dr) if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false; + // enable local memory optimization on discrete gpu only + if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false; + // fp16a makes no sense when fp16 storage disabled if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false; } diff --git a/src/option.cpp b/src/option.cpp index 454ea2f9b..59ef948e8 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -65,6 +65,8 @@ Option::Option() flush_denormals = 3; use_local_pool_allocator = true; + + use_shader_local_memory = true; } } // namespace ncnn diff --git a/src/option.h b/src/option.h index 4ee698897..6d695cd1c 100644 --- a/src/option.h +++ b/src/option.h @@ -129,7 +129,9 @@ public: bool use_local_pool_allocator; - bool use_reserved_1; + // enable local memory optimization for gpu inference + bool use_shader_local_memory; + bool use_reserved_2; bool use_reserved_3; bool use_reserved_4;