Browse Source

massive vulkan optimization part2 (#3621)

* Vulkan local memory optimization for conv1x1 pack4 and Winograd on discrete GPUs (dGPU)

* unified innerproduct pipeline creation

* reorder deconvolution weight layout

* flexible local memory data type

* more local memory optimization for conv/deconv gemm
tags/20220420
nihui GitHub 4 years ago
parent
commit
cfcb1cffa9
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
46 changed files with 2917 additions and 2721 deletions
  1. +62
    -0
      src/gpu.cpp
  2. +203
    -165
      src/layer/vulkan/convolution_vulkan.cpp
  3. +92
    -78
      src/layer/vulkan/deconvolution_vulkan.cpp
  4. +45
    -302
      src/layer/vulkan/innerproduct_vulkan.cpp
  5. +0
    -8
      src/layer/vulkan/innerproduct_vulkan.h
  6. +130
    -47
      src/layer/vulkan/shader/convolution_gemm.comp
  7. +18
    -24
      src/layer/vulkan/shader/convolution_im2col.comp
  8. +130
    -47
      src/layer/vulkan/shader/convolution_pack1to4_gemm.comp
  9. +51
    -67
      src/layer/vulkan/shader/convolution_pack1to8_gemm.comp
  10. +116
    -21
      src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
  11. +0
    -139
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp
  12. +37
    -37
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp
  13. +37
    -37
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp
  14. +0
    -139
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_gemm.comp
  15. +77
    -77
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp
  16. +75
    -75
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp
  17. +238
    -0
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp
  18. +152
    -69
      src/layer/vulkan/shader/convolution_pack4_gemm.comp
  19. +18
    -24
      src/layer/vulkan/shader/convolution_pack4_im2col.comp
  20. +130
    -47
      src/layer/vulkan/shader/convolution_pack4to1_gemm.comp
  21. +124
    -140
      src/layer/vulkan/shader/convolution_pack4to8_gemm.comp
  22. +0
    -198
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp
  23. +37
    -37
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp
  24. +37
    -37
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp
  25. +76
    -76
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp
  26. +74
    -74
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp
  27. +36
    -36
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd_gemm.comp
  28. +124
    -140
      src/layer/vulkan/shader/convolution_pack8_gemm.comp
  29. +18
    -24
      src/layer/vulkan/shader/convolution_pack8_im2col.comp
  30. +49
    -65
      src/layer/vulkan/shader/convolution_pack8to1_gemm.comp
  31. +84
    -100
      src/layer/vulkan/shader/convolution_pack8to4_gemm.comp
  32. +11
    -17
      src/layer/vulkan/shader/deconvolution_col2im.comp
  33. +116
    -31
      src/layer/vulkan/shader/deconvolution_gemm.comp
  34. +116
    -31
      src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp
  35. +17
    -31
      src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp
  36. +11
    -17
      src/layer/vulkan/shader/deconvolution_pack4_col2im.comp
  37. +133
    -49
      src/layer/vulkan/shader/deconvolution_pack4_gemm.comp
  38. +116
    -31
      src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp
  39. +32
    -48
      src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp
  40. +11
    -17
      src/layer/vulkan/shader/deconvolution_pack8_col2im.comp
  41. +32
    -48
      src/layer/vulkan/shader/deconvolution_pack8_gemm.comp
  42. +17
    -31
      src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp
  43. +24
    -39
      src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp
  44. +6
    -0
      src/net.cpp
  45. +2
    -0
      src/option.cpp
  46. +3
    -1
      src/option.h

+ 62
- 0
src/gpu.cpp View File

@@ -3210,6 +3210,63 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("afpmat4", "mat4"));
}

if (opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("lfp", "float16_t"));
custom_defines.push_back(std::make_pair("lfpvec4", "f16vec4"));
}
else if (opt.use_fp16_storage || opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("lfp", "float"));
custom_defines.push_back(std::make_pair("lfpvec4", "uvec2"));
}
else
{
custom_defines.push_back(std::make_pair("lfp", "float"));
custom_defines.push_back(std::make_pair("lfpvec4", "vec4"));
}

if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v"));
}
else if (opt.use_fp16_packed && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "float16_t(v)"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "f16vec4(vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y)))"));
}
else if (opt.use_fp16_storage)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "uvec2(packHalf2x16(vec4(v).rg),packHalf2x16(vec4(v).ba))"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))"));
}
else if (opt.use_fp16_packed)
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "vec4(unpackHalf2x16(v.x),unpackHalf2x16(v.y))"));
}
else
{
custom_defines.push_back(std::make_pair("sfp2lfp(v)", "v"));
custom_defines.push_back(std::make_pair("sfp2lfpvec4(v)", "v"));

custom_defines.push_back(std::make_pair("lfp2afp(v)", "v"));
custom_defines.push_back(std::make_pair("lfp2afpvec4(v)", "v"));
}

if (opt.use_fp16_storage && opt.use_fp16_arithmetic)
{
custom_defines.push_back(std::make_pair("buffer_ld1(buf,i)", "buf[i]"));
@@ -3546,6 +3603,11 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
}
}

if (opt.use_shader_local_memory)
{
custom_defines.push_back(std::make_pair("NCNN_shader_local_memory", "1"));
}

std::string preamble;
std::vector<std::string> processes;



+ 203
- 165
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -180,16 +180,16 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
block_y = outh_bordered / 4;

shape_winograd_bordered = Mat(w_bordered, h_bordered, shape.c, (void*)0);
shape_winograd_input_transformed = Mat(36, block_x * block_y, shape.c, (void*)0);
shape_winograd_gemm = Mat(36, block_x * block_y, out_shape.c, (void*)0);
shape_winograd_input_transformed = Mat(block_x * block_y, shape.c, 36, (void*)0);
shape_winograd_gemm = Mat(block_x * block_y, out_shape.c, 36, (void*)0);
shape_winograd_out_bordered = Mat(outw_bordered, outh_bordered, out_shape.c, (void*)0);
}

if (shape_winograd_bordered.dims == 3) shape_winograd_bordered_packed = Mat(shape_winograd_bordered.w, shape_winograd_bordered.h, shape_winograd_bordered.c / elempack, (void*)0, elemsize, elempack);

if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h, shape_winograd_input_transformed.c / elempack, (void*)0, elemsize, elempack);
if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h / elempack, 36, (void*)0, elemsize, elempack);

if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h, shape_winograd_gemm.c / out_elempack, (void*)0, out_elemsize, out_elempack);
if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h / out_elempack, 36, (void*)0, out_elemsize, out_elempack);

if (shape_winograd_out_bordered.dims == 3) shape_winograd_out_bordered_packed = Mat(shape_winograd_out_bordered.w, shape_winograd_out_bordered.h, shape_winograd_out_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack);

@@ -203,7 +203,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
opt.use_image_storage = false;
}

Mat weight_data_packed_tm(36, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
Mat weight_data_packed_tm(num_input / elempack, num_output / out_elempack, 36, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
if (!vkdev->shape_support_image_storage(weight_data_packed_tm))
{
support_image_storage = false;
@@ -231,16 +231,16 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
block_y = outh_bordered / 2;

shape_winograd_bordered = Mat(w_bordered, h_bordered, shape.c, (void*)0);
shape_winograd_input_transformed = Mat(16, block_x * block_y, shape.c, (void*)0);
shape_winograd_gemm = Mat(16, block_x * block_y, out_shape.c, (void*)0);
shape_winograd_input_transformed = Mat(block_x * block_y, shape.c, 16, (void*)0);
shape_winograd_gemm = Mat(block_x * block_y, out_shape.c, 16, (void*)0);
shape_winograd_out_bordered = Mat(outw_bordered, outh_bordered, out_shape.c, (void*)0);
}

if (shape_winograd_bordered.dims == 3) shape_winograd_bordered_packed = Mat(shape_winograd_bordered.w, shape_winograd_bordered.h, shape_winograd_bordered.c / elempack, (void*)0, elemsize, elempack);

if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h, shape_winograd_input_transformed.c / elempack, (void*)0, elemsize, elempack);
if (shape_winograd_input_transformed.dims == 3) shape_winograd_input_transformed_packed = Mat(shape_winograd_input_transformed.w, shape_winograd_input_transformed.h / elempack, 16, (void*)0, elemsize, elempack);

if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h, shape_winograd_gemm.c / out_elempack, (void*)0, out_elemsize, out_elempack);
if (shape_winograd_gemm.dims == 3) shape_winograd_gemm_packed = Mat(shape_winograd_gemm.w, shape_winograd_gemm.h / out_elempack, 16, (void*)0, out_elemsize, out_elempack);

if (shape_winograd_out_bordered.dims == 3) shape_winograd_out_bordered_packed = Mat(shape_winograd_out_bordered.w, shape_winograd_out_bordered.h, shape_winograd_out_bordered.c / out_elempack, (void*)0, out_elemsize, out_elempack);

@@ -254,7 +254,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
opt.use_image_storage = false;
}

Mat weight_data_packed_tm(16, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
Mat weight_data_packed_tm(num_input / elempack, num_output / out_elempack, 16, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
if (!vkdev->shape_support_image_storage(weight_data_packed_tm))
{
support_image_storage = false;
@@ -273,10 +273,10 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
// im2col + gemm
if (shape.dims != 0 && out_shape.dims != 0)
{
shape_col = Mat(out_shape.w * out_shape.h, kernel_w * kernel_h, shape.c, (void*)0);
shape_col = Mat(out_shape.w * out_shape.h, kernel_w * kernel_h * shape.c, (void*)0);
}

Mat shape_col_packed = Mat(shape_col.w, shape_col.h, shape_col.c / elempack, (void*)0, elemsize, elempack);
if (shape_col.dims == 2) shape_col_packed = Mat(shape_col.w, shape_col.h / elempack, (void*)0, elemsize, elempack);

// check blob shape
if (!vkdev->shape_support_image_storage(shape_col_packed))
@@ -363,7 +363,14 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;

pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack));
if (opt.use_shader_local_memory)
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8);
}
else
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack));
}
pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
}
else if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 32 && num_output >= 32 && ((elempack == 4 && out_elempack == 4) || (elempack == 8 && out_elempack == 8)))
@@ -433,19 +440,27 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
}

{
std::vector<vk_specialization_type> specializations(0 + 5);
specializations[0 + 0].i = shape_winograd_input_transformed_packed.c;
specializations[0 + 1].i = shape_winograd_input_transformed_packed.cstep;
specializations[0 + 2].i = shape_winograd_gemm_packed.h;
specializations[0 + 3].i = shape_winograd_gemm_packed.c;
specializations[0 + 4].i = shape_winograd_gemm_packed.cstep;
std::vector<vk_specialization_type> specializations(1 + 5);
specializations[0].i = 36;
specializations[1 + 0].i = shape_winograd_input_transformed_packed.h;
specializations[1 + 1].i = shape_winograd_input_transformed_packed.cstep;
specializations[1 + 2].i = shape_winograd_gemm_packed.w;
specializations[1 + 3].i = shape_winograd_gemm_packed.h;
specializations[1 + 4].i = shape_winograd_gemm_packed.cstep;

int shader_type_index = -1;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd43_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd43_gemm;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd_gemm;

pipeline_convolution_3x3s1d1_winograd43_gemm = new Pipeline(vkdev);
pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(4, 4, std::min(4, num_output / out_elempack));
if (opt.use_shader_local_memory)
{
pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_3x3s1d1_winograd43_gemm->set_local_size_xyz(4, std::min(4, num_output / out_elempack), 4);
}
pipeline_convolution_3x3s1d1_winograd43_gemm->create(shader_type_index, opt, specializations);
}

@@ -539,19 +554,27 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
}

{
std::vector<vk_specialization_type> specializations(0 + 5);
specializations[0 + 0].i = shape_winograd_input_transformed_packed.c;
specializations[0 + 1].i = shape_winograd_input_transformed_packed.cstep;
specializations[0 + 2].i = shape_winograd_gemm_packed.h;
specializations[0 + 3].i = shape_winograd_gemm_packed.c;
specializations[0 + 4].i = shape_winograd_gemm_packed.cstep;
std::vector<vk_specialization_type> specializations(1 + 5);
specializations[0].i = 16;
specializations[1 + 0].i = shape_winograd_input_transformed_packed.h;
specializations[1 + 1].i = shape_winograd_input_transformed_packed.cstep;
specializations[1 + 2].i = shape_winograd_gemm_packed.w;
specializations[1 + 3].i = shape_winograd_gemm_packed.h;
specializations[1 + 4].i = shape_winograd_gemm_packed.cstep;

int shader_type_index = -1;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd23_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd23_gemm;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_3x3s1d1_winograd_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_3x3s1d1_winograd_gemm;

pipeline_convolution_3x3s1d1_winograd23_gemm = new Pipeline(vkdev);
pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(4, 4, std::min(4, num_output / out_elempack));
if (opt.use_shader_local_memory)
{
pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_3x3s1d1_winograd23_gemm->set_local_size_xyz(4, std::min(4, num_output / out_elempack), 4);
}
pipeline_convolution_3x3s1d1_winograd23_gemm->create(shader_type_index, opt, specializations);
}

@@ -561,7 +584,7 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
specializations[1].i = activation_type;
specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[4 + 0].i = shape_winograd_gemm_packed.c;
specializations[4 + 0].i = shape_winograd_gemm_packed.h;
specializations[4 + 1].i = shape_winograd_gemm_packed.cstep;
specializations[4 + 2].i = block_x;
specializations[4 + 3].i = block_y;
@@ -581,30 +604,25 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
else if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && (num_input >= 16 && num_output >= 16))
{
{
std::vector<vk_specialization_type> specializations(6 + 10);
std::vector<vk_specialization_type> specializations(6 + 6);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6 + 0].i = shape_bordered_packed.dims;
specializations[6 + 1].i = shape_bordered_packed.w;
specializations[6 + 2].i = shape_bordered_packed.h;
specializations[6 + 3].i = shape_bordered_packed.c;
specializations[6 + 4].i = shape_bordered_packed.cstep;
specializations[6 + 5].i = shape_col_packed.dims;
specializations[6 + 6].i = out_shape_packed.w;
specializations[6 + 7].i = out_shape_packed.h;
specializations[6 + 8].i = shape_col_packed.c;
specializations[6 + 9].i = shape_col_packed.cstep;

Mat local_size_xyz(8, 1, std::min(4, num_input / elempack), (void*)0);
specializations[6 + 0].i = shape_bordered_packed.w;
specializations[6 + 1].i = shape_bordered_packed.h;
specializations[6 + 2].i = shape_bordered_packed.c;
specializations[6 + 3].i = shape_bordered_packed.cstep;
specializations[6 + 4].i = out_shape_packed.w;
specializations[6 + 5].i = out_shape_packed.h;

Mat local_size_xyz(8, std::min(4, num_input / elempack), 1, (void*)0);
if (shape_col_packed.dims != 0)
{
local_size_xyz.w = std::min(8, shape_col_packed.w);
local_size_xyz.h = std::min(1, shape_col_packed.h);
local_size_xyz.c = std::min(4, shape_col_packed.c);
local_size_xyz.h = std::min(4, shape_col_packed.h);
}

int shader_type_index = -1;
@@ -618,30 +636,25 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
}

{
std::vector<vk_specialization_type> specializations(6 + 10);
std::vector<vk_specialization_type> specializations(6 + 6);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = bias_term;
specializations[3].i = activation_type;
specializations[4].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[5].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[6 + 0].i = shape_col_packed.dims;
specializations[6 + 1].i = out_shape_packed.w;
specializations[6 + 2].i = out_shape_packed.h;
specializations[6 + 3].i = shape_col_packed.c;
specializations[6 + 4].i = shape_col_packed.cstep;
specializations[6 + 5].i = out_shape_packed.dims;
specializations[6 + 6].i = out_shape_packed.w;
specializations[6 + 7].i = out_shape_packed.h;
specializations[6 + 8].i = out_shape_packed.c;
specializations[6 + 9].i = out_shape_packed.cstep;

Mat local_size_xyz(16, 1, std::min(4, num_output / out_elempack), (void*)0);
specializations[6 + 0].i = shape_col_packed.w;
specializations[6 + 1].i = shape_col_packed.h;
specializations[6 + 2].i = out_shape_packed.w;
specializations[6 + 3].i = out_shape_packed.h;
specializations[6 + 4].i = out_shape_packed.c;
specializations[6 + 5].i = out_shape_packed.cstep;

Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz.w = std::min(16, out_shape_packed.w);
local_size_xyz.h = std::min(1, out_shape_packed.h);
local_size_xyz.c = std::min(4, out_shape_packed.c);
local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
local_size_xyz.h = std::min(4, out_shape_packed.c);
}

int shader_type_index = -1;
@@ -656,7 +669,14 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;

pipeline_convolution_gemm = new Pipeline(vkdev);
pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
if (opt.use_shader_local_memory)
{
pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
}
pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
}
}
@@ -764,9 +784,43 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
int elempack = opt.use_shader_pack8 && num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
int out_elempack = opt.use_shader_pack8 && num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;

bool is_conv1x1s1d1 = kernel_w == 1 && kernel_h == 1 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1;

// src = kw-kh-inch-outch
// dst = pa-pb-kw-kh-inch/pa-outch/pb
Mat weight_data_packed;
if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && (num_input >= 16 && num_output >= 16))
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

weight_data_packed.create(maxk * num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
float* g00 = weight_data_packed.row(q / out_elempack);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int k = 0; k < maxk; k++)
{
for (int i = 0; i < out_elempack; i++)
{
const Mat k0 = weight_data_r2.channel(q + i);

for (int j = 0; j < elempack; j++)
{
const float* k00 = k0.row(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}
else
{
Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

@@ -860,18 +914,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}

// src = 36-inch-outch
// dst = 8a-8b-36-inch/8a-outch/8b
// dst = 8a-8b-inch/8a-outch/8b-36
Mat weight_data_tm_packed;
{
weight_data_tm_packed.create(36, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
weight_data_tm_packed.create(num_input / elempack, num_output / out_elempack, 36, (size_t)4 * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
for (int k = 0; k < 36; k++)
{
float* g00 = weight_data_tm_packed.channel(q / out_elempack);
float* g00 = weight_data_tm_packed.channel(k);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
for (int k = 0; k < 36; k++)
for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int i = 0; i < out_elempack; i++)
{
@@ -950,18 +1004,18 @@ int Convolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
}

// src = 16-inch-outch
// dst = 8a-8b-16-inch/8a-outch/8b
// dst = 8a-8b-inch/8a-outch/8b-16
Mat weight_data_tm_packed;
{
weight_data_tm_packed.create(16, num_input / elempack, num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);
weight_data_tm_packed.create(num_input / elempack, num_output / out_elempack, 16, (size_t)4 * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
for (int k = 0; k < 16; k++)
{
float* g00 = weight_data_tm_packed.channel(q / out_elempack);
float* g00 = weight_data_tm_packed.channel(k);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
for (int k = 0; k < 16; k++)
for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int i = 0; i < out_elempack; i++)
{
@@ -1156,7 +1210,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform input
VkMat bottom_tm_blob;
{
bottom_tm_blob.create(36, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_tm_blob.create(block_x * block_y, channels, 36, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1176,7 +1230,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
VkMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;
dispatcher.c = bottom_tm_blob.h;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_transform_input, bindings, constants, dispatcher);
}
@@ -1184,7 +1238,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// gemm
VkMat top_tm_blob;
{
top_tm_blob.create(36, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_tm_blob.create(block_x * block_y, num_output / out_elempack, 36, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1194,16 +1248,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = weight_data_gpu_tm;

std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[0].i = bottom_tm_blob.h;
constants[1].i = bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[2].i = top_tm_blob.w;
constants[3].i = top_tm_blob.h;
constants[4].i = top_tm_blob.cstep;

VkMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;
dispatcher.w = (top_tm_blob.w + 3) / 4;
dispatcher.h = top_tm_blob.h;
dispatcher.c = 36;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_gemm, bindings, constants, dispatcher);
}
@@ -1221,7 +1275,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = bias_data_gpu;

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[0].i = top_tm_blob.h;
constants[1].i = top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
@@ -1299,7 +1353,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// transform input
VkMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_tm_blob.create(block_x * block_y, channels, 16, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1319,7 +1373,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
VkMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;
dispatcher.c = bottom_tm_blob.h;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher);
}
@@ -1327,7 +1381,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// gemm
VkMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_tm_blob.create(block_x * block_y, num_output / out_elempack, 16, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1337,16 +1391,16 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = weight_data_gpu_tm;

std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[0].i = bottom_tm_blob.h;
constants[1].i = bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[2].i = top_tm_blob.w;
constants[3].i = top_tm_blob.h;
constants[4].i = top_tm_blob.cstep;

VkMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;
dispatcher.w = (top_tm_blob.w + 3) / 4;
dispatcher.h = top_tm_blob.h;
dispatcher.c = 16;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher);
}
@@ -1364,7 +1418,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = bias_data_gpu;

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[0].i = top_tm_blob.h;
constants[1].i = top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
@@ -1410,7 +1464,7 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
// im2col
VkMat bottom_blob_col;
{
bottom_blob_col.create(outw * outh, maxk, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_blob_col.create(outw * outh, maxk * channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_blob_col.empty())
return -100;

@@ -1418,17 +1472,13 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[0] = bottom_blob_bordered;
bindings[1] = bottom_blob_col;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = bottom_blob_bordered.cstep;
constants[5].i = bottom_blob_col.dims;
constants[6].i = outw;
constants[7].i = outh;
constants[8].i = bottom_blob_col.c;
constants[9].i = bottom_blob_col.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = outw;
constants[5].i = outh;

cmd.record_pipeline(pipeline_convolution_im2col, bindings, constants, bottom_blob_col);
}
@@ -1445,22 +1495,18 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
bindings[2] = weight_data_gpu;
bindings[3] = bias_data_gpu;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_col.dims;
constants[1].i = outw;
constants[2].i = outh;
constants[3].i = bottom_blob_col.c;
constants[4].i = bottom_blob_col.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob_col.w;
constants[1].i = bottom_blob_col.h;
constants[2].i = top_blob.w;
constants[3].i = top_blob.h;
constants[4].i = top_blob.c;
constants[5].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
dispatcher.c = top_blob.c;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
}
@@ -1651,7 +1697,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
// transform input
VkImageMat bottom_tm_blob;
{
bottom_tm_blob.create(36, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_tm_blob.create(block_x * block_y, channels, 36, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1671,7 +1717,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;
dispatcher.c = bottom_tm_blob.h;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_transform_input, bindings, constants, dispatcher);
}
@@ -1679,7 +1725,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
// gemm
VkImageMat top_tm_blob;
{
top_tm_blob.create(36, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_tm_blob.create(block_x * block_y, num_output / out_elempack, 36, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1689,16 +1735,16 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
bindings[2] = weight_data_gpu_tm_image;

std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[0].i = bottom_tm_blob.h;
constants[1].i = 0; //bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[2].i = top_tm_blob.w;
constants[3].i = top_tm_blob.h;
constants[4].i = 0; //top_tm_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;
dispatcher.w = (top_tm_blob.w + 3) / 4;
dispatcher.h = top_tm_blob.h;
dispatcher.c = 36;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd43_gemm, bindings, constants, dispatcher);
}
@@ -1716,7 +1762,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
bindings[2] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[0].i = top_tm_blob.h;
constants[1].i = 0; //top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
@@ -1794,7 +1840,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
// transform input
VkImageMat bottom_tm_blob;
{
bottom_tm_blob.create(16, block_x * block_y, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_tm_blob.create(block_x * block_y, channels, 16, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_tm_blob.empty())
return -100;

@@ -1814,7 +1860,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
VkImageMat dispatcher;
dispatcher.w = block_x;
dispatcher.h = block_y;
dispatcher.c = bottom_tm_blob.c;
dispatcher.c = bottom_tm_blob.h;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_transform_input, bindings, constants, dispatcher);
}
@@ -1822,7 +1868,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
// gemm
VkImageMat top_tm_blob;
{
top_tm_blob.create(16, block_x * block_y, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_tm_blob.create(block_x * block_y, num_output / out_elempack, 16, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_tm_blob.empty())
return -100;

@@ -1834,14 +1880,14 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
std::vector<vk_constant_type> constants(5);
constants[0].i = bottom_tm_blob.c;
constants[1].i = 0; //bottom_tm_blob.cstep;
constants[2].i = top_tm_blob.h;
constants[3].i = top_tm_blob.c;
constants[2].i = top_tm_blob.w;
constants[3].i = top_tm_blob.h;
constants[4].i = 0; //top_tm_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = top_tm_blob.w;
dispatcher.h = (top_tm_blob.h + 3) / 4;
dispatcher.c = top_tm_blob.c;
dispatcher.w = (top_tm_blob.w + 3) / 4;
dispatcher.h = top_tm_blob.h;
dispatcher.c = 16;

cmd.record_pipeline(pipeline_convolution_3x3s1d1_winograd23_gemm, bindings, constants, dispatcher);
}
@@ -1859,7 +1905,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
bindings[2] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(7);
constants[0].i = top_tm_blob.c;
constants[0].i = top_tm_blob.h;
constants[1].i = 0; //top_tm_blob.cstep;
constants[2].i = block_x;
constants[3].i = block_y;
@@ -1905,7 +1951,7 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
// im2col
VkImageMat bottom_blob_col;
{
bottom_blob_col.create(outw * outh, maxk, channels, elemsize, elempack, opt.workspace_vkallocator);
bottom_blob_col.create(outw * outh, maxk * channels, elemsize, elempack, opt.workspace_vkallocator);
if (bottom_blob_col.empty())
return -100;

@@ -1913,17 +1959,13 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
bindings[0] = bottom_blob_bordered;
bindings[1] = bottom_blob_col;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_bordered.dims;
constants[1].i = bottom_blob_bordered.w;
constants[2].i = bottom_blob_bordered.h;
constants[3].i = bottom_blob_bordered.c;
constants[4].i = 0; // bottom_blob_bordered.cstep;
constants[5].i = bottom_blob_col.dims;
constants[6].i = outw;
constants[7].i = outh;
constants[8].i = bottom_blob_col.c;
constants[9].i = 0; // bottom_blob_col.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0; // bottom_blob_bordered.cstep;
constants[4].i = outw;
constants[5].i = outh;

cmd.record_pipeline(pipeline_convolution_im2col, bindings, constants, bottom_blob_col);
}
@@ -1940,22 +1982,18 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob_col.dims;
constants[1].i = outw;
constants[2].i = outh;
constants[3].i = bottom_blob_col.c;
constants[4].i = 0; // bottom_blob_col.cstep;
constants[5].i = top_blob.dims;
constants[6].i = top_blob.w;
constants[7].i = top_blob.h;
constants[8].i = top_blob.c;
constants[9].i = 0; // top_blob.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob_col.w;
constants[1].i = bottom_blob_col.h;
constants[2].i = top_blob.w;
constants[3].i = top_blob.h;
constants[4].i = top_blob.c;
constants[5].i = 0; // top_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
dispatcher.c = top_blob.c;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
}


+ 92
- 78
src/layer/vulkan/deconvolution_vulkan.cpp View File

@@ -144,10 +144,11 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
Mat out_shape_col;
if (shape.dims != 0 && out_shape.dims != 0)
{
out_shape_col = Mat(shape.w * shape.h, kernel_w * kernel_h, out_shape.c, (void*)0);
out_shape_col = Mat(shape.w * shape.h, kernel_w * kernel_h * out_shape.c, (void*)0);
}

Mat out_shape_col_packed = Mat(out_shape_col.w, out_shape_col.h, out_shape_col.c / out_elempack, (void*)0, out_elemsize, out_elempack);
Mat out_shape_col_packed;
if (out_shape_col.dims == 2) out_shape_col_packed = Mat(out_shape_col.w, out_shape_col.h / out_elempack, (void*)0, out_elemsize, out_elempack);

// check blob shape
if (!vkdev->shape_support_image_storage(out_shape_col_packed))
@@ -157,26 +158,19 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
}

{
std::vector<vk_specialization_type> specializations(2 + 10);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2 + 0].i = shape_packed.dims;
specializations[2 + 1].i = shape_packed.w;
specializations[2 + 2].i = shape_packed.h;
specializations[2 + 3].i = shape_packed.c;
specializations[2 + 4].i = shape_packed.cstep;
specializations[2 + 5].i = out_shape_col_packed.dims;
specializations[2 + 6].i = out_shape_col_packed.w;
specializations[2 + 7].i = out_shape_col_packed.h;
specializations[2 + 8].i = out_shape_col_packed.c;
specializations[2 + 9].i = out_shape_col_packed.cstep;

Mat local_size_xyz(8, 4, std::min(4, num_output / out_elempack), (void*)0);
std::vector<vk_specialization_type> specializations(0 + 6);
specializations[0 + 0].i = shape_packed.w;
specializations[0 + 1].i = shape_packed.h;
specializations[0 + 2].i = shape_packed.c;
specializations[0 + 3].i = shape_packed.cstep;
specializations[0 + 4].i = out_shape_col_packed.w;
specializations[0 + 5].i = out_shape_col_packed.h;

Mat local_size_xyz(8, std::min(4, num_output / out_elempack), 1, (void*)0);
if (out_shape_col_packed.dims != 0)
{
local_size_xyz.w = std::min(8, out_shape_col_packed.w);
local_size_xyz.h = std::min(4, out_shape_col_packed.h);
local_size_xyz.c = std::min(4, out_shape_col_packed.c);
}

int shader_type_index = -1;
@@ -191,12 +185,19 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::deconvolution_pack8to4_gemm;

pipeline_deconvolution_gemm = new Pipeline(vkdev);
pipeline_deconvolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
if (opt.use_shader_local_memory)
{
pipeline_deconvolution_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_deconvolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
}
pipeline_deconvolution_gemm->create(shader_type_index, opt, specializations);
}

{
std::vector<vk_specialization_type> specializations(10 + 10);
std::vector<vk_specialization_type> specializations(10 + 6);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
@@ -207,16 +208,12 @@ int Deconvolution_vulkan::create_pipeline(const Option& _opt)
specializations[7].i = activation_type;
specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[10 + 0].i = out_shape_col_packed.dims;
specializations[10 + 1].i = shape_packed.w;
specializations[10 + 2].i = shape_packed.h;
specializations[10 + 3].i = out_shape_col_packed.c;
specializations[10 + 4].i = out_shape_col_packed.cstep;
specializations[10 + 5].i = out_shape_bordered_packed.dims;
specializations[10 + 6].i = out_shape_bordered_packed.w;
specializations[10 + 7].i = out_shape_bordered_packed.h;
specializations[10 + 8].i = out_shape_bordered_packed.c;
specializations[10 + 9].i = out_shape_bordered_packed.cstep;
specializations[10 + 0].i = shape_packed.w;
specializations[10 + 1].i = shape_packed.h;
specializations[10 + 2].i = out_shape_bordered_packed.w;
specializations[10 + 3].i = out_shape_bordered_packed.h;
specializations[10 + 4].i = out_shape_bordered_packed.c;
specializations[10 + 5].i = out_shape_bordered_packed.cstep;

Mat local_size_xyz(8, 8, std::min(4, num_output / out_elempack), (void*)0);
if (out_shape_bordered_packed.dims != 0)
@@ -357,7 +354,40 @@ int Deconvolution_vulkan::upload_model(VkTransfer& cmd, const Option& opt)

// src = kw-kh-inch-outch
// dst = pa-pb-kw-kh-inch/pa-outch/pb
// dst = pa-pb-inch/pa-kw-kh-outch/pb (sgemm)
Mat weight_data_packed;
if (opt.use_sgemm_convolution)
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

weight_data_packed.create(num_input / elempack, maxk * num_output / out_elempack, (size_t)4 * elempack * out_elempack, elempack * out_elempack);

for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
{
for (int k = 0; k < maxk; k++)
{
float* g00 = weight_data_packed.row(q / out_elempack * maxk + k);

for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
{
for (int i = 0; i < out_elempack; i++)
{
const Mat k0 = weight_data_r2.channel(q + i);

for (int j = 0; j < elempack; j++)
{
const float* k00 = k0.row(p + j);

g00[0] = k00[k];

g00++;
}
}
}
}
}
}
else
{
Mat weight_data_r2 = weight_data_transposed.reshape(maxk, num_input, num_output);

@@ -446,7 +476,7 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
// gemm
VkMat top_blob_col;
{
top_blob_col.create(w * h, maxk, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_blob_col.create(w * h, maxk * num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_blob_col.empty())
return -100;

@@ -455,22 +485,18 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
bindings[1] = top_blob_col;
bindings[2] = weight_data_gpu;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = bottom_blob.cstep;
constants[5].i = top_blob_col.dims;
constants[6].i = top_blob_col.w;
constants[7].i = top_blob_col.h;
constants[8].i = top_blob_col.c;
constants[9].i = top_blob_col.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob.w;
constants[1].i = bottom_blob.h;
constants[2].i = bottom_blob.c;
constants[3].i = bottom_blob.cstep;
constants[4].i = top_blob_col.w;
constants[5].i = top_blob_col.h;

VkMat dispatcher;
dispatcher.w = (top_blob_col.w + 3) / 4;
dispatcher.h = top_blob_col.h;
dispatcher.c = top_blob_col.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_deconvolution_gemm, bindings, constants, dispatcher);
}
@@ -493,17 +519,13 @@ int Deconvolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkC
bindings[1] = top_blob_bordered;
bindings[2] = bias_data_gpu;

std::vector<vk_constant_type> constants(10);
constants[0].i = top_blob_col.dims;
constants[1].i = w;
constants[2].i = h;
constants[3].i = top_blob_col.c;
constants[4].i = top_blob_col.cstep;
constants[5].i = top_blob_bordered.dims;
constants[6].i = top_blob_bordered.w;
constants[7].i = top_blob_bordered.h;
constants[8].i = top_blob_bordered.c;
constants[9].i = top_blob_bordered.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = w;
constants[1].i = h;
constants[2].i = top_blob_bordered.w;
constants[3].i = top_blob_bordered.h;
constants[4].i = top_blob_bordered.c;
constants[5].i = top_blob_bordered.cstep;

cmd.record_pipeline(pipeline_deconvolution_col2im, bindings, constants, top_blob_bordered);
}
@@ -644,7 +666,7 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top
// gemm
VkImageMat top_blob_col;
{
top_blob_col.create(w * h, maxk, num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
top_blob_col.create(w * h, maxk * num_output / out_elempack, out_elemsize, out_elempack, opt.workspace_vkallocator);
if (top_blob_col.empty())
return -100;

@@ -653,22 +675,18 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top
bindings[1] = top_blob_col;
bindings[2] = weight_data_gpu_image;

std::vector<vk_constant_type> constants(10);
constants[0].i = bottom_blob.dims;
constants[1].i = bottom_blob.w;
constants[2].i = bottom_blob.h;
constants[3].i = bottom_blob.c;
constants[4].i = 0; //bottom_blob.cstep;
constants[5].i = top_blob_col.dims;
constants[6].i = top_blob_col.w;
constants[7].i = top_blob_col.h;
constants[8].i = top_blob_col.c;
constants[9].i = 0; //top_blob_col.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = bottom_blob.w;
constants[1].i = bottom_blob.h;
constants[2].i = bottom_blob.c;
constants[3].i = 0; // bottom_blob.cstep;
constants[4].i = top_blob_col.w;
constants[5].i = top_blob_col.h;

VkImageMat dispatcher;
dispatcher.w = (top_blob_col.w + 3) / 4;
dispatcher.h = top_blob_col.h;
dispatcher.c = top_blob_col.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_deconvolution_gemm, bindings, constants, dispatcher);
}
@@ -691,17 +709,13 @@ int Deconvolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top
bindings[1] = top_blob_bordered;
bindings[2] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(10);
constants[0].i = top_blob_col.dims;
constants[1].i = w;
constants[2].i = h;
constants[3].i = top_blob_col.c;
constants[4].i = 0; //top_blob_col.cstep;
constants[5].i = top_blob_bordered.dims;
constants[6].i = top_blob_bordered.w;
constants[7].i = top_blob_bordered.h;
constants[8].i = top_blob_bordered.c;
constants[9].i = 0; //top_blob_bordered.cstep;
std::vector<vk_constant_type> constants(6);
constants[0].i = w;
constants[1].i = h;
constants[2].i = top_blob_bordered.w;
constants[3].i = top_blob_bordered.h;
constants[4].i = top_blob_bordered.c;
constants[5].i = 0; //top_blob_bordered.cstep;

cmd.record_pipeline(pipeline_deconvolution_col2im, bindings, constants, top_blob_bordered);
}


+ 45
- 302
src/layer/vulkan/innerproduct_vulkan.cpp View File

@@ -27,14 +27,6 @@ InnerProduct_vulkan::InnerProduct_vulkan()
flatten = 0;

pipeline_innerproduct = 0;
pipeline_innerproduct_pack4 = 0;
pipeline_innerproduct_pack1to4 = 0;
pipeline_innerproduct_pack4to1 = 0;
pipeline_innerproduct_pack8 = 0;
pipeline_innerproduct_pack1to8 = 0;
pipeline_innerproduct_pack4to8 = 0;
pipeline_innerproduct_pack8to4 = 0;
pipeline_innerproduct_pack8to1 = 0;

pipeline_innerproduct_gemm = 0;
}
@@ -110,64 +102,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
local_size_xyz.c = 1;
}

{
pipeline_innerproduct_gemm = new Pipeline(vkdev);
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);

// pack1
if (in_elempack == 1 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations);
}

// pack4
if (in_elempack == 4 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations);
}

// pack1to4
if (in_elempack == 1 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations);
}

// pack4to1
if (in_elempack == 4 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations);
}

// pack8
if (in_elempack == 8 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations);
}

// pack1to8
if (in_elempack == 1 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations);
}

// pack4to8
if (in_elempack == 4 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations);
}

// pack8to4
if (in_elempack == 8 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations);
}

// pack8to1
if (in_elempack == 8 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations);
}
}
int shader_type_index = -1;
if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm;
if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp4;
if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to4;
if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to1;
if (in_elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp8;
if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to8;
if (in_elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to1;
if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to8;
if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to4;

pipeline_innerproduct_gemm = new Pipeline(vkdev);
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);

return 0;
}
@@ -268,77 +216,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)
local_size_xyz.c = 1;
}

// pack1
if (in_elempack == 1 && out_elempack == 1)
{
pipeline_innerproduct = new Pipeline(vkdev);
pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct->create(LayerShaderType::innerproduct, opt, specializations);
}

// pack4
if (in_elempack == 4 && out_elempack == 4)
{
pipeline_innerproduct_pack4 = new Pipeline(vkdev);
pipeline_innerproduct_pack4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack4->create(LayerShaderType::innerproduct_pack4, opt, specializations);
}

// pack1to4
if (in_elempack == 1 && out_elempack == 4)
{
pipeline_innerproduct_pack1to4 = new Pipeline(vkdev);
pipeline_innerproduct_pack1to4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack1to4->create(LayerShaderType::innerproduct_pack1to4, opt, specializations);
}
int shader_type_index = -1;
if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct;
if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack4;
if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack1to4;
if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_pack4to1;
if (in_elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_pack8;
if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_pack1to8;
if (in_elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_pack8to1;
if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_pack4to8;
if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_pack8to4;

// pack4to1
if (in_elempack == 4 && out_elempack == 1)
{
pipeline_innerproduct_pack4to1 = new Pipeline(vkdev);
pipeline_innerproduct_pack4to1->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack4to1->create(LayerShaderType::innerproduct_pack4to1, opt, specializations);
}

// pack8
if (in_elempack == 8 && out_elempack == 8)
{
pipeline_innerproduct_pack8 = new Pipeline(vkdev);
pipeline_innerproduct_pack8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack8->create(LayerShaderType::innerproduct_pack8, opt, specializations);
}

// pack1to8
if (in_elempack == 1 && out_elempack == 8)
{
pipeline_innerproduct_pack1to8 = new Pipeline(vkdev);
pipeline_innerproduct_pack1to8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack1to8->create(LayerShaderType::innerproduct_pack1to8, opt, specializations);
}

// pack4to8
if (in_elempack == 4 && out_elempack == 8)
{
pipeline_innerproduct_pack4to8 = new Pipeline(vkdev);
pipeline_innerproduct_pack4to8->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack4to8->create(LayerShaderType::innerproduct_pack4to8, opt, specializations);
}

// pack8to4
if (in_elempack == 8 && out_elempack == 4)
{
pipeline_innerproduct_pack8to4 = new Pipeline(vkdev);
pipeline_innerproduct_pack8to4->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack8to4->create(LayerShaderType::innerproduct_pack8to4, opt, specializations);
}

// pack8to1
if (in_elempack == 8 && out_elempack == 1)
{
pipeline_innerproduct_pack8to1 = new Pipeline(vkdev);
pipeline_innerproduct_pack8to1->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_pack8to1->create(LayerShaderType::innerproduct_pack8to1, opt, specializations);
}
pipeline_innerproduct = new Pipeline(vkdev);
pipeline_innerproduct->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct->create(shader_type_index, opt, specializations);

// gemm for no shape hint
if (shape.dims == 0)
@@ -361,64 +252,20 @@ int InnerProduct_vulkan::create_pipeline(const Option& _opt)

Mat local_size_xyz(std::min(16, num_output / out_elempack), 4, 1, (void*)0);

{
pipeline_innerproduct_gemm = new Pipeline(vkdev);
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);

// pack1
if (in_elempack == 1 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm, opt, specializations);
}

// pack4
if (in_elempack == 4 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4, opt, specializations);
}

// pack1to4
if (in_elempack == 1 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to4, opt, specializations);
}

// pack4to1
if (in_elempack == 4 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to1, opt, specializations);
}

// pack8
if (in_elempack == 8 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8, opt, specializations);
}

// pack1to8
if (in_elempack == 1 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp1to8, opt, specializations);
}

// pack4to8
if (in_elempack == 4 && out_elempack == 8)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp4to8, opt, specializations);
}

// pack8to4
if (in_elempack == 8 && out_elempack == 4)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to4, opt, specializations);
}

// pack8to1
if (in_elempack == 8 && out_elempack == 1)
{
pipeline_innerproduct_gemm->create(LayerShaderType::innerproduct_gemm_wp8to1, opt, specializations);
}
}
int shader_type_index = -1;
if (in_elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm;
if (in_elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp4;
if (in_elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to4;
if (in_elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to1;
if (in_elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp8;
if (in_elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp1to8;
if (in_elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to1;
if (in_elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::innerproduct_gemm_wp4to8;
if (in_elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::innerproduct_gemm_wp8to4;

pipeline_innerproduct_gemm = new Pipeline(vkdev);
pipeline_innerproduct_gemm->set_optimal_local_size_xyz(local_size_xyz);
pipeline_innerproduct_gemm->create(shader_type_index, opt, specializations);

return 0;
}
@@ -438,30 +285,6 @@ int InnerProduct_vulkan::destroy_pipeline(const Option& opt)
delete pipeline_innerproduct;
pipeline_innerproduct = 0;

delete pipeline_innerproduct_pack4;
pipeline_innerproduct_pack4 = 0;

delete pipeline_innerproduct_pack1to4;
pipeline_innerproduct_pack1to4 = 0;

delete pipeline_innerproduct_pack4to1;
pipeline_innerproduct_pack4to1 = 0;

delete pipeline_innerproduct_pack8;
pipeline_innerproduct_pack8 = 0;

delete pipeline_innerproduct_pack1to8;
pipeline_innerproduct_pack1to8 = 0;

delete pipeline_innerproduct_pack4to8;
pipeline_innerproduct_pack4to8 = 0;

delete pipeline_innerproduct_pack8to4;
pipeline_innerproduct_pack8to4 = 0;

delete pipeline_innerproduct_pack8to1;
pipeline_innerproduct_pack8to1 = 0;

delete pipeline_innerproduct_gemm;
pipeline_innerproduct_gemm = 0;

@@ -586,14 +409,12 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
constants[8].i = top_blob_unpacked.c;
constants[9].i = top_blob_unpacked.cstep;

const Pipeline* pipeline = pipeline_innerproduct_gemm;

VkMat dispatcher;
dispatcher.w = top_blob_unpacked.w / out_elempack;
dispatcher.h = top_blob_unpacked.h;
dispatcher.c = 1;

cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
cmd.record_pipeline(pipeline_innerproduct_gemm, bindings, constants, dispatcher);

// packing
if (elempack > 1)
@@ -645,45 +466,7 @@ int InnerProduct_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCo
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

const Pipeline* pipeline = 0;
if (in_elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_innerproduct;
}
else if (in_elempack == 4 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack4;
}
else if (in_elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack1to4;
}
else if (in_elempack == 4 && out_elempack == 1)
{
pipeline = pipeline_innerproduct_pack4to1;
}
else if (in_elempack == 8 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack8;
}
else if (in_elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack1to8;
}
else if (in_elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack4to8;
}
else if (in_elempack == 8 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack8to4;
}
else if (in_elempack == 8 && out_elempack == 1)
{
pipeline = pipeline_innerproduct_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
cmd.record_pipeline(pipeline_innerproduct, bindings, constants, top_blob);

return 0;
}
@@ -742,14 +525,12 @@ int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_
constants[8].i = top_blob_unpacked.c;
constants[9].i = 0; //top_blob_unpacked.cstep;

const Pipeline* pipeline = pipeline_innerproduct_gemm;

VkImageMat dispatcher;
dispatcher.w = top_blob_unpacked.w / out_elempack;
dispatcher.h = top_blob_unpacked.h;
dispatcher.c = 1;

cmd.record_pipeline(pipeline, bindings, constants, dispatcher);
cmd.record_pipeline(pipeline_innerproduct_gemm, bindings, constants, dispatcher);

// packing
if (elempack > 1)
@@ -801,45 +582,7 @@ int InnerProduct_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_
constants[8].i = top_blob.c;
constants[9].i = 0; //top_blob.cstep;

const Pipeline* pipeline = 0;
if (in_elempack == 1 && out_elempack == 1)
{
pipeline = pipeline_innerproduct;
}
else if (in_elempack == 4 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack4;
}
else if (in_elempack == 1 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack1to4;
}
else if (in_elempack == 4 && out_elempack == 1)
{
pipeline = pipeline_innerproduct_pack4to1;
}
else if (in_elempack == 8 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack8;
}
else if (in_elempack == 1 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack1to8;
}
else if (in_elempack == 4 && out_elempack == 8)
{
pipeline = pipeline_innerproduct_pack4to8;
}
else if (in_elempack == 8 && out_elempack == 4)
{
pipeline = pipeline_innerproduct_pack8to4;
}
else if (in_elempack == 8 && out_elempack == 1)
{
pipeline = pipeline_innerproduct_pack8to1;
}

cmd.record_pipeline(pipeline, bindings, constants, top_blob);
cmd.record_pipeline(pipeline_innerproduct, bindings, constants, top_blob);

return 0;
}


+ 0
- 8
src/layer/vulkan/innerproduct_vulkan.h View File

@@ -43,14 +43,6 @@ public:
VkImageMat bias_data_gpu_image;

Pipeline* pipeline_innerproduct;
Pipeline* pipeline_innerproduct_pack4;
Pipeline* pipeline_innerproduct_pack1to4;
Pipeline* pipeline_innerproduct_pack4to1;
Pipeline* pipeline_innerproduct_pack8;
Pipeline* pipeline_innerproduct_pack1to8;
Pipeline* pipeline_innerproduct_pack4to8;
Pipeline* pipeline_innerproduct_pack8to4;
Pipeline* pipeline_innerproduct_pack8to1;

Pipeline* pipeline_innerproduct_gemm;
};


+ 130
- 47
src/layer/vulkan/shader/convolution_gemm.comp View File

@@ -21,6 +21,8 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int bias_term = 0;
@@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfp tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

afp sum0;
afp sum1;
@@ -85,9 +85,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld1(bias_data, gz);
sum0 = buffer_ld1(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -106,48 +106,131 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, z, 0));

afp k = image3d_ld1(weight_blob, ivec3(z, gy, 0));

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
int v_offset = gx;
int w_offset = gy * psc(h);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH)
{
for (int kk = 0; kk < maxk; kk++)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

if (lx == 0)
{
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, kk, z));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]);
}
}

barrier();

afp k = image3d_ld1(weight_blob, ivec3(kk, z, gz));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afp k = lfp2afp(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}
#else
int w_offset = gz * psc(c) * maxk;

for (int z = 0; z < psc(c); z++)
if (z < psc(h))
{
int v_offset = gx + z * psc(cstep);
const int remain = psc(h) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

for (int kk = 0; kk < maxk; kk++)
if (lx == 0)
{
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]);
}
}

barrier();

afp k = buffer_ld1(weight_data, w_offset);
for (int z4 = 0; z4 < remain; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afp k = lfp2afp(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
}
#else
for (int z = 0; z < psc(h); z++)
{
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);

afp k = buffer_ld1(weight_data, w_offset);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(w);
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

if (activation_type == 1)
@@ -202,16 +285,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st1(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3);
#endif
}

+ 18
- 24
src/layer/vulkan/shader/convolution_im2col.comp View File

@@ -29,17 +29,13 @@ layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -51,42 +47,40 @@ layout (binding = 1) writeonly buffer col_blob { sfp col_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

const int maxk = kernel_w * kernel_h;

if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c))
return;

int sy = gx / psc(outw);
int sx = gx % psc(outw);
const int sy = gx / psc(outw);
const int sx = gx % psc(outw);

const int sz = gy / maxk;
const int k = gy % maxk;

int ky = gy / kernel_w;
int kx = gy % kernel_w;
const int ky = k / kernel_w;
const int kx = k % kernel_w;

#if NCNN_image_shader
image3d_cp1(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz));
image3d_cp1(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz));
#else
const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;

const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx;
const int gi = gy * psc(outw) * psc(outh) + gx;

buffer_cp1(col_blob_data, gi, bottom_blob_data, v_offset);
#endif


+ 130
- 47
src/layer/vulkan/shader/convolution_pack1to4_gemm.comp View File

@@ -21,6 +21,8 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int bias_term = 0;
@@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

afpvec4 sum0;
afpvec4 sum1;
@@ -85,9 +85,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld4(bias_data, gz);
sum0 = buffer_ld4(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -106,48 +106,131 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, z, 0));

afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0));

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
int v_offset = gx;
int w_offset = gy * psc(h);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH)
{
for (int kk = 0; kk < maxk; kk++)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

if (lx == 0)
{
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, kk, z));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

afpvec4 k = image3d_ld4(weight_blob, ivec3(kk, z, gz));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}
#else
int w_offset = gz * psc(c) * maxk;

for (int z = 0; z < psc(c); z++)
if (z < psc(h))
{
int v_offset = gx + z * psc(cstep);
const int remain = psc(h) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

for (int kk = 0; kk < maxk; kk++)
if (lx == 0)
{
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

afpvec4 k = buffer_ld4(weight_data, w_offset);
for (int z4 = 0; z4 < remain; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
}
#else
for (int z = 0; z < psc(h); z++)
{
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);

afpvec4 k = buffer_ld4(weight_data, w_offset);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(w);
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

if (activation_type == 1)
@@ -202,16 +285,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3);
#endif
}

+ 51
- 67
src/layer/vulkan/shader/convolution_pack1to8_gemm.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -73,9 +65,8 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(w) || gy >= psc(outc))
return;

afpvec8 sum0;
@@ -86,9 +77,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld8(bias_data, gz);
sum0 = buffer_ld8(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -107,63 +98,56 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
for (int kk = 0; kk < maxk; kk++)
{
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, kk, z));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, kk, z));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, kk, z));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, kk, z));
afp v0 = image3d_ld1(col_blob, ivec3(gx4.r, z, 0));
afp v1 = image3d_ld1(col_blob, ivec3(gx4.g, z, 0));
afp v2 = image3d_ld1(col_blob, ivec3(gx4.b, z, 0));
afp v3 = image3d_ld1(col_blob, ivec3(gx4.a, z, 0));

afpvec8 k = image3d_ld8(weight_blob, ivec3(kk, z, gz));
afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0));

// sum += v * k;
sum0[0] += v0 * k[0];
sum0[1] += v0 * k[1];
// sum += v * k;
sum0[0] += v0 * k[0];
sum0[1] += v0 * k[1];

sum1[0] += v1 * k[0];
sum1[1] += v1 * k[1];
sum1[0] += v1 * k[0];
sum1[1] += v1 * k[1];

sum2[0] += v2 * k[0];
sum2[1] += v2 * k[1];
sum2[0] += v2 * k[0];
sum2[1] += v2 * k[1];

sum3[0] += v3 * k[0];
sum3[1] += v3 * k[1];
}
sum3[0] += v3 * k[0];
sum3[1] += v3 * k[1];
}
#else
int w_offset = gz * psc(c) * maxk;
int v_offset = gx;
int w_offset = gy * psc(h);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
int v_offset = gx + z * psc(cstep);
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);

for (int kk = 0; kk < maxk; kk++)
{
afp v0 = buffer_ld1(col_blob_data, v_offset + 0);
afp v1 = buffer_ld1(col_blob_data, v_offset + 1);
afp v2 = buffer_ld1(col_blob_data, v_offset + 2);
afp v3 = buffer_ld1(col_blob_data, v_offset + 3);
afpvec8 k = buffer_ld8(weight_data, w_offset);

afpvec8 k = buffer_ld8(weight_data, w_offset);
// sum += v * k;
sum0[0] += v0 * k[0];
sum0[1] += v0 * k[1];

// sum += v * k;
sum0[0] += v0 * k[0];
sum0[1] += v0 * k[1];
sum1[0] += v1 * k[0];
sum1[1] += v1 * k[1];

sum1[0] += v1 * k[0];
sum1[1] += v1 * k[1];
sum2[0] += v2 * k[0];
sum2[1] += v2 * k[1];

sum2[0] += v2 * k[0];
sum2[1] += v2 * k[1];
sum3[0] += v3 * k[0];
sum3[1] += v3 * k[1];

sum3[0] += v3 * k[0];
sum3[1] += v3 * k[1];

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
v_offset += psc(w);
w_offset += 1;
}
#endif

@@ -243,16 +227,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3);
#endif
}

+ 116
- 21
src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp View File

@@ -21,6 +21,8 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -53,12 +55,7 @@ layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif

@@ -77,22 +74,25 @@ layout (push_constant) uniform parameter
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4];
#endif

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

#if !NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
return;
#endif
#endif

afpvec4 sum0;
@@ -146,9 +146,103 @@ void main()
sum3 += v3 * k;
}
#else
int w_offset = gz * psc(c);
int w_offset = gz * psc(c) * 4;
int v_offset = gx;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int lz = int(gl_LocalInvocationID.z);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (lz < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep);
w_offset += LOCAL_MEMORY_UNROLL_INCH * 4;

barrier();
}

if (z < psc(c))
{
const int remain = psc(c) - z;

if (lz < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
}
#else
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0);
@@ -156,26 +250,22 @@ void main()
afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
buffer_ld4(weight_data, w_offset * 4 + 0),
buffer_ld4(weight_data, w_offset * 4 + 1),
buffer_ld4(weight_data, w_offset * 4 + 2),
buffer_ld4(weight_data, w_offset * 4 + 3)
buffer_ld4(weight_data, w_offset + 0),
buffer_ld4(weight_data, w_offset + 1),
buffer_ld4(weight_data, w_offset + 2),
buffer_ld4(weight_data, w_offset + 3)
);
#else
afpmat4 k = sfp2afpmat4(weight_data[w_offset]);
#endif

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

w_offset += 1;
w_offset += 4;
v_offset += psc(cstep);
}
#endif
#endif

if (activation_type == 1)
@@ -232,6 +322,11 @@ void main()
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
#else
#if NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
return;
#endif

int gi = gz * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi + 0, sum0);


+ 0
- 139
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_gemm.comp View File

@@ -1,139 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Interface of the pack4 winograd23 GEMM compute shader: optional fp16
// extensions, shape specialization constants, the three resource bindings
// (transformed input, transformed output, transformed weights) and the
// push-constant mirror of the shape values.

// 16-bit extensions are only requested when the build enables the matching mode.
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Shape values as specialization constants (ids 0..4), all defaulting to 0.
// NOTE(review): psc() is defined outside this file; presumably it picks the
// specialization constant when it is non-zero and the push constant otherwise - confirm.
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

// binding 0: transformed input (read), binding 1: transformed output (write),
// binding 2: transformed weights (read). Image mode uses 3D samplers/images,
// buffer mode uses storage buffers of 4-component elements.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
// The weight buffer is declared as vec4 elements in the packed / storage-only
// fp16 modes, and as mat4 elements otherwise; main() indexes it accordingly.
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; };
#else
layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; };
#endif
#endif

// Push-constant copy of the shape values, same names as the specialization constants.
layout (push_constant) uniform parameter
{
int c;
int cstep;

int outh;
int outc;
int outcstep;
} p;

// Winograd23 GEMM stage for pack4 data.
//
// Invocation mapping (from the bounds check below):
//   x -> one of 16 positions (presumably the 4x4 transformed-tile elements - confirm)
//   y -> a group of 4 consecutive rows, gy = 4 * invocation y
//   z -> output channel
// Each invocation accumulates four vec4 sums (rows gy+0..gy+3) over all
// psc(c) input channels, multiplying the input vec4 by a 4x4 weight matrix,
// and writes the four results to the transformed output blob.
void main()
{
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y) * 4;
    int gz = int(gl_GlobalInvocationID.z);

    if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc))
        return;

    afpvec4 sum0 = afpvec4(0.f);
    afpvec4 sum1 = afpvec4(0.f);
    afpvec4 sum2 = afpvec4(0.f);
    afpvec4 sum3 = afpvec4(0.f);

#if NCNN_image_shader
    // The weight matrix for position gx occupies 4 consecutive texels in x.
    int wx = gx * 4;

    for (int z = 0; z < psc(c); z++)
    {
        afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z));
        afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z));
        afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z));
        afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z));

        afpmat4 k = afpmat4(
            image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)),
            image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)),
            image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)),
            image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz))
        );

        sum0 += v0 * k;
        sum1 += v1 * k;
        sum2 += v2 * k;
        sum3 += v3 * k;
    }
#else
    // Rows are 16 elements apart in the input buffer; channels are cstep apart.
    int v_offset = gy * 16 + gx;
    // Weights: 16 matrices per input channel, psc(c) * 16 per output channel.
    int w_offset = gz * psc(c) * 16 + gx;

    for (int z = 0; z < psc(c); z++)
    {
        afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0);
        afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 16);
        afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 32);
        afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 48);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
        // GL_EXT_shader_16bit_storage does not define f16mat4 type :(
        // The buffer holds vec4 elements here, so assemble the matrix from 4 columns.
        afpmat4 k = afpmat4(
            buffer_ld4(weight_tm_data, w_offset * 4 + 0),
            buffer_ld4(weight_tm_data, w_offset * 4 + 1),
            buffer_ld4(weight_tm_data, w_offset * 4 + 2),
            buffer_ld4(weight_tm_data, w_offset * 4 + 3)
        );
#else
        // Convert storage type -> arithmetic type explicitly instead of
        // constructing the storage type (sfpmat4) and relying on the two
        // types being identical. sfp2afpmat4 is assumed to come from the
        // shader preamble alongside buffer_ld4 - confirm.
        afpmat4 k = sfp2afpmat4(weight_tm_data[w_offset]);
#endif

        sum0 += v0 * k;
        sum1 += v1 * k;
        sum2 += v2 * k;
        sum3 += v3 * k;

        v_offset += psc(cstep);
        w_offset += 16;
    }
#endif

#if NCNN_image_shader
    // Image stores are issued for all four rows without a tail check.
    image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
    image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
    image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
    image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
#else
    int gi = gz * psc(outcstep) + gy * 16 + gx;

    // Row gy is in range (checked at entry); guard the remaining three rows
    // so the tail group does not write past psc(outh).
    buffer_st4(top_tm_blob_data, gi + 0, sum0);
    if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 16, sum1);
    if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 32, sum2);
    if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 48, sum3);
#endif
}

+ 37
- 37
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_input.comp View File

@@ -59,7 +59,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 4x4
@@ -161,42 +161,42 @@ void main()

// store 16
#if NCNN_image_shader
int y = gy * p.block_x + gx;
image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v10);
image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v11);
image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v12);
image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v13);
image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v20);
image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v21);
image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v22);
image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v23);
image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v30);
image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v31);
image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v32);
image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v33);
int x = gy * psc(block_x) + gx;
image3d_st4(bottom_tm_blob, ivec3(x, gz, 0), v00);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 1), v01);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 2), v02);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 3), v03);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 4), v10);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 5), v11);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 6), v12);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 7), v13);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 8), v20);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 9), v21);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 10), v22);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 11), v23);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 12), v30);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 13), v31);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 14), v32);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 15), v33);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16;
buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v10);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, v11);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v12);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v13);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v20);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v21);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v22);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v23);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v30);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v31);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v32);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v33);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v10);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v11);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v12);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v13);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v20);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v21);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v22);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v23);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v30);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v31);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v32);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v33);
#endif
}

+ 37
- 37
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp View File

@@ -66,48 +66,48 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 16
#if NCNN_image_shader
int sy = gy * p.block_x + gx;
afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz));
afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz));
afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz));
afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz));
afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz));
afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz));
afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz));
afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz));
afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz));
afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz));
afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz));
afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz));
afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz));
afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz));
afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz));
afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz));
int sx = gy * psc(block_x) + gx;
afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 0));
afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 1));
afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 2));
afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 3));
afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 4));
afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 5));
afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 6));
afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 7));
afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 8));
afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 9));
afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 10));
afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 11));
afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 12));
afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 13));
afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 14));
afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 15));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16;
afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0);
afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1);
afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2);
afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3);
afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4);
afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5);
afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6);
afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7);
afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8);
afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9);
afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10);
afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11);
afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12);
afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13);
afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14);
afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0 * psc(cstep));
afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1 * psc(cstep));
afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2 * psc(cstep));
afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3 * psc(cstep));
afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4 * psc(cstep));
afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5 * psc(cstep));
afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6 * psc(cstep));
afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7 * psc(cstep));
afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8 * psc(cstep));
afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9 * psc(cstep));
afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10 * psc(cstep));
afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11 * psc(cstep));
afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12 * psc(cstep));
afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13 * psc(cstep));
afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14 * psc(cstep));
afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15 * psc(cstep));
#endif

// const float itm[2][4] = {


+ 0
- 139
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_gemm.comp View File

@@ -1,139 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

// Interface of the winograd43 pack4 gemm compute shader: optional fp16 extensions,
// five shape specialization constants, three resource bindings and a push-constant
// block that repeats the same five shape values.
// NOTE(review): unfp, imfmtc4, sfpvec4 and sfpmat4 are not defined in this file;
// presumably they are macros supplied when the shader is compiled - confirm.

// fp16 extensions are only required when the matching NCNN_* macro is enabled
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// shape specialization constants occupy ids 0..4 and all default to 0
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

// binding 0 = input (read), binding 1 = output (write only), binding 2 = weights (read)
#if NCNN_image_shader
// image path: inputs are sampled as 3D textures, output is a 3D storage image
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
// buffer path: input and output are flat arrays of sfpvec4 elements
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
// so the weights are exposed as sfpvec4 elements here; main() reads four
// consecutive elements (w_offset * 4 + 0..3) to assemble one matrix
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; };
#else
// one array element per weight matrix
layout (binding = 2) readonly buffer weight_tm_blob { sfpmat4 weight_tm_data[]; };
#endif
#endif

// Push constants with the same names as the specialization constants above.
// NOTE(review): main() reads these values only through psc(name); presumably psc
// falls back to p.name when the specialization constant is left at 0 - confirm
// against the psc definition, which is not part of this file.
layout (push_constant) uniform parameter
{
int c;
int cstep;

int outh;
int outc;
int outcstep;
} p;

// Compute entry point of convolution_pack4_3x3s1d1_winograd43_gemm.
// Each invocation handles column gx, four consecutive rows gy .. gy + 3 and output
// slice gz: it accumulates, over psc(c) iterations, the product of the four input
// elements with one 4x4 weight matrix and writes the four sums to top_tm_blob.
// NOTE(review): psc() and the afp*/sfp*/buffer_*/image3d_* helpers are not defined
// in this file; presumably they are injected when the shader is compiled - confirm.
void main()
{
int gx = int(gl_GlobalInvocationID.x);
// y is scaled by 4 because one invocation processes four rows at once
int gy = int(gl_GlobalInvocationID.y) * 4;
int gz = int(gl_GlobalInvocationID.z);

// drop invocations outside the 36 x outh x outc range
// (36 is presumably the 6x6 element count of one transformed tile - TODO confirm)
if (gx >= 36 || gy >= psc(outh) || gz >= psc(outc))
return;

// one accumulator per processed row
afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
afpvec4 sum2 = afpvec4(0.f);
afpvec4 sum3 = afpvec4(0.f);

#if NCNN_image_shader
// each gx owns four consecutive weight texels along x
int wx = gx * 4;

for (int z = 0; z < psc(c); z++)
{
// four input rows of slice z at column gx
afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 0, z));
afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 1, z));
afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 2, z));
afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx, gy + 3, z));

// weight matrix for (input slice z, output slice gz), assembled from four texels
afpmat4 k = afpmat4(
image3d_ld4(weight_tm_blob, ivec3(wx + 0, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 1, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 2, z, gz)),
image3d_ld4(weight_tm_blob, ivec3(wx + 3, z, gz))
);

// the same matrix is shared by all four rows
sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
// flat offsets: input rows are 36 elements apart; the weights of one output
// slice span psc(c) * 36 matrices
int v_offset = gy * 36 + gx;
int w_offset = gz * psc(c) * 36 + gx;

for (int z = 0; z < psc(c); z++)
{
// rows gy .. gy + 3; these loads are not bounds-checked against outh,
// only the stores at the end of the function are guarded
afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 36);
afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 72);
afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 108);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
// weights are stored as vec4 elements here, so w_offset (counted in
// matrices) is multiplied by 4 and four elements are gathered
afpmat4 k = afpmat4(
buffer_ld4(weight_tm_data, w_offset * 4 + 0),
buffer_ld4(weight_tm_data, w_offset * 4 + 1),
buffer_ld4(weight_tm_data, w_offset * 4 + 2),
buffer_ld4(weight_tm_data, w_offset * 4 + 3)
);
#else
// weights are stored as whole matrices, one element per matrix
afpmat4 k = sfpmat4(weight_tm_data[w_offset]);
#endif

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

// advance to the next iteration's input (cstep elements further) and
// to the next group of 36 weight matrices
v_offset += psc(cstep);
w_offset += 36;
}
#endif

#if NCNN_image_shader
// NOTE(review): rows gy + 1 .. gy + 3 are stored without an outh check here;
// presumably out-of-range image stores are expected to be discarded - confirm.
image3d_st4(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
image3d_st4(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
image3d_st4(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
image3d_st4(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
#else
// flat output offset: slice gz, row gy, column gx (rows are 36 elements apart)
int gi = gz * psc(outcstep) + gy * 36 + gx;

// row gy already passed the early-return check; the other three rows are
// written only when they are still below outh
buffer_st4(top_tm_blob_data, gi + 0, sum0);
if (gy + 1 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 36, sum1);
if (gy + 2 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 72, sum2);
if (gy + 3 < psc(outh)) buffer_st4(top_tm_blob_data, gi + 108, sum3);
#endif
}

+ 77
- 77
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_input.comp View File

@@ -59,7 +59,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 6x6
@@ -259,82 +259,82 @@ void main()

// store 36
#if NCNN_image_shader
int y = gy * p.block_x + gx;
image3d_st4(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st4(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st4(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st4(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st4(bottom_tm_blob, ivec3(4, y, gz), v04);
image3d_st4(bottom_tm_blob, ivec3(5, y, gz), v05);
image3d_st4(bottom_tm_blob, ivec3(6, y, gz), v10);
image3d_st4(bottom_tm_blob, ivec3(7, y, gz), v11);
image3d_st4(bottom_tm_blob, ivec3(8, y, gz), v12);
image3d_st4(bottom_tm_blob, ivec3(9, y, gz), v13);
image3d_st4(bottom_tm_blob, ivec3(10, y, gz), v14);
image3d_st4(bottom_tm_blob, ivec3(11, y, gz), v15);
image3d_st4(bottom_tm_blob, ivec3(12, y, gz), v20);
image3d_st4(bottom_tm_blob, ivec3(13, y, gz), v21);
image3d_st4(bottom_tm_blob, ivec3(14, y, gz), v22);
image3d_st4(bottom_tm_blob, ivec3(15, y, gz), v23);
image3d_st4(bottom_tm_blob, ivec3(16, y, gz), v24);
image3d_st4(bottom_tm_blob, ivec3(17, y, gz), v25);
image3d_st4(bottom_tm_blob, ivec3(18, y, gz), v30);
image3d_st4(bottom_tm_blob, ivec3(19, y, gz), v31);
image3d_st4(bottom_tm_blob, ivec3(20, y, gz), v32);
image3d_st4(bottom_tm_blob, ivec3(21, y, gz), v33);
image3d_st4(bottom_tm_blob, ivec3(22, y, gz), v34);
image3d_st4(bottom_tm_blob, ivec3(23, y, gz), v35);
image3d_st4(bottom_tm_blob, ivec3(24, y, gz), v40);
image3d_st4(bottom_tm_blob, ivec3(25, y, gz), v41);
image3d_st4(bottom_tm_blob, ivec3(26, y, gz), v42);
image3d_st4(bottom_tm_blob, ivec3(27, y, gz), v43);
image3d_st4(bottom_tm_blob, ivec3(28, y, gz), v44);
image3d_st4(bottom_tm_blob, ivec3(29, y, gz), v45);
image3d_st4(bottom_tm_blob, ivec3(30, y, gz), v50);
image3d_st4(bottom_tm_blob, ivec3(31, y, gz), v51);
image3d_st4(bottom_tm_blob, ivec3(32, y, gz), v52);
image3d_st4(bottom_tm_blob, ivec3(33, y, gz), v53);
image3d_st4(bottom_tm_blob, ivec3(34, y, gz), v54);
image3d_st4(bottom_tm_blob, ivec3(35, y, gz), v55);
int x = gy * psc(block_x) + gx;
image3d_st4(bottom_tm_blob, ivec3(x, gz, 0), v00);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 1), v01);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 2), v02);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 3), v03);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 4), v04);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 5), v05);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 6), v10);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 7), v11);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 8), v12);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 9), v13);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 10), v14);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 11), v15);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 12), v20);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 13), v21);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 14), v22);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 15), v23);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 16), v24);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 17), v25);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 18), v30);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 19), v31);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 20), v32);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 21), v33);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 22), v34);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 23), v35);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 24), v40);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 25), v41);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 26), v42);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 27), v43);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 28), v44);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 29), v45);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 30), v50);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 31), v51);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 32), v52);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 33), v53);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 34), v54);
image3d_st4(bottom_tm_blob, ivec3(x, gz, 35), v55);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 36;
buffer_st4(bottom_tm_blob_data, v_tm_offset + 0, v00);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 1, v01);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 2, v02);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 3, v03);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 4, v04);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 5, v05);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 6, v10);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 7, v11);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 8, v12);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 9, v13);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 10, v14);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 11, v15);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 12, v20);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 13, v21);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 14, v22);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 15, v23);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 16, v24);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 17, v25);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 18, v30);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 19, v31);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 20, v32);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 21, v33);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 22, v34);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 23, v35);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 24, v40);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 25, v41);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 26, v42);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 27, v43);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 28, v44);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 29, v45);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 30, v50);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 31, v51);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 32, v52);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 33, v53);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 34, v54);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 35, v55);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
buffer_st4(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v04);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v05);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v10);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v11);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v12);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v13);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v14);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v15);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v20);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v21);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v22);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v23);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 16 * psc(outcstep), v24);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 17 * psc(outcstep), v25);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 18 * psc(outcstep), v30);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 19 * psc(outcstep), v31);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 20 * psc(outcstep), v32);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 21 * psc(outcstep), v33);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 22 * psc(outcstep), v34);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 23 * psc(outcstep), v35);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 24 * psc(outcstep), v40);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 25 * psc(outcstep), v41);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 26 * psc(outcstep), v42);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 27 * psc(outcstep), v43);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 28 * psc(outcstep), v44);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 29 * psc(outcstep), v45);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 30 * psc(outcstep), v50);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 31 * psc(outcstep), v51);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 32 * psc(outcstep), v52);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 33 * psc(outcstep), v53);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 34 * psc(outcstep), v54);
buffer_st4(bottom_tm_blob_data, v_tm_offset + 35 * psc(outcstep), v55);
#endif
}

+ 75
- 75
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp View File

@@ -66,88 +66,88 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 36
#if NCNN_image_shader
int sy = gy * p.block_x + gx;
int sx = gy * psc(block_x) + gx;

afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(0, sy, gz));
afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(1, sy, gz));
afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(2, sy, gz));
afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(3, sy, gz));
afpvec4 v04 = image3d_ld4(top_tm_blob, ivec3(4, sy, gz));
afpvec4 v05 = image3d_ld4(top_tm_blob, ivec3(5, sy, gz));
afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(6, sy, gz));
afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(7, sy, gz));
afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(8, sy, gz));
afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(9, sy, gz));
afpvec4 v14 = image3d_ld4(top_tm_blob, ivec3(10, sy, gz));
afpvec4 v15 = image3d_ld4(top_tm_blob, ivec3(11, sy, gz));
afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(12, sy, gz));
afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(13, sy, gz));
afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(14, sy, gz));
afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(15, sy, gz));
afpvec4 v24 = image3d_ld4(top_tm_blob, ivec3(16, sy, gz));
afpvec4 v25 = image3d_ld4(top_tm_blob, ivec3(17, sy, gz));
afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(18, sy, gz));
afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(19, sy, gz));
afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(20, sy, gz));
afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(21, sy, gz));
afpvec4 v34 = image3d_ld4(top_tm_blob, ivec3(22, sy, gz));
afpvec4 v35 = image3d_ld4(top_tm_blob, ivec3(23, sy, gz));
afpvec4 v40 = image3d_ld4(top_tm_blob, ivec3(24, sy, gz));
afpvec4 v41 = image3d_ld4(top_tm_blob, ivec3(25, sy, gz));
afpvec4 v42 = image3d_ld4(top_tm_blob, ivec3(26, sy, gz));
afpvec4 v43 = image3d_ld4(top_tm_blob, ivec3(27, sy, gz));
afpvec4 v44 = image3d_ld4(top_tm_blob, ivec3(28, sy, gz));
afpvec4 v45 = image3d_ld4(top_tm_blob, ivec3(29, sy, gz));
afpvec4 v50 = image3d_ld4(top_tm_blob, ivec3(30, sy, gz));
afpvec4 v51 = image3d_ld4(top_tm_blob, ivec3(31, sy, gz));
afpvec4 v52 = image3d_ld4(top_tm_blob, ivec3(32, sy, gz));
afpvec4 v53 = image3d_ld4(top_tm_blob, ivec3(33, sy, gz));
afpvec4 v54 = image3d_ld4(top_tm_blob, ivec3(34, sy, gz));
afpvec4 v55 = image3d_ld4(top_tm_blob, ivec3(35, sy, gz));
afpvec4 v00 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 0));
afpvec4 v01 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 1));
afpvec4 v02 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 2));
afpvec4 v03 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 3));
afpvec4 v04 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 4));
afpvec4 v05 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 5));
afpvec4 v10 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 6));
afpvec4 v11 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 7));
afpvec4 v12 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 8));
afpvec4 v13 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 9));
afpvec4 v14 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 10));
afpvec4 v15 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 11));
afpvec4 v20 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 12));
afpvec4 v21 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 13));
afpvec4 v22 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 14));
afpvec4 v23 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 15));
afpvec4 v24 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 16));
afpvec4 v25 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 17));
afpvec4 v30 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 18));
afpvec4 v31 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 19));
afpvec4 v32 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 20));
afpvec4 v33 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 21));
afpvec4 v34 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 22));
afpvec4 v35 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 23));
afpvec4 v40 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 24));
afpvec4 v41 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 25));
afpvec4 v42 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 26));
afpvec4 v43 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 27));
afpvec4 v44 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 28));
afpvec4 v45 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 29));
afpvec4 v50 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 30));
afpvec4 v51 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 31));
afpvec4 v52 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 32));
afpvec4 v53 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 33));
afpvec4 v54 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 34));
afpvec4 v55 = image3d_ld4(top_tm_blob, ivec3(sx, gz, 35));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 36;
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;

afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0);
afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1);
afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2);
afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3);
afpvec4 v04 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4);
afpvec4 v05 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5);
afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6);
afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7);
afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8);
afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9);
afpvec4 v14 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10);
afpvec4 v15 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11);
afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12);
afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13);
afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14);
afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15);
afpvec4 v24 = buffer_ld4(top_tm_blob_data, v_tm_offset + 16);
afpvec4 v25 = buffer_ld4(top_tm_blob_data, v_tm_offset + 17);
afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 18);
afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 19);
afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 20);
afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 21);
afpvec4 v34 = buffer_ld4(top_tm_blob_data, v_tm_offset + 22);
afpvec4 v35 = buffer_ld4(top_tm_blob_data, v_tm_offset + 23);
afpvec4 v40 = buffer_ld4(top_tm_blob_data, v_tm_offset + 24);
afpvec4 v41 = buffer_ld4(top_tm_blob_data, v_tm_offset + 25);
afpvec4 v42 = buffer_ld4(top_tm_blob_data, v_tm_offset + 26);
afpvec4 v43 = buffer_ld4(top_tm_blob_data, v_tm_offset + 27);
afpvec4 v44 = buffer_ld4(top_tm_blob_data, v_tm_offset + 28);
afpvec4 v45 = buffer_ld4(top_tm_blob_data, v_tm_offset + 29);
afpvec4 v50 = buffer_ld4(top_tm_blob_data, v_tm_offset + 30);
afpvec4 v51 = buffer_ld4(top_tm_blob_data, v_tm_offset + 31);
afpvec4 v52 = buffer_ld4(top_tm_blob_data, v_tm_offset + 32);
afpvec4 v53 = buffer_ld4(top_tm_blob_data, v_tm_offset + 33);
afpvec4 v54 = buffer_ld4(top_tm_blob_data, v_tm_offset + 34);
afpvec4 v55 = buffer_ld4(top_tm_blob_data, v_tm_offset + 35);
afpvec4 v00 = buffer_ld4(top_tm_blob_data, v_tm_offset + 0 * psc(cstep));
afpvec4 v01 = buffer_ld4(top_tm_blob_data, v_tm_offset + 1 * psc(cstep));
afpvec4 v02 = buffer_ld4(top_tm_blob_data, v_tm_offset + 2 * psc(cstep));
afpvec4 v03 = buffer_ld4(top_tm_blob_data, v_tm_offset + 3 * psc(cstep));
afpvec4 v04 = buffer_ld4(top_tm_blob_data, v_tm_offset + 4 * psc(cstep));
afpvec4 v05 = buffer_ld4(top_tm_blob_data, v_tm_offset + 5 * psc(cstep));
afpvec4 v10 = buffer_ld4(top_tm_blob_data, v_tm_offset + 6 * psc(cstep));
afpvec4 v11 = buffer_ld4(top_tm_blob_data, v_tm_offset + 7 * psc(cstep));
afpvec4 v12 = buffer_ld4(top_tm_blob_data, v_tm_offset + 8 * psc(cstep));
afpvec4 v13 = buffer_ld4(top_tm_blob_data, v_tm_offset + 9 * psc(cstep));
afpvec4 v14 = buffer_ld4(top_tm_blob_data, v_tm_offset + 10 * psc(cstep));
afpvec4 v15 = buffer_ld4(top_tm_blob_data, v_tm_offset + 11 * psc(cstep));
afpvec4 v20 = buffer_ld4(top_tm_blob_data, v_tm_offset + 12 * psc(cstep));
afpvec4 v21 = buffer_ld4(top_tm_blob_data, v_tm_offset + 13 * psc(cstep));
afpvec4 v22 = buffer_ld4(top_tm_blob_data, v_tm_offset + 14 * psc(cstep));
afpvec4 v23 = buffer_ld4(top_tm_blob_data, v_tm_offset + 15 * psc(cstep));
afpvec4 v24 = buffer_ld4(top_tm_blob_data, v_tm_offset + 16 * psc(cstep));
afpvec4 v25 = buffer_ld4(top_tm_blob_data, v_tm_offset + 17 * psc(cstep));
afpvec4 v30 = buffer_ld4(top_tm_blob_data, v_tm_offset + 18 * psc(cstep));
afpvec4 v31 = buffer_ld4(top_tm_blob_data, v_tm_offset + 19 * psc(cstep));
afpvec4 v32 = buffer_ld4(top_tm_blob_data, v_tm_offset + 20 * psc(cstep));
afpvec4 v33 = buffer_ld4(top_tm_blob_data, v_tm_offset + 21 * psc(cstep));
afpvec4 v34 = buffer_ld4(top_tm_blob_data, v_tm_offset + 22 * psc(cstep));
afpvec4 v35 = buffer_ld4(top_tm_blob_data, v_tm_offset + 23 * psc(cstep));
afpvec4 v40 = buffer_ld4(top_tm_blob_data, v_tm_offset + 24 * psc(cstep));
afpvec4 v41 = buffer_ld4(top_tm_blob_data, v_tm_offset + 25 * psc(cstep));
afpvec4 v42 = buffer_ld4(top_tm_blob_data, v_tm_offset + 26 * psc(cstep));
afpvec4 v43 = buffer_ld4(top_tm_blob_data, v_tm_offset + 27 * psc(cstep));
afpvec4 v44 = buffer_ld4(top_tm_blob_data, v_tm_offset + 28 * psc(cstep));
afpvec4 v45 = buffer_ld4(top_tm_blob_data, v_tm_offset + 29 * psc(cstep));
afpvec4 v50 = buffer_ld4(top_tm_blob_data, v_tm_offset + 30 * psc(cstep));
afpvec4 v51 = buffer_ld4(top_tm_blob_data, v_tm_offset + 31 * psc(cstep));
afpvec4 v52 = buffer_ld4(top_tm_blob_data, v_tm_offset + 32 * psc(cstep));
afpvec4 v53 = buffer_ld4(top_tm_blob_data, v_tm_offset + 33 * psc(cstep));
afpvec4 v54 = buffer_ld4(top_tm_blob_data, v_tm_offset + 34 * psc(cstep));
afpvec4 v55 = buffer_ld4(top_tm_blob_data, v_tm_offset + 35 * psc(cstep));
#endif

// const float otm[4][6] = {


+ 238
- 0
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd_gemm.comp View File

@@ -0,0 +1,238 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2022 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Number of z (input-channel) steps staged into shared memory per tile in the
// local-memory path of main(); it is both the main loop stride there and the
// middle dimension of tmp_v / tmp_k below.
#define LOCAL_MEMORY_UNROLL_INCH 8

// Specialization constant compared against the z invocation index in main().
layout (constant_id = 0) const int batch = 1;

// Shape specialization constants; all default to 0. They are only ever read
// through psc(), which is defined outside this file.
// NOTE(review): presumably psc(x) falls back to the push constant p.x when the
// specialization constant is left at 0 - confirm against the shader preamble.
#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

// Resource bindings: 0 = transformed input (read), 1 = transformed output
// (write), 2 = transformed weights (read). The image variant uses 3D
// samplers/images, the buffer variant uses arrays of sfpvec4.
#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec4 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec4 top_tm_blob_data[]; };
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec4 weight_tm_data[]; };
#endif

// Push-constant block carrying the same five shape fields, under the same
// names, as the shape specialization constants above.
layout (push_constant) uniform parameter
{
int c;
int cstep;

int outw;
int outc;
int outcstep;
} p;

// Shared-memory staging tiles for the local-memory buffer path.
//   tmp_v[lx][z4][i] : input vector for local column lx, z step z4, x sub-position i (0..3)
//   tmp_k[ly][z4][j] : j-th vec4 of the 4x4 weight matrix for local row ly, z step z4
// NOTE(review): the leading dimension of 8 assumes gl_LocalInvocationID.x/.y
// never exceed 7; the workgroup size is not declared in this file - confirm
// against the host-side pipeline setup.
#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4];
#endif

// GEMM stage of the pack4 winograd convolution.
//
// Each invocation accumulates, over psc(c) z steps, the product of four input
// vec4s (x positions gx .. gx + 3) with one 4x4 weight matrix, and writes the
// four resulting vec4s to the output at row gy of slice gz.
//
// Three compile-time variants are selected by the preprocessor:
//   - NCNN_image_shader           : operands fetched through 3D images
//   - NCNN_shader_local_memory    : buffer operands staged through shared memory
//   - neither                     : buffer operands loaded directly per z step
void main()
{
// gx is pre-multiplied by 4 because one invocation covers four x positions.
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

// Without local memory, out-of-range invocations can leave immediately.
// With local memory the same check is deferred until after the accumulation
// loops (see below), so that out-of-range invocations still take part in the
// shared-memory loads and reach every barrier().
#if !NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
#endif

// One accumulator per x position handled by this invocation.
afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
afpvec4 sum2 = afpvec4(0.f);
afpvec4 sum3 = afpvec4(0.f);

#if NCNN_image_shader
// Image path: for every z, fetch the four input texels at (gx + i, z, gz)
// and the four weight texels at (z * 4 + j, gy, gz) that form the matrix.
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = image3d_ld4(bottom_tm_blob, ivec3(gx + 0, z, gz));
afpvec4 v1 = image3d_ld4(bottom_tm_blob, ivec3(gx + 1, z, gz));
afpvec4 v2 = image3d_ld4(bottom_tm_blob, ivec3(gx + 2, z, gz));
afpvec4 v3 = image3d_ld4(bottom_tm_blob, ivec3(gx + 3, z, gz));

afpmat4 k = afpmat4(
image3d_ld4(weight_tm_blob, ivec3(z * 4 + 0, gy, gz)),
image3d_ld4(weight_tm_blob, ivec3(z * 4 + 1, gy, gz)),
image3d_ld4(weight_tm_blob, ivec3(z * 4 + 2, gy, gz)),
image3d_ld4(weight_tm_blob, ivec3(z * 4 + 3, gy, gz))
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
// Buffer path. Input index: slice gz starts at gz * cstep and each z step
// advances by outw elements. Weight index: four vec4s per (gz, gy, z).
int v_offset = gz * psc(cstep) + gx;
int w_offset = (gz * psc(c) * psc(outc) + gy * psc(c)) * 4;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

// Main loop: consume z in full tiles of LOCAL_MEMORY_UNROLL_INCH steps.
// NOTE(review): because the bounds check is deferred, invocations whose
// gx / gy / gz are out of range also execute the loads below; whether those
// reads stay inside the bound buffers depends on host-side allocation and
// dispatch - TODO confirm.
int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
// Invocations with ly < 4 stage the input tile: for local column lx, each
// of them loads x sub-position ly for every z step of the tile.
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_tm_blob_data[v_offset + z4 * psc(outw) + ly]);
}
}

// Invocations with lx < 4 stage the weight tile: for local row ly, each of
// them loads the lx-th vec4 of the matrix for every z step of the tile.
if (lx < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_tm_data[w_offset + z4 * 4 + lx]);
}
}

// Wait until both tiles are fully written before any invocation reads them.
barrier();

// Accumulate from shared memory: inputs come from column lx, weights from row ly.
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

// Step both offsets past the tile that was just consumed.
v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(outw);
w_offset += LOCAL_MEMORY_UNROLL_INCH * 4;

// Keep the next iteration's stores from overwriting tiles that other
// invocations may still be reading.
barrier();
}

// Tail: fewer than LOCAL_MEMORY_UNROLL_INCH z steps are left. Same staging
// and accumulation as above, limited to `remain` steps. The condition uses
// only z and psc(c), neither of which depends on the invocation index.
if (z < psc(c))
{
const int remain = psc(c) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_tm_blob_data[v_offset + z4 * psc(outw) + ly]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_tm_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
}
#else
// Direct buffer path: load the four inputs and the four weight vec4s for
// each z step straight from the storage buffers.
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = buffer_ld4(bottom_tm_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(bottom_tm_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(bottom_tm_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(bottom_tm_blob_data, v_offset + 3);

afpmat4 k = afpmat4(
buffer_ld4(weight_tm_data, w_offset + 0),
buffer_ld4(weight_tm_data, w_offset + 1),
buffer_ld4(weight_tm_data, w_offset + 2),
buffer_ld4(weight_tm_data, w_offset + 3)
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(outw);
w_offset += 4;
}
#endif
#endif

// Deferred bounds check for the local-memory variant: out-of-range
// invocations have helped with staging and now leave without storing.
#if NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;
#endif

#if NCNN_image_shader
// Image path stores all four results without a per-position x check.
image3d_st4(top_tm_blob, ivec3(gx + 0, gy, gz), sum0);
image3d_st4(top_tm_blob, ivec3(gx + 1, gy, gz), sum1);
image3d_st4(top_tm_blob, ivec3(gx + 2, gy, gz), sum2);
image3d_st4(top_tm_blob, ivec3(gx + 3, gy, gz), sum3);
#else
// Buffer path: gx itself is already known to be < outw; positions gx + 1 ..
// gx + 3 are stored only while they remain inside the row.
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st4(top_tm_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outw)) buffer_st4(top_tm_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw)) buffer_st4(top_tm_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw)) buffer_st4(top_tm_blob_data, gi + 3, sum3);
#endif
}

+ 152
- 69
src/layer/vulkan/shader/convolution_pack4_gemm.comp View File

@@ -21,6 +21,8 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int bias_term = 0;
@@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -49,38 +47,35 @@ layout (binding = 3) uniform unfp sampler3D bias_blob;
#else
layout (binding = 0) readonly buffer col_blob { sfpvec4 col_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };
#endif

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

afpvec4 sum0;
afpvec4 sum1;
@@ -90,9 +85,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld4(bias_data, gz);
sum0 = buffer_ld4(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -111,63 +106,151 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, z, 0));

afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(z * 4 + 0, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 1, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 2, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 3, gy, 0))
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
#else
int v_offset = gx;
int w_offset = gy * psc(h) * 4;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH)
{
for (int kk = 0; kk < maxk; kk++)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z));

afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(kk * 4 + 0, z, gz)),
image3d_ld4(weight_blob, ivec3(kk * 4 + 1, z, gz)),
image3d_ld4(weight_blob, ivec3(kk * 4 + 2, z, gz)),
image3d_ld4(weight_blob, ivec3(kk * 4 + 3, z, gz))
);
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);
afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);
afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w);
w_offset += LOCAL_MEMORY_UNROLL_INCH * 4;

barrier();
}
#else
int w_offset = gz * psc(c) * maxk;

for (int z = 0; z < psc(c); z++)
if (z < psc(h))
{
int v_offset = gx + z * psc(cstep);
const int remain = psc(h) - z;

for (int kk = 0; kk < maxk; kk++)
if (ly < 4)
{
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
buffer_ld4(weight_data, w_offset * 4 + 0),
buffer_ld4(weight_data, w_offset * 4 + 1),
buffer_ld4(weight_data, w_offset * 4 + 2),
buffer_ld4(weight_data, w_offset * 4 + 3)
);
#else
afpmat4 k = afpmat4(weight_data[w_offset]);
#endif
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
}
#else
for (int z = 0; z < psc(h); z++)
{
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);

afpmat4 k = afpmat4(
buffer_ld4(weight_data, w_offset + 0),
buffer_ld4(weight_data, w_offset + 1),
buffer_ld4(weight_data, w_offset + 2),
buffer_ld4(weight_data, w_offset + 3)
);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;

v_offset += psc(w);
w_offset += 4;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

if (activation_type == 1)
@@ -222,16 +305,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3);
#endif
}

+ 18
- 24
src/layer/vulkan/shader/convolution_pack4_im2col.comp View File

@@ -29,17 +29,13 @@ layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -51,42 +47,40 @@ layout (binding = 1) writeonly buffer col_blob { sfpvec4 col_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

const int maxk = kernel_w * kernel_h;

if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c))
return;

int sy = gx / psc(outw);
int sx = gx % psc(outw);
const int sy = gx / psc(outw);
const int sx = gx % psc(outw);

const int sz = gy / maxk;
const int k = gy % maxk;

int ky = gy / kernel_w;
int kx = gy % kernel_w;
const int ky = k / kernel_w;
const int kx = k % kernel_w;

#if NCNN_image_shader
image3d_cp4(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz));
image3d_cp4(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz));
#else
const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;

const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx;
const int gi = gy * psc(outw) * psc(outh) + gx;

buffer_cp4(col_blob_data, gi, bottom_blob_data, v_offset);
#endif


+ 130
- 47
src/layer/vulkan/shader/convolution_pack4to1_gemm.comp View File

@@ -21,6 +21,8 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int bias_term = 0;
@@ -29,17 +31,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -55,27 +53,29 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

afp sum0;
afp sum1;
@@ -85,9 +85,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld1(bias_data, gz);
sum0 = buffer_ld1(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -106,48 +106,131 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, z, 0));

afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0));

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);
}
#else
int v_offset = gx;
int w_offset = gy * psc(h);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(h); z += LOCAL_MEMORY_UNROLL_INCH)
{
for (int kk = 0; kk < maxk; kk++)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

if (lx == 0)
{
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

afpvec4 k = image3d_ld4(weight_blob, ivec3(kk, z, gz));
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(w);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}
#else
int w_offset = gz * psc(c) * maxk;

for (int z = 0; z < psc(c); z++)
if (z < psc(h))
{
int v_offset = gx + z * psc(cstep);
const int remain = psc(h) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(col_blob_data[v_offset + z4 * psc(w) + ly]);
}
}

for (int kk = 0; kk < maxk; kk++)
if (lx == 0)
{
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

afpvec4 k = buffer_ld4(weight_data, w_offset);
for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
}
#else
for (int z = 0; z < psc(h); z++)
{
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);

afpvec4 k = buffer_ld4(weight_data, w_offset);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);

v_offset += psc(w);
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(w) || gy >= psc(outc))
return;
#endif

if (activation_type == 1)
@@ -202,16 +285,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st1(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3);
#endif
}

+ 124
- 140
src/layer/vulkan/shader/convolution_pack4to8_gemm.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -73,9 +65,8 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(w) || gy >= psc(outc))
return;

afpvec8 sum0;
@@ -86,9 +77,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld8(bias_data, gz);
sum0 = buffer_ld8(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -107,125 +98,118 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
for (int kk = 0; kk < maxk; kk++)
{
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, kk, z));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, kk, z));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, kk, z));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, kk, z));

afpvec4 k0 = image3d_ld4(weight_blob, ivec3(kk * 8 + 0, z, gz));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(kk * 8 + 1, z, gz));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(kk * 8 + 2, z, gz));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(kk * 8 + 3, z, gz));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(kk * 8 + 4, z, gz));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(kk * 8 + 5, z, gz));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(kk * 8 + 6, z, gz));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(kk * 8 + 7, z, gz));

// sum += v * k;
sum0[0].r += dot(v0, k0);
sum0[0].g += dot(v0, k1);
sum0[0].b += dot(v0, k2);
sum0[0].a += dot(v0, k3);
sum0[1].r += dot(v0, k4);
sum0[1].g += dot(v0, k5);
sum0[1].b += dot(v0, k6);
sum0[1].a += dot(v0, k7);

sum1[0].r += dot(v1, k0);
sum1[0].g += dot(v1, k1);
sum1[0].b += dot(v1, k2);
sum1[0].a += dot(v1, k3);
sum1[1].r += dot(v1, k4);
sum1[1].g += dot(v1, k5);
sum1[1].b += dot(v1, k6);
sum1[1].a += dot(v1, k7);

sum2[0].r += dot(v2, k0);
sum2[0].g += dot(v2, k1);
sum2[0].b += dot(v2, k2);
sum2[0].a += dot(v2, k3);
sum2[1].r += dot(v2, k4);
sum2[1].g += dot(v2, k5);
sum2[1].b += dot(v2, k6);
sum2[1].a += dot(v2, k7);

sum3[0].r += dot(v3, k0);
sum3[0].g += dot(v3, k1);
sum3[0].b += dot(v3, k2);
sum3[0].a += dot(v3, k3);
sum3[1].r += dot(v3, k4);
sum3[1].g += dot(v3, k5);
sum3[1].b += dot(v3, k6);
sum3[1].a += dot(v3, k7);
}
afpvec4 v0 = image3d_ld4(col_blob, ivec3(gx4.r, z, 0));
afpvec4 v1 = image3d_ld4(col_blob, ivec3(gx4.g, z, 0));
afpvec4 v2 = image3d_ld4(col_blob, ivec3(gx4.b, z, 0));
afpvec4 v3 = image3d_ld4(col_blob, ivec3(gx4.a, z, 0));

afpvec4 k0 = image3d_ld4(weight_blob, ivec3(z * 8 + 0, gy, 0));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(z * 8 + 1, gy, 0));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(z * 8 + 2, gy, 0));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(z * 8 + 3, gy, 0));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(z * 8 + 4, gy, 0));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(z * 8 + 5, gy, 0));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(z * 8 + 6, gy, 0));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(z * 8 + 7, gy, 0));

// sum += v * k;
sum0[0].r += dot(v0, k0);
sum0[0].g += dot(v0, k1);
sum0[0].b += dot(v0, k2);
sum0[0].a += dot(v0, k3);
sum0[1].r += dot(v0, k4);
sum0[1].g += dot(v0, k5);
sum0[1].b += dot(v0, k6);
sum0[1].a += dot(v0, k7);

sum1[0].r += dot(v1, k0);
sum1[0].g += dot(v1, k1);
sum1[0].b += dot(v1, k2);
sum1[0].a += dot(v1, k3);
sum1[1].r += dot(v1, k4);
sum1[1].g += dot(v1, k5);
sum1[1].b += dot(v1, k6);
sum1[1].a += dot(v1, k7);

sum2[0].r += dot(v2, k0);
sum2[0].g += dot(v2, k1);
sum2[0].b += dot(v2, k2);
sum2[0].a += dot(v2, k3);
sum2[1].r += dot(v2, k4);
sum2[1].g += dot(v2, k5);
sum2[1].b += dot(v2, k6);
sum2[1].a += dot(v2, k7);

sum3[0].r += dot(v3, k0);
sum3[0].g += dot(v3, k1);
sum3[0].b += dot(v3, k2);
sum3[0].a += dot(v3, k3);
sum3[1].r += dot(v3, k4);
sum3[1].g += dot(v3, k5);
sum3[1].b += dot(v3, k6);
sum3[1].a += dot(v3, k7);
}
#else
int w_offset = gz * psc(c) * maxk;
int v_offset = gx;
int w_offset = gy * psc(h) * 8;

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
int v_offset = gx + z * psc(cstep);

for (int kk = 0; kk < maxk; kk++)
{
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);

afpvec4 k0 = buffer_ld4(weight_data, w_offset * 8 + 0);
afpvec4 k1 = buffer_ld4(weight_data, w_offset * 8 + 1);
afpvec4 k2 = buffer_ld4(weight_data, w_offset * 8 + 2);
afpvec4 k3 = buffer_ld4(weight_data, w_offset * 8 + 3);
afpvec4 k4 = buffer_ld4(weight_data, w_offset * 8 + 4);
afpvec4 k5 = buffer_ld4(weight_data, w_offset * 8 + 5);
afpvec4 k6 = buffer_ld4(weight_data, w_offset * 8 + 6);
afpvec4 k7 = buffer_ld4(weight_data, w_offset * 8 + 7);

// sum += v * k;
sum0[0].r += dot(v0, k0);
sum0[0].g += dot(v0, k1);
sum0[0].b += dot(v0, k2);
sum0[0].a += dot(v0, k3);
sum0[1].r += dot(v0, k4);
sum0[1].g += dot(v0, k5);
sum0[1].b += dot(v0, k6);
sum0[1].a += dot(v0, k7);

sum1[0].r += dot(v1, k0);
sum1[0].g += dot(v1, k1);
sum1[0].b += dot(v1, k2);
sum1[0].a += dot(v1, k3);
sum1[1].r += dot(v1, k4);
sum1[1].g += dot(v1, k5);
sum1[1].b += dot(v1, k6);
sum1[1].a += dot(v1, k7);

sum2[0].r += dot(v2, k0);
sum2[0].g += dot(v2, k1);
sum2[0].b += dot(v2, k2);
sum2[0].a += dot(v2, k3);
sum2[1].r += dot(v2, k4);
sum2[1].g += dot(v2, k5);
sum2[1].b += dot(v2, k6);
sum2[1].a += dot(v2, k7);

sum3[0].r += dot(v3, k0);
sum3[0].g += dot(v3, k1);
sum3[0].b += dot(v3, k2);
sum3[0].a += dot(v3, k3);
sum3[1].r += dot(v3, k4);
sum3[1].g += dot(v3, k5);
sum3[1].b += dot(v3, k6);
sum3[1].a += dot(v3, k7);

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
afpvec4 v0 = buffer_ld4(col_blob_data, v_offset + 0);
afpvec4 v1 = buffer_ld4(col_blob_data, v_offset + 1);
afpvec4 v2 = buffer_ld4(col_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(col_blob_data, v_offset + 3);

afpvec4 k0 = buffer_ld4(weight_data, w_offset + 0);
afpvec4 k1 = buffer_ld4(weight_data, w_offset + 1);
afpvec4 k2 = buffer_ld4(weight_data, w_offset + 2);
afpvec4 k3 = buffer_ld4(weight_data, w_offset + 3);
afpvec4 k4 = buffer_ld4(weight_data, w_offset + 4);
afpvec4 k5 = buffer_ld4(weight_data, w_offset + 5);
afpvec4 k6 = buffer_ld4(weight_data, w_offset + 6);
afpvec4 k7 = buffer_ld4(weight_data, w_offset + 7);

// sum += v * k;
sum0[0].r += dot(v0, k0);
sum0[0].g += dot(v0, k1);
sum0[0].b += dot(v0, k2);
sum0[0].a += dot(v0, k3);
sum0[1].r += dot(v0, k4);
sum0[1].g += dot(v0, k5);
sum0[1].b += dot(v0, k6);
sum0[1].a += dot(v0, k7);

sum1[0].r += dot(v1, k0);
sum1[0].g += dot(v1, k1);
sum1[0].b += dot(v1, k2);
sum1[0].a += dot(v1, k3);
sum1[1].r += dot(v1, k4);
sum1[1].g += dot(v1, k5);
sum1[1].b += dot(v1, k6);
sum1[1].a += dot(v1, k7);

sum2[0].r += dot(v2, k0);
sum2[0].g += dot(v2, k1);
sum2[0].b += dot(v2, k2);
sum2[0].a += dot(v2, k3);
sum2[1].r += dot(v2, k4);
sum2[1].g += dot(v2, k5);
sum2[1].b += dot(v2, k6);
sum2[1].a += dot(v2, k7);

sum3[0].r += dot(v3, k0);
sum3[0].g += dot(v3, k1);
sum3[0].b += dot(v3, k2);
sum3[0].a += dot(v3, k3);
sum3[1].r += dot(v3, k4);
sum3[1].g += dot(v3, k5);
sum3[1].b += dot(v3, k6);
sum3[1].a += dot(v3, k7);

v_offset += psc(w);
w_offset += 8;
}
#endif

@@ -305,16 +289,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3);
#endif
}

+ 0
- 198
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_gemm.comp View File

@@ -1,198 +0,0 @@
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#version 450

#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#endif
#if NCNN_fp16_arithmetic
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

// Specialization constants describing blob shapes; ids are assigned consecutively
// starting at shape_constant_id_offset. Every constant defaults to 0.
// NOTE(review): presumably a value of 0 means "not specialized" and the psc() macro
// (defined outside this file) then falls back to the push-constant field of the same
// name - confirm against the shader preamble.
#define shape_constant_id_offset 0
// c / cstep are used in main() as the accumulation loop count and the per-iteration
// stride of the input buffer.
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

// outh / outc bound the gy / gz invocation ids; outcstep is the per-gz stride of the
// output buffer.
layout (constant_id = shape_constant_id_offset + 2) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_tm_blob;
layout (binding = 1, imfmtc4) writeonly uniform unfp image3D top_tm_blob;
layout (binding = 2) uniform unfp sampler3D weight_tm_blob;
#else
layout (binding = 0) readonly buffer bottom_tm_blob { sfpvec8 bottom_tm_blob_data[]; };
layout (binding = 1) writeonly buffer top_tm_blob { sfpvec8 top_tm_blob_data[]; };
layout (binding = 2) readonly buffer weight_tm_blob { sfpvec8 weight_tm_data[]; };
#endif

// Push-constant block, accessed as `p`. Its fields carry the same names, in the same
// order, as the specialization constants declared above.
// NOTE(review): presumably these supply the shape values at dispatch time when the
// matching specialization constant was left at 0 - confirm via the psc() definition.
layout (push_constant) uniform parameter
{
// input side: loop count and per-iteration buffer stride used by main()
int c;
int cstep;

// output side: row bound, gz bound and per-gz buffer stride used by main()
int outh;
int outc;
int outcstep;
} p;

// Shader entry point. Each invocation handles one column gx, one output slice gz and
// four consecutive rows gy..gy+3. For every row it accumulates, over psc(c) iterations,
// the 8-element dot product of the input vector with eight weight vectors, producing
// one 8-wide sum per row, and finally writes the four sums to top_tm_blob.
// NOTE(review): psc(), afpvec8/afpvec4 and the *_ld8 / *_st8 helpers are defined outside
// this file view; their exact semantics should be confirmed against the shader preamble.
void main()
{
int gx = int(gl_GlobalInvocationID.x);
// one invocation covers 4 rows, so the y id is scaled by 4
int gy = int(gl_GlobalInvocationID.y) * 4;
int gz = int(gl_GlobalInvocationID.z);

// Discard invocations outside the work area. Only the first of the four rows is
// tested here; rows gy+1..gy+3 are handled at store time (buffer path only).
// NOTE(review): 16 is presumably the number of 4x4 winograd23 transform positions - confirm.
if (gx >= 16 || gy >= psc(outh) || gz >= psc(outc))
return;

// one zero-initialised 8-wide accumulator per row
afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f));

#if NCNN_image_shader
// eight consecutive weight texels along x belong to this gx
int wx = gx * 8;

for (int z = 0; z < psc(c); z++)
{
// input vectors for the four rows at (gx, gy+k, z)
afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z));
afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z));
afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z));
afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z));

// eight weight vectors at (wx+i, z, gz); k_i feeds output lane i
afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz));

// sum += v * k
// sumN[0].rgba are output lanes 0-3 and sumN[1].rgba lanes 4-7; each lane adds the
// dot product of both 4-wide halves of vN with the matching halves of k_lane.
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else
// Buffer path: rows are 16 elements apart in the input, so row gy starts at gy * 16.
int v_offset = gy * 16 + gx;
// Weights: 8 vectors per (gz, z, gx) triple, laid out with gx fastest, then z, then gz.
int w_offset = (gz * psc(c) * 16 + gx) * 8;

for (int z = 0; z < psc(c); z++)
{
// Input vectors for rows gy..gy+3 (stride 16). These loads are not bounds-checked
// against psc(outh); only the stores at the end of main() are.
afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 16);
afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 32);
afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 48);

// eight consecutive weight vectors; k_i feeds output lane i
afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1);
afpvec8 k2 = buffer_ld8(weight_tm_data, w_offset + 2);
afpvec8 k3 = buffer_ld8(weight_tm_data, w_offset + 3);
afpvec8 k4 = buffer_ld8(weight_tm_data, w_offset + 4);
afpvec8 k5 = buffer_ld8(weight_tm_data, w_offset + 5);
afpvec8 k6 = buffer_ld8(weight_tm_data, w_offset + 6);
afpvec8 k7 = buffer_ld8(weight_tm_data, w_offset + 7);

// sum += v * k
// same lane mapping as the image path: sumN[0].rgba = lanes 0-3, sumN[1].rgba = lanes 4-7
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);

// advance to the next z: input moves by one cstep, weights by 16 positions * 8 vectors
v_offset += psc(cstep);
w_offset += 16 * 8;
}
#endif

#if NCNN_image_shader
// The image path stores all four rows without checking gy+k against psc(outh).
// NOTE(review): presumably relies on out-of-range image stores being harmless - confirm.
image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
#else
// output index: gz selects the slice (stride outcstep), rows are 16 elements apart
int gi = gz * psc(outcstep) + gy * 16 + gx;

// Row gy is known to be in range from the early-return test; the remaining rows are
// written only if they fall below psc(outh).
buffer_st8(top_tm_blob_data, gi + 0, sum0);
if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 16, sum1);
if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 32, sum2);
if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 48, sum3);
#endif
}

+ 37
- 37
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_input.comp View File

@@ -60,7 +60,7 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 4x4
@@ -162,42 +162,42 @@ void main()

// store 16
#if NCNN_image_shader
int y = gy * p.block_x + gx;
image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v10);
image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v11);
image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v12);
image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v13);
image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v20);
image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v21);
image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v22);
image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v23);
image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v30);
image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v31);
image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v32);
image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v33);
int x = gy * psc(block_x) + gx;
image3d_st8(bottom_tm_blob, ivec3(x, gz, 0), v00);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 1), v01);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 2), v02);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 3), v03);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 4), v10);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 5), v11);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 6), v12);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 7), v13);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 8), v20);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 9), v21);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 10), v22);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 11), v23);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 12), v30);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 13), v31);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 14), v32);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 15), v33);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 16;
buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v10);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v11);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v12);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v13);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v20);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v21);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v22);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v23);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v30);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v31);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v32);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v33);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
buffer_st8(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v10);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v11);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v12);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v13);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v20);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v21);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v22);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v23);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v30);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v31);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v32);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v33);
#endif
}

+ 37
- 37
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp View File

@@ -67,48 +67,48 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.block_x || gy >= p.block_y || gz >= psc(c))
if (gx >= psc(block_x) || gy >= psc(block_y) || gz >= psc(c))
return;

// load 16
#if NCNN_image_shader
int sy = gy * p.block_x + gx;
afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz));
afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz));
afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz));
afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz));
afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz));
afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz));
afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz));
afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz));
afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz));
afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz));
afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz));
afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz));
afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz));
afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz));
afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz));
afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz));
int sx = gy * psc(block_x) + gx;
afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 0));
afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 1));
afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 2));
afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 3));
afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 4));
afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 5));
afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 6));
afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 7));
afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 8));
afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 9));
afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 10));
afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 11));
afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 12));
afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 13));
afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 14));
afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 15));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 16;
afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0);
afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1);
afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2);
afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3);
afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4);
afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5);
afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6);
afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7);
afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8);
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9);
afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10);
afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11);
afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12);
afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13);
afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14);
afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0 * psc(cstep));
afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1 * psc(cstep));
afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2 * psc(cstep));
afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3 * psc(cstep));
afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4 * psc(cstep));
afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5 * psc(cstep));
afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6 * psc(cstep));
afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7 * psc(cstep));
afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8 * psc(cstep));
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9 * psc(cstep));
afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10 * psc(cstep));
afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11 * psc(cstep));
afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12 * psc(cstep));
afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13 * psc(cstep));
afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14 * psc(cstep));
afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15 * psc(cstep));
#endif

// const float itm[2][4] = {


+ 76
- 76
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_input.comp View File

@@ -260,82 +260,82 @@ void main()

// store 36
#if NCNN_image_shader
int y = gy * p.block_x + gx;
image3d_st8(bottom_tm_blob, ivec3(0, y, gz), v00);
image3d_st8(bottom_tm_blob, ivec3(1, y, gz), v01);
image3d_st8(bottom_tm_blob, ivec3(2, y, gz), v02);
image3d_st8(bottom_tm_blob, ivec3(3, y, gz), v03);
image3d_st8(bottom_tm_blob, ivec3(4, y, gz), v04);
image3d_st8(bottom_tm_blob, ivec3(5, y, gz), v05);
image3d_st8(bottom_tm_blob, ivec3(6, y, gz), v10);
image3d_st8(bottom_tm_blob, ivec3(7, y, gz), v11);
image3d_st8(bottom_tm_blob, ivec3(8, y, gz), v12);
image3d_st8(bottom_tm_blob, ivec3(9, y, gz), v13);
image3d_st8(bottom_tm_blob, ivec3(10, y, gz), v14);
image3d_st8(bottom_tm_blob, ivec3(11, y, gz), v15);
image3d_st8(bottom_tm_blob, ivec3(12, y, gz), v20);
image3d_st8(bottom_tm_blob, ivec3(13, y, gz), v21);
image3d_st8(bottom_tm_blob, ivec3(14, y, gz), v22);
image3d_st8(bottom_tm_blob, ivec3(15, y, gz), v23);
image3d_st8(bottom_tm_blob, ivec3(16, y, gz), v24);
image3d_st8(bottom_tm_blob, ivec3(17, y, gz), v25);
image3d_st8(bottom_tm_blob, ivec3(18, y, gz), v30);
image3d_st8(bottom_tm_blob, ivec3(19, y, gz), v31);
image3d_st8(bottom_tm_blob, ivec3(20, y, gz), v32);
image3d_st8(bottom_tm_blob, ivec3(21, y, gz), v33);
image3d_st8(bottom_tm_blob, ivec3(22, y, gz), v34);
image3d_st8(bottom_tm_blob, ivec3(23, y, gz), v35);
image3d_st8(bottom_tm_blob, ivec3(24, y, gz), v40);
image3d_st8(bottom_tm_blob, ivec3(25, y, gz), v41);
image3d_st8(bottom_tm_blob, ivec3(26, y, gz), v42);
image3d_st8(bottom_tm_blob, ivec3(27, y, gz), v43);
image3d_st8(bottom_tm_blob, ivec3(28, y, gz), v44);
image3d_st8(bottom_tm_blob, ivec3(29, y, gz), v45);
image3d_st8(bottom_tm_blob, ivec3(30, y, gz), v50);
image3d_st8(bottom_tm_blob, ivec3(31, y, gz), v51);
image3d_st8(bottom_tm_blob, ivec3(32, y, gz), v52);
image3d_st8(bottom_tm_blob, ivec3(33, y, gz), v53);
image3d_st8(bottom_tm_blob, ivec3(34, y, gz), v54);
image3d_st8(bottom_tm_blob, ivec3(35, y, gz), v55);
int x = gy * psc(block_x) + gx;
image3d_st8(bottom_tm_blob, ivec3(x, gz, 0), v00);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 1), v01);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 2), v02);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 3), v03);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 4), v04);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 5), v05);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 6), v10);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 7), v11);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 8), v12);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 9), v13);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 10), v14);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 11), v15);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 12), v20);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 13), v21);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 14), v22);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 15), v23);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 16), v24);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 17), v25);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 18), v30);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 19), v31);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 20), v32);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 21), v33);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 22), v34);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 23), v35);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 24), v40);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 25), v41);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 26), v42);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 27), v43);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 28), v44);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 29), v45);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 30), v50);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 31), v51);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 32), v52);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 33), v53);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 34), v54);
image3d_st8(bottom_tm_blob, ivec3(x, gz, 35), v55);
#else
int v_tm_offset = gz * psc(outcstep) + (gy * p.block_x + gx) * 36;
buffer_st8(bottom_tm_blob_data, v_tm_offset + 0, v00);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 1, v01);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 2, v02);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 3, v03);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 4, v04);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 5, v05);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 6, v10);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 7, v11);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 8, v12);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 9, v13);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 10, v14);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 11, v15);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 12, v20);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 13, v21);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 14, v22);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 15, v23);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 16, v24);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 17, v25);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 18, v30);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 19, v31);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 20, v32);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 21, v33);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 22, v34);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 23, v35);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 24, v40);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 25, v41);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 26, v42);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 27, v43);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 28, v44);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 29, v45);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 30, v50);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 31, v51);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 32, v52);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 33, v53);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 34, v54);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 35, v55);
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;
buffer_st8(bottom_tm_blob_data, v_tm_offset + 0 * psc(outcstep), v00);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 1 * psc(outcstep), v01);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 2 * psc(outcstep), v02);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 3 * psc(outcstep), v03);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 4 * psc(outcstep), v04);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 5 * psc(outcstep), v05);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 6 * psc(outcstep), v10);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 7 * psc(outcstep), v11);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 8 * psc(outcstep), v12);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 9 * psc(outcstep), v13);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 10 * psc(outcstep), v14);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 11 * psc(outcstep), v15);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 12 * psc(outcstep), v20);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 13 * psc(outcstep), v21);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 14 * psc(outcstep), v22);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 15 * psc(outcstep), v23);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 16 * psc(outcstep), v24);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 17 * psc(outcstep), v25);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 18 * psc(outcstep), v30);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 19 * psc(outcstep), v31);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 20 * psc(outcstep), v32);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 21 * psc(outcstep), v33);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 22 * psc(outcstep), v34);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 23 * psc(outcstep), v35);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 24 * psc(outcstep), v40);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 25 * psc(outcstep), v41);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 26 * psc(outcstep), v42);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 27 * psc(outcstep), v43);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 28 * psc(outcstep), v44);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 29 * psc(outcstep), v45);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 30 * psc(outcstep), v50);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 31 * psc(outcstep), v51);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 32 * psc(outcstep), v52);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 33 * psc(outcstep), v53);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 34 * psc(outcstep), v54);
buffer_st8(bottom_tm_blob_data, v_tm_offset + 35 * psc(outcstep), v55);
#endif
}

+ 74
- 74
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp View File

@@ -72,83 +72,83 @@ void main()

// load 36
#if NCNN_image_shader
int sy = gy * p.block_x + gx;
int sx = gy * psc(block_x) + gx;

afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(0, sy, gz));
afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(1, sy, gz));
afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(2, sy, gz));
afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(3, sy, gz));
afpvec8 v04 = image3d_ld8(top_tm_blob, ivec3(4, sy, gz));
afpvec8 v05 = image3d_ld8(top_tm_blob, ivec3(5, sy, gz));
afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(6, sy, gz));
afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(7, sy, gz));
afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(8, sy, gz));
afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(9, sy, gz));
afpvec8 v14 = image3d_ld8(top_tm_blob, ivec3(10, sy, gz));
afpvec8 v15 = image3d_ld8(top_tm_blob, ivec3(11, sy, gz));
afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(12, sy, gz));
afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(13, sy, gz));
afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(14, sy, gz));
afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(15, sy, gz));
afpvec8 v24 = image3d_ld8(top_tm_blob, ivec3(16, sy, gz));
afpvec8 v25 = image3d_ld8(top_tm_blob, ivec3(17, sy, gz));
afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(18, sy, gz));
afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(19, sy, gz));
afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(20, sy, gz));
afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(21, sy, gz));
afpvec8 v34 = image3d_ld8(top_tm_blob, ivec3(22, sy, gz));
afpvec8 v35 = image3d_ld8(top_tm_blob, ivec3(23, sy, gz));
afpvec8 v40 = image3d_ld8(top_tm_blob, ivec3(24, sy, gz));
afpvec8 v41 = image3d_ld8(top_tm_blob, ivec3(25, sy, gz));
afpvec8 v42 = image3d_ld8(top_tm_blob, ivec3(26, sy, gz));
afpvec8 v43 = image3d_ld8(top_tm_blob, ivec3(27, sy, gz));
afpvec8 v44 = image3d_ld8(top_tm_blob, ivec3(28, sy, gz));
afpvec8 v45 = image3d_ld8(top_tm_blob, ivec3(29, sy, gz));
afpvec8 v50 = image3d_ld8(top_tm_blob, ivec3(30, sy, gz));
afpvec8 v51 = image3d_ld8(top_tm_blob, ivec3(31, sy, gz));
afpvec8 v52 = image3d_ld8(top_tm_blob, ivec3(32, sy, gz));
afpvec8 v53 = image3d_ld8(top_tm_blob, ivec3(33, sy, gz));
afpvec8 v54 = image3d_ld8(top_tm_blob, ivec3(34, sy, gz));
afpvec8 v55 = image3d_ld8(top_tm_blob, ivec3(35, sy, gz));
afpvec8 v00 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 0));
afpvec8 v01 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 1));
afpvec8 v02 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 2));
afpvec8 v03 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 3));
afpvec8 v04 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 4));
afpvec8 v05 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 5));
afpvec8 v10 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 6));
afpvec8 v11 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 7));
afpvec8 v12 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 8));
afpvec8 v13 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 9));
afpvec8 v14 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 10));
afpvec8 v15 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 11));
afpvec8 v20 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 12));
afpvec8 v21 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 13));
afpvec8 v22 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 14));
afpvec8 v23 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 15));
afpvec8 v24 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 16));
afpvec8 v25 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 17));
afpvec8 v30 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 18));
afpvec8 v31 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 19));
afpvec8 v32 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 20));
afpvec8 v33 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 21));
afpvec8 v34 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 22));
afpvec8 v35 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 23));
afpvec8 v40 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 24));
afpvec8 v41 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 25));
afpvec8 v42 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 26));
afpvec8 v43 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 27));
afpvec8 v44 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 28));
afpvec8 v45 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 29));
afpvec8 v50 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 30));
afpvec8 v51 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 31));
afpvec8 v52 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 32));
afpvec8 v53 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 33));
afpvec8 v54 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 34));
afpvec8 v55 = image3d_ld8(top_tm_blob, ivec3(sx, gz, 35));
#else
int v_tm_offset = gz * psc(cstep) + (gy * p.block_x + gx) * 36;
int v_tm_offset = gz * psc(block_x) * psc(block_y) + gy * psc(block_x) + gx;

afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0);
afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1);
afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2);
afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3);
afpvec8 v04 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4);
afpvec8 v05 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5);
afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6);
afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7);
afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8);
afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9);
afpvec8 v14 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10);
afpvec8 v15 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11);
afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12);
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13);
afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14);
afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15);
afpvec8 v24 = buffer_ld8(top_tm_blob_data, v_tm_offset + 16);
afpvec8 v25 = buffer_ld8(top_tm_blob_data, v_tm_offset + 17);
afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 18);
afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 19);
afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 20);
afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 21);
afpvec8 v34 = buffer_ld8(top_tm_blob_data, v_tm_offset + 22);
afpvec8 v35 = buffer_ld8(top_tm_blob_data, v_tm_offset + 23);
afpvec8 v40 = buffer_ld8(top_tm_blob_data, v_tm_offset + 24);
afpvec8 v41 = buffer_ld8(top_tm_blob_data, v_tm_offset + 25);
afpvec8 v42 = buffer_ld8(top_tm_blob_data, v_tm_offset + 26);
afpvec8 v43 = buffer_ld8(top_tm_blob_data, v_tm_offset + 27);
afpvec8 v44 = buffer_ld8(top_tm_blob_data, v_tm_offset + 28);
afpvec8 v45 = buffer_ld8(top_tm_blob_data, v_tm_offset + 29);
afpvec8 v50 = buffer_ld8(top_tm_blob_data, v_tm_offset + 30);
afpvec8 v51 = buffer_ld8(top_tm_blob_data, v_tm_offset + 31);
afpvec8 v52 = buffer_ld8(top_tm_blob_data, v_tm_offset + 32);
afpvec8 v53 = buffer_ld8(top_tm_blob_data, v_tm_offset + 33);
afpvec8 v54 = buffer_ld8(top_tm_blob_data, v_tm_offset + 34);
afpvec8 v55 = buffer_ld8(top_tm_blob_data, v_tm_offset + 35);
afpvec8 v00 = buffer_ld8(top_tm_blob_data, v_tm_offset + 0 * psc(cstep));
afpvec8 v01 = buffer_ld8(top_tm_blob_data, v_tm_offset + 1 * psc(cstep));
afpvec8 v02 = buffer_ld8(top_tm_blob_data, v_tm_offset + 2 * psc(cstep));
afpvec8 v03 = buffer_ld8(top_tm_blob_data, v_tm_offset + 3 * psc(cstep));
afpvec8 v04 = buffer_ld8(top_tm_blob_data, v_tm_offset + 4 * psc(cstep));
afpvec8 v05 = buffer_ld8(top_tm_blob_data, v_tm_offset + 5 * psc(cstep));
afpvec8 v10 = buffer_ld8(top_tm_blob_data, v_tm_offset + 6 * psc(cstep));
afpvec8 v11 = buffer_ld8(top_tm_blob_data, v_tm_offset + 7 * psc(cstep));
afpvec8 v12 = buffer_ld8(top_tm_blob_data, v_tm_offset + 8 * psc(cstep));
afpvec8 v13 = buffer_ld8(top_tm_blob_data, v_tm_offset + 9 * psc(cstep));
afpvec8 v14 = buffer_ld8(top_tm_blob_data, v_tm_offset + 10 * psc(cstep));
afpvec8 v15 = buffer_ld8(top_tm_blob_data, v_tm_offset + 11 * psc(cstep));
afpvec8 v20 = buffer_ld8(top_tm_blob_data, v_tm_offset + 12 * psc(cstep));
afpvec8 v21 = buffer_ld8(top_tm_blob_data, v_tm_offset + 13 * psc(cstep));
afpvec8 v22 = buffer_ld8(top_tm_blob_data, v_tm_offset + 14 * psc(cstep));
afpvec8 v23 = buffer_ld8(top_tm_blob_data, v_tm_offset + 15 * psc(cstep));
afpvec8 v24 = buffer_ld8(top_tm_blob_data, v_tm_offset + 16 * psc(cstep));
afpvec8 v25 = buffer_ld8(top_tm_blob_data, v_tm_offset + 17 * psc(cstep));
afpvec8 v30 = buffer_ld8(top_tm_blob_data, v_tm_offset + 18 * psc(cstep));
afpvec8 v31 = buffer_ld8(top_tm_blob_data, v_tm_offset + 19 * psc(cstep));
afpvec8 v32 = buffer_ld8(top_tm_blob_data, v_tm_offset + 20 * psc(cstep));
afpvec8 v33 = buffer_ld8(top_tm_blob_data, v_tm_offset + 21 * psc(cstep));
afpvec8 v34 = buffer_ld8(top_tm_blob_data, v_tm_offset + 22 * psc(cstep));
afpvec8 v35 = buffer_ld8(top_tm_blob_data, v_tm_offset + 23 * psc(cstep));
afpvec8 v40 = buffer_ld8(top_tm_blob_data, v_tm_offset + 24 * psc(cstep));
afpvec8 v41 = buffer_ld8(top_tm_blob_data, v_tm_offset + 25 * psc(cstep));
afpvec8 v42 = buffer_ld8(top_tm_blob_data, v_tm_offset + 26 * psc(cstep));
afpvec8 v43 = buffer_ld8(top_tm_blob_data, v_tm_offset + 27 * psc(cstep));
afpvec8 v44 = buffer_ld8(top_tm_blob_data, v_tm_offset + 28 * psc(cstep));
afpvec8 v45 = buffer_ld8(top_tm_blob_data, v_tm_offset + 29 * psc(cstep));
afpvec8 v50 = buffer_ld8(top_tm_blob_data, v_tm_offset + 30 * psc(cstep));
afpvec8 v51 = buffer_ld8(top_tm_blob_data, v_tm_offset + 31 * psc(cstep));
afpvec8 v52 = buffer_ld8(top_tm_blob_data, v_tm_offset + 32 * psc(cstep));
afpvec8 v53 = buffer_ld8(top_tm_blob_data, v_tm_offset + 33 * psc(cstep));
afpvec8 v54 = buffer_ld8(top_tm_blob_data, v_tm_offset + 34 * psc(cstep));
afpvec8 v55 = buffer_ld8(top_tm_blob_data, v_tm_offset + 35 * psc(cstep));
#endif

// const float otm[4][6] = {


src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_gemm.comp → src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd_gemm.comp View File

@@ -22,11 +22,13 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#define shape_constant_id_offset 0
layout (constant_id = 0) const int batch = 1;

#define shape_constant_id_offset 1
layout (constant_id = shape_constant_id_offset + 0) const int c = 0;
layout (constant_id = shape_constant_id_offset + 1) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outcstep = 0;

@@ -45,18 +47,18 @@ layout (push_constant) uniform parameter
int c;
int cstep;

int outh;
int outw;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y) * 4;
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= 36 || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outc) || gz >= batch)
return;

afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f));
@@ -65,23 +67,21 @@ void main()
afpvec8 sum3 = afpvec8(afpvec4(0.f), afpvec4(0.f));

#if NCNN_image_shader
int wx = gx * 8;

for (int z = 0; z < psc(c); z++)
{
afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 0, z));
afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 1, z));
afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 2, z));
afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx, gy + 3, z));
afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(wx + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(wx + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(wx + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(wx + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(wx + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(wx + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(wx + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(wx + 7, z, gz));
afpvec8 v0 = image3d_ld8(bottom_tm_blob, ivec3(gx + 0, z, gz));
afpvec8 v1 = image3d_ld8(bottom_tm_blob, ivec3(gx + 1, z, gz));
afpvec8 v2 = image3d_ld8(bottom_tm_blob, ivec3(gx + 2, z, gz));
afpvec8 v3 = image3d_ld8(bottom_tm_blob, ivec3(gx + 3, z, gz));
afpvec8 k0 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 0, gy, gz));
afpvec8 k1 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 1, gy, gz));
afpvec8 k2 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 2, gy, gz));
afpvec8 k3 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 3, gy, gz));
afpvec8 k4 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 4, gy, gz));
afpvec8 k5 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 5, gy, gz));
afpvec8 k6 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 6, gy, gz));
afpvec8 k7 = image3d_ld8(weight_tm_blob, ivec3(z * 8 + 7, gy, gz));

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -121,15 +121,15 @@ void main()
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else
int v_offset = gy * 36 + gx;
int w_offset = (gz * psc(c) * 36 + gx) * 8;
int v_offset = gz * psc(cstep) + gx;
int w_offset = (gz * psc(c) * psc(outc) + gy * psc(c)) * 8;

for (int z = 0; z < psc(c); z++)
{
afpvec8 v0 = buffer_ld8(bottom_tm_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 36);
afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 72);
afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 108);
afpvec8 v1 = buffer_ld8(bottom_tm_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(bottom_tm_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(bottom_tm_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_tm_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_tm_data, w_offset + 1);
@@ -177,22 +177,22 @@ void main()
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);

v_offset += psc(cstep);
w_offset += 36 * 8;
v_offset += psc(outw);
w_offset += 8;
}
#endif

#if NCNN_image_shader
image3d_st8(top_tm_blob, ivec3(gx, gy + 0, gz), sum0);
image3d_st8(top_tm_blob, ivec3(gx, gy + 1, gz), sum1);
image3d_st8(top_tm_blob, ivec3(gx, gy + 2, gz), sum2);
image3d_st8(top_tm_blob, ivec3(gx, gy + 3, gz), sum3);
image3d_st8(top_tm_blob, ivec3(gx + 0, gy, gz), sum0);
image3d_st8(top_tm_blob, ivec3(gx + 1, gy, gz), sum1);
image3d_st8(top_tm_blob, ivec3(gx + 2, gy, gz), sum2);
image3d_st8(top_tm_blob, ivec3(gx + 3, gy, gz), sum3);
#else
int gi = gz * psc(outcstep) + gy * 36 + gx;
int gi = gz * psc(outcstep) + gy * psc(outw) + gx;

buffer_st8(top_tm_blob_data, gi + 0, sum0);
if (gy + 1 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 36, sum1);
if (gy + 2 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 72, sum2);
if (gy + 3 < psc(outh)) buffer_st8(top_tm_blob_data, gi + 108, sum3);
if (gx + 1 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw)) buffer_st8(top_tm_blob_data, gi + 3, sum3);
#endif
}

+ 124
- 140
src/layer/vulkan/shader/convolution_pack8_gemm.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -73,9 +65,8 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(w) || gy >= psc(outc))
return;

afpvec8 sum0;
@@ -86,9 +77,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld8(bias_data, gz);
sum0 = buffer_ld8(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -107,125 +98,118 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(kk * 8 + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(kk * 8 + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(kk * 8 + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(kk * 8 + 3, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(kk * 8 + 4, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(kk * 8 + 5, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(kk * 8 + 6, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(kk * 8 + 7, z, gz));

// sum += v * k;
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 8 + 0, gy, 0));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 8 + 1, gy, 0));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 8 + 2, gy, 0));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 8 + 3, gy, 0));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(z * 8 + 4, gy, 0));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(z * 8 + 5, gy, 0));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(z * 8 + 6, gy, 0));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(z * 8 + 7, gy, 0));

// sum += v * k;
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else
int w_offset = gz * psc(c) * maxk;
int v_offset = gx;
int w_offset = gy * psc(h) * 8;

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
int v_offset = gx + z * psc(cstep);

for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset * 8 + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset * 8 + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset * 8 + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset * 8 + 3);
afpvec8 k4 = buffer_ld8(weight_data, w_offset * 8 + 4);
afpvec8 k5 = buffer_ld8(weight_data, w_offset * 8 + 5);
afpvec8 k6 = buffer_ld8(weight_data, w_offset * 8 + 6);
afpvec8 k7 = buffer_ld8(weight_data, w_offset * 8 + 7);

// sum += v * k;
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3);
afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4);
afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5);
afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6);
afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7);

// sum += v * k;
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0[0].g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0[0].b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0[0].a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);
sum0[1].r += dot(v0[0], k4[0]) + dot(v0[1], k4[1]);
sum0[1].g += dot(v0[0], k5[0]) + dot(v0[1], k5[1]);
sum0[1].b += dot(v0[0], k6[0]) + dot(v0[1], k6[1]);
sum0[1].a += dot(v0[0], k7[0]) + dot(v0[1], k7[1]);

sum1[0].r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1[0].g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1[0].b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1[0].a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);
sum1[1].r += dot(v1[0], k4[0]) + dot(v1[1], k4[1]);
sum1[1].g += dot(v1[0], k5[0]) + dot(v1[1], k5[1]);
sum1[1].b += dot(v1[0], k6[0]) + dot(v1[1], k6[1]);
sum1[1].a += dot(v1[0], k7[0]) + dot(v1[1], k7[1]);

sum2[0].r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2[0].g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2[0].b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2[0].a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);
sum2[1].r += dot(v2[0], k4[0]) + dot(v2[1], k4[1]);
sum2[1].g += dot(v2[0], k5[0]) + dot(v2[1], k5[1]);
sum2[1].b += dot(v2[0], k6[0]) + dot(v2[1], k6[1]);
sum2[1].a += dot(v2[0], k7[0]) + dot(v2[1], k7[1]);

sum3[0].r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3[0].g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3[0].b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3[0].a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
sum3[1].r += dot(v3[0], k4[0]) + dot(v3[1], k4[1]);
sum3[1].g += dot(v3[0], k5[0]) + dot(v3[1], k5[1]);
sum3[1].b += dot(v3[0], k6[0]) + dot(v3[1], k6[1]);
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);

v_offset += psc(w);
w_offset += 8;
}
#endif

@@ -305,16 +289,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st8(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st8(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st8(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st8(top_blob_data, gi + 3, sum3);
#endif
}

+ 18
- 24
src/layer/vulkan/shader/convolution_pack8_im2col.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -52,42 +48,40 @@ layout (binding = 1) writeonly buffer col_blob { sfpvec8 col_blob_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

const int maxk = kernel_w * kernel_h;

if (gx >= psc(outw) * psc(outh) || gy >= maxk || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= maxk * psc(c))
return;

int sy = gx / psc(outw);
int sx = gx % psc(outw);
const int sy = gx / psc(outw);
const int sx = gx % psc(outw);

const int sz = gy / maxk;
const int k = gy % maxk;

int ky = gy / kernel_w;
int kx = gy % kernel_w;
const int ky = k / kernel_w;
const int kx = k % kernel_w;

#if NCNN_image_shader
image3d_cp8(col_blob, ivec3(gx, gy, gz), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, gz));
image3d_cp8(col_blob, ivec3(gx, gy, 0), bottom_blob, ivec3(sx * stride_w + kx * dilation_w, sy * stride_h + ky * dilation_h, sz));
#else
const int v_offset = gz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;
const int v_offset = sz * psc(cstep) + (sy * stride_h + ky * dilation_h) * psc(w) + sx * stride_w + kx * dilation_w;

const int gi = gz * psc(outcstep) + gy * psc(outw) * psc(outh) + gx;
const int gi = gy * psc(outw) * psc(outh) + gx;

buffer_cp8(col_blob_data, gi, bottom_blob_data, v_offset);
#endif


+ 49
- 65
src/layer/vulkan/shader/convolution_pack8to1_gemm.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -73,9 +65,8 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(w) || gy >= psc(outc))
return;

afp sum0;
@@ -86,9 +77,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld1(bias_data, gz);
sum0 = buffer_ld1(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -107,49 +98,42 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(kk, z, gz));

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]);
sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]);
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);
}
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0));

afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0));

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]);
sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]);
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);
}
#else
int w_offset = gz * psc(c) * maxk;
int v_offset = gx;
int w_offset = gy * psc(h);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
int v_offset = gx + z * psc(cstep);

for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k = buffer_ld8(weight_data, w_offset);

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]);
sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]);
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k = buffer_ld8(weight_data, w_offset);

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
sum1 += dot(v1[0], k[0]) + dot(v1[1], k[1]);
sum2 += dot(v2[0], k[0]) + dot(v2[1], k[1]);
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);

v_offset += psc(w);
w_offset += 1;
}
#endif

@@ -205,16 +189,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st1(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st1(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st1(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st1(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st1(top_blob_data, gi + 3, sum3);
#endif
}

+ 84
- 100
src/layer/vulkan/shader/convolution_pack8to4_gemm.comp View File

@@ -30,17 +30,13 @@ layout (constant_id = 4) const float activation_param_0 = 0;
layout (constant_id = 5) const float activation_param_1 = 0;

#define shape_constant_id_offset 6
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -56,13 +52,9 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -73,9 +65,8 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(w) || gy >= psc(outc))
return;

afpvec4 sum0;
@@ -86,9 +77,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum0 = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
sum0 = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
sum0 = buffer_ld4(bias_data, gz);
sum0 = buffer_ld4(bias_data, gy);
#endif
sum1 = sum0;
sum2 = sum0;
@@ -107,85 +98,78 @@ void main()
#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, kk, z));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, kk, z));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, kk, z));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, kk, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(kk * 4 + 0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(kk * 4 + 1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(kk * 4 + 2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(kk * 4 + 3, z, gz));

// sum += v * k;
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);

sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);

sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);

sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
}
afpvec8 v0 = image3d_ld8(col_blob, ivec3(gx4.r, z, 0));
afpvec8 v1 = image3d_ld8(col_blob, ivec3(gx4.g, z, 0));
afpvec8 v2 = image3d_ld8(col_blob, ivec3(gx4.b, z, 0));
afpvec8 v3 = image3d_ld8(col_blob, ivec3(gx4.a, z, 0));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 4 + 0, gy, 0));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 4 + 1, gy, 0));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 4 + 2, gy, 0));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 4 + 3, gy, 0));

// sum += v * k;
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);

sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);

sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);

sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
}
#else
int w_offset = gz * psc(c) * maxk;
int v_offset = gx;
int w_offset = gy * psc(h) * 4;

for (int z = 0; z < psc(c); z++)
for (int z = 0; z < psc(h); z++)
{
int v_offset = gx + z * psc(cstep);

for (int kk = 0; kk < maxk; kk++)
{
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset * 4 + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset * 4 + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset * 4 + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset * 4 + 3);

// sum += v * k;
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);

sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);

sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);

sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);

v_offset += psc(outw) * psc(outh);
w_offset += 1;
}
afpvec8 v0 = buffer_ld8(col_blob_data, v_offset + 0);
afpvec8 v1 = buffer_ld8(col_blob_data, v_offset + 1);
afpvec8 v2 = buffer_ld8(col_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(col_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3);

// sum += v * k;
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
sum0.g += dot(v0[0], k1[0]) + dot(v0[1], k1[1]);
sum0.b += dot(v0[0], k2[0]) + dot(v0[1], k2[1]);
sum0.a += dot(v0[0], k3[0]) + dot(v0[1], k3[1]);

sum1.r += dot(v1[0], k0[0]) + dot(v1[1], k0[1]);
sum1.g += dot(v1[0], k1[0]) + dot(v1[1], k1[1]);
sum1.b += dot(v1[0], k2[0]) + dot(v1[1], k2[1]);
sum1.a += dot(v1[0], k3[0]) + dot(v1[1], k3[1]);

sum2.r += dot(v2[0], k0[0]) + dot(v2[1], k0[1]);
sum2.g += dot(v2[0], k1[0]) + dot(v2[1], k1[1]);
sum2.b += dot(v2[0], k2[0]) + dot(v2[1], k2[1]);
sum2.a += dot(v2[0], k3[0]) + dot(v2[1], k3[1]);

sum3.r += dot(v3[0], k0[0]) + dot(v3[1], k0[1]);
sum3.g += dot(v3[0], k1[0]) + dot(v3[1], k1[1]);
sum3.b += dot(v3[0], k2[0]) + dot(v3[1], k2[1]);
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);

v_offset += psc(w);
w_offset += 4;
}
#endif

@@ -241,16 +225,16 @@ void main()
ivec4 sy4 = gx4 / psc(outw);
ivec4 sx4 = gx4 % psc(outw);

image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
const int gi = gz * psc(outcstep) + gx;
const int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi, sum0);
if (gx + 1 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(outw) * psc(outh)) buffer_st4(top_blob_data, gi + 3, sum3);
if (gx + 1 < psc(w)) buffer_st4(top_blob_data, gi + 1, sum1);
if (gx + 2 < psc(w)) buffer_st4(top_blob_data, gi + 2, sum2);
if (gx + 3 < psc(w)) buffer_st4(top_blob_data, gi + 3, sum3);
#endif
}

+ 11
- 17
src/layer/vulkan/shader/deconvolution_col2im.comp View File

@@ -33,17 +33,13 @@ layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -57,13 +53,9 @@ layout (binding = 2) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -94,6 +86,8 @@ void main()
sum = afp(0.f);
}

const int maxk = kernel_w * kernel_h;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

@@ -116,9 +110,9 @@ void main()
w_k /= dilation_w;

#if NCNN_image_shader
sum += image3d_ld1(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz));
sum += image3d_ld1(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0));
#else
const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;
const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;

sum += buffer_ld1(col_blob_data, gi);
#endif


+ 116
- 31
src/layer/vulkan/shader/deconvolution_gemm.comp View File

@@ -21,21 +21,16 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define LOCAL_MEMORY_UNROLL_INCH 8

#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfp weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfp tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;
#endif

afp sum0 = afp(0.f);
afp sum1 = afp(0.f);
@@ -91,7 +86,7 @@ void main()
afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

afp k = image3d_ld1(weight_blob, ivec3(gy, z, gz));
afp k = image3d_ld1(weight_blob, ivec3(z, gy, 0));

sum0 += v0 * k;
sum1 += v1 * k;
@@ -100,8 +95,92 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afp k = lfp2afp(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}

if (z < psc(c))
{
const int remain = psc(c) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfp(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afp k = lfp2afp(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
}
#else
for (int z = 0; z < psc(c); z++)
{
afp v0 = buffer_ld1(bottom_blob_data, v_offset + 0);
@@ -117,17 +196,23 @@ void main()
sum3 += v3 * k;

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;
#endif

#if NCNN_image_shader
image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st1(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1);


+ 116
- 31
src/layer/vulkan/shader/deconvolution_pack1to4_gemm.comp View File

@@ -21,21 +21,16 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define LOCAL_MEMORY_UNROLL_INCH 8

#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfp tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;
#endif

afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
@@ -91,7 +86,7 @@ void main()
afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(gy, z, gz));
afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0));

sum0 += v0 * k;
sum1 += v1 * k;
@@ -100,8 +95,92 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}

if (z < psc(c))
{
const int remain = psc(c) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afp v0 = lfp2afp(tmp_v[lx][z4][0]);
afp v1 = lfp2afp(tmp_v[lx][z4][1]);
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
}
#else
for (int z = 0; z < psc(c); z++)
{
afp v0 = buffer_ld1(bottom_blob_data, v_offset + 0);
@@ -117,17 +196,23 @@ void main()
sum3 += v3 * k;

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;
#endif

#if NCNN_image_shader
image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st4(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st4(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi + 1, sum1);


+ 17
- 31
src/layer/vulkan/shader/deconvolution_pack1to8_gemm.comp View File

@@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;

afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f));
@@ -92,7 +78,7 @@ void main()
afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(gy, z, gz));
afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0));

// sum += v * k;
sum0[0] += v0 * k[0];
@@ -109,7 +95,7 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c);

for (int z = 0; z < psc(c); z++)
{
@@ -134,17 +120,17 @@ void main()
sum3[1] += v3 * k[1];

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 1;
}
#endif

#if NCNN_image_shader
image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st8(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1);


+ 11
- 17
src/layer/vulkan/shader/deconvolution_pack4_col2im.comp View File

@@ -33,17 +33,13 @@ layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -57,13 +53,9 @@ layout (binding = 2) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -94,6 +86,8 @@ void main()
sum = afpvec4(0.f);
}

const int maxk = kernel_w * kernel_h;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

@@ -116,9 +110,9 @@ void main()
w_k /= dilation_w;

#if NCNN_image_shader
sum += image3d_ld4(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz));
sum += image3d_ld4(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0));
#else
const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;
const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;

sum += buffer_ld4(col_blob_data, gi);
#endif


+ 133
- 49
src/layer/vulkan/shader/deconvolution_pack4_gemm.comp View File

@@ -21,21 +21,16 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define LOCAL_MEMORY_UNROLL_INCH 8

#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -44,39 +39,34 @@ layout (binding = 2) uniform unfp sampler3D weight_blob;
#else
layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer col_blob { sfpvec4 col_blob_data[]; };
#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
#endif

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH][4];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;
#endif

afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
@@ -85,7 +75,6 @@ void main()

#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);
ivec4 gy4 = gy * 4 + ivec4(0, 1, 2, 3);

ivec4 sy4 = gx4 / psc(w);
ivec4 sx4 = gx4 % psc(w);
@@ -98,10 +87,10 @@ void main()
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(gy4.r, z, gz)),
image3d_ld4(weight_blob, ivec3(gy4.g, z, gz)),
image3d_ld4(weight_blob, ivec3(gy4.b, z, gz)),
image3d_ld4(weight_blob, ivec3(gy4.a, z, gz))
image3d_ld4(weight_blob, ivec3(z * 4 + 0, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 1, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 2, gy, 0)),
image3d_ld4(weight_blob, ivec3(z * 4 + 3, gy, 0))
);

sum0 += v0 * k;
@@ -111,8 +100,102 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c) * 4;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep);
w_offset += LOCAL_MEMORY_UNROLL_INCH * 4;

barrier();
}

if (z < psc(c))
{
const int remain = psc(c) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

sum0 += v0 * k;
sum1 += v1 * k;
sum2 += v2 * k;
sum3 += v3 * k;
}
}
#else
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0);
@@ -120,17 +203,12 @@ void main()
afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3);

#if NCNN_fp16_packed || (NCNN_fp16_storage && !NCNN_fp16_arithmetic)
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
buffer_ld4(weight_data, w_offset * 4 + 0),
buffer_ld4(weight_data, w_offset * 4 + 1),
buffer_ld4(weight_data, w_offset * 4 + 2),
buffer_ld4(weight_data, w_offset * 4 + 3)
buffer_ld4(weight_data, w_offset + 0),
buffer_ld4(weight_data, w_offset + 1),
buffer_ld4(weight_data, w_offset + 2),
buffer_ld4(weight_data, w_offset + 3)
);
#else
afpmat4 k = afpmat4(weight_data[w_offset]);
#endif

sum0 += v0 * k;
sum1 += v1 * k;
@@ -138,17 +216,23 @@ void main()
sum3 += v3 * k;

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 4;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;
#endif

#if NCNN_image_shader
image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st4(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st4(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi + 1, sum1);


+ 116
- 31
src/layer/vulkan/shader/deconvolution_pack4to1_gemm.comp View File

@@ -21,21 +21,16 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define LOCAL_MEMORY_UNROLL_INCH 8

#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -49,29 +44,29 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

#if NCNN_shader_local_memory
shared lfpvec4 tmp_v[8][LOCAL_MEMORY_UNROLL_INCH][4];
shared lfpvec4 tmp_k[8][LOCAL_MEMORY_UNROLL_INCH];
#endif

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
#if !NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;
#endif

afp sum0 = afp(0.f);
afp sum1 = afp(0.f);
@@ -91,7 +86,7 @@ void main()
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(gy, z, gz));
afpvec4 k = image3d_ld4(weight_blob, ivec3(z, gy, 0));

sum0 += dot(v0, k);
sum1 += dot(v1, k);
@@ -100,8 +95,92 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c);

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);
}

v_offset += LOCAL_MEMORY_UNROLL_INCH * psc(cstep);
w_offset += LOCAL_MEMORY_UNROLL_INCH;

barrier();
}

if (z < psc(c))
{
const int remain = psc(c) - z;

if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

if (lx == 0)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

barrier();

for (int z4 = 0; z4 < remain; z4++)
{
afpvec4 v0 = lfp2afpvec4(tmp_v[lx][z4][0]);
afpvec4 v1 = lfp2afpvec4(tmp_v[lx][z4][1]);
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
sum2 += dot(v2, k);
sum3 += dot(v3, k);
}
}
#else
for (int z = 0; z < psc(c); z++)
{
afpvec4 v0 = buffer_ld4(bottom_blob_data, v_offset + 0);
@@ -117,17 +196,23 @@ void main()
sum3 += dot(v3, k);

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 1;
}
#endif
#endif

#if NCNN_shader_local_memory
if (gx >= psc(outw) || gy >= psc(outh))
return;
#endif

#if NCNN_image_shader
image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st1(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1);


+ 32
- 48
src/layer/vulkan/shader/deconvolution_pack4to8_gemm.comp View File

@@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;

afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f));
@@ -81,8 +67,6 @@ void main()

#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);
ivec4 gy4 = gy * 8 + ivec4(0, 1, 2, 3);
ivec4 gyy4 = gy4 + 4;

ivec4 sy4 = gx4 / psc(w);
ivec4 sx4 = gx4 % psc(w);
@@ -94,14 +78,14 @@ void main()
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k0 = image3d_ld4(weight_blob, ivec3(gy4.r, z, gz));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(gy4.g, z, gz));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(gy4.b, z, gz));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(gy4.a, z, gz));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(gyy4.r, z, gz));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(gyy4.g, z, gz));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(gyy4.b, z, gz));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(gyy4.a, z, gz));
afpvec4 k0 = image3d_ld4(weight_blob, ivec3(z * 8 + 0, gy, 0));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(z * 8 + 1, gy, 0));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(z * 8 + 2, gy, 0));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(z * 8 + 3, gy, 0));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(z * 8 + 4, gy, 0));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(z * 8 + 5, gy, 0));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(z * 8 + 6, gy, 0));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(z * 8 + 7, gy, 0));

// sum += v * k;
sum0[0].r += dot(v0, k0);
@@ -142,7 +126,7 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c) * 8;

for (int z = 0; z < psc(c); z++)
{
@@ -151,14 +135,14 @@ void main()
afpvec4 v2 = buffer_ld4(bottom_blob_data, v_offset + 2);
afpvec4 v3 = buffer_ld4(bottom_blob_data, v_offset + 3);

afpvec4 k0 = buffer_ld4(weight_data, w_offset * 8 + 0);
afpvec4 k1 = buffer_ld4(weight_data, w_offset * 8 + 1);
afpvec4 k2 = buffer_ld4(weight_data, w_offset * 8 + 2);
afpvec4 k3 = buffer_ld4(weight_data, w_offset * 8 + 3);
afpvec4 k4 = buffer_ld4(weight_data, w_offset * 8 + 4);
afpvec4 k5 = buffer_ld4(weight_data, w_offset * 8 + 5);
afpvec4 k6 = buffer_ld4(weight_data, w_offset * 8 + 6);
afpvec4 k7 = buffer_ld4(weight_data, w_offset * 8 + 7);
afpvec4 k0 = buffer_ld4(weight_data, w_offset + 0);
afpvec4 k1 = buffer_ld4(weight_data, w_offset + 1);
afpvec4 k2 = buffer_ld4(weight_data, w_offset + 2);
afpvec4 k3 = buffer_ld4(weight_data, w_offset + 3);
afpvec4 k4 = buffer_ld4(weight_data, w_offset + 4);
afpvec4 k5 = buffer_ld4(weight_data, w_offset + 5);
afpvec4 k6 = buffer_ld4(weight_data, w_offset + 6);
afpvec4 k7 = buffer_ld4(weight_data, w_offset + 7);

// sum += v * k;
sum0[0].r += dot(v0, k0);
@@ -198,17 +182,17 @@ void main()
sum3[1].a += dot(v3, k7);

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 8;
}
#endif

#if NCNN_image_shader
image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st8(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1);


+ 11
- 17
src/layer/vulkan/shader/deconvolution_pack8_col2im.comp View File

@@ -34,17 +34,13 @@ layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;

layout (constant_id = shape_constant_id_offset + 2) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 3) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D col_blob;
@@ -58,13 +54,9 @@ layout (binding = 2) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -95,6 +87,8 @@ void main()
sum = afpvec8(afpvec4(0.f), afpvec4(0.f));
}

const int maxk = kernel_w * kernel_h;

const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

@@ -117,9 +111,9 @@ void main()
w_k /= dilation_w;

#if NCNN_image_shader
sum += image3d_ld8(col_blob, ivec3(sy * psc(w) + sx, h_k * kernel_w + w_k, gz));
sum += image3d_ld8(col_blob, ivec3(sy * psc(w) + sx, gz * maxk + h_k * kernel_w + w_k, 0));
#else
const int gi = gz * psc(cstep) + (h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;
const int gi = (gz * maxk + h_k * kernel_w + w_k) * psc(w) * psc(h) + sy * psc(w) + sx;

sum += buffer_ld8(col_blob_data, gi);
#endif


+ 32
- 48
src/layer/vulkan/shader/deconvolution_pack8_gemm.comp View File

@@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;

afpvec8 sum0 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum1 = afpvec8(afpvec4(0.f), afpvec4(0.f));
afpvec8 sum2 = afpvec8(afpvec4(0.f), afpvec4(0.f));
@@ -81,8 +67,6 @@ void main()

#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);
ivec4 gy4 = gy * 8 + ivec4(0, 1, 2, 3);
ivec4 gyy4 = gy4 + 4;

ivec4 sy4 = gx4 / psc(w);
ivec4 sx4 = gx4 % psc(w);
@@ -94,14 +78,14 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(gy4.r, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(gy4.g, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(gy4.b, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(gy4.a, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(gyy4.r, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(gyy4.g, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(gyy4.b, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(gyy4.a, z, gz));
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 8 + 0, gy, 0));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 8 + 1, gy, 0));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 8 + 2, gy, 0));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 8 + 3, gy, 0));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(z * 8 + 4, gy, 0));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(z * 8 + 5, gy, 0));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(z * 8 + 6, gy, 0));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(z * 8 + 7, gy, 0));

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -142,7 +126,7 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c) * 8;

for (int z = 0; z < psc(c); z++)
{
@@ -151,14 +135,14 @@ void main()
afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset * 8 + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset * 8 + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset * 8 + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset * 8 + 3);
afpvec8 k4 = buffer_ld8(weight_data, w_offset * 8 + 4);
afpvec8 k5 = buffer_ld8(weight_data, w_offset * 8 + 5);
afpvec8 k6 = buffer_ld8(weight_data, w_offset * 8 + 6);
afpvec8 k7 = buffer_ld8(weight_data, w_offset * 8 + 7);
afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3);
afpvec8 k4 = buffer_ld8(weight_data, w_offset + 4);
afpvec8 k5 = buffer_ld8(weight_data, w_offset + 5);
afpvec8 k6 = buffer_ld8(weight_data, w_offset + 6);
afpvec8 k7 = buffer_ld8(weight_data, w_offset + 7);

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -198,17 +182,17 @@ void main()
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 8;
}
#endif

#if NCNN_image_shader
image3d_st8(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st8(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st8(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st8(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st8(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st8(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st8(col_blob_data, gi + 1, sum1);


+ 17
- 31
src/layer/vulkan/shader/deconvolution_pack8to1_gemm.comp View File

@@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;

afp sum0 = afp(0.f);
afp sum1 = afp(0.f);
afp sum2 = afp(0.f);
@@ -92,7 +78,7 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(gy, z, gz));
afpvec8 k = image3d_ld8(weight_blob, ivec3(z, gy, 0));

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
@@ -102,7 +88,7 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c);

for (int z = 0; z < psc(c); z++)
{
@@ -120,17 +106,17 @@ void main()
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 1;
}
#endif

#if NCNN_image_shader
image3d_st1(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st1(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st1(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st1(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st1(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st1(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st1(col_blob_data, gi + 1, sum1);


+ 24
- 39
src/layer/vulkan/shader/deconvolution_pack8to4_gemm.comp View File

@@ -22,21 +22,14 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;

#define shape_constant_id_offset 2
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
#define shape_constant_id_offset 0
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -50,30 +43,23 @@ layout (binding = 2) readonly buffer weight_blob { sfpvec8 weight_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
int outcstep;
} p;

void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) || gy >= psc(outh) || gz >= psc(outc))
if (gx >= psc(outw) || gy >= psc(outh))
return;

const int maxk = kernel_w * kernel_h;

afpvec4 sum0 = afpvec4(0.f);
afpvec4 sum1 = afpvec4(0.f);
afpvec4 sum2 = afpvec4(0.f);
@@ -81,7 +67,6 @@ void main()

#if NCNN_image_shader
ivec4 gx4 = gx + ivec4(0, 1, 2, 3);
ivec4 gy4 = gy * 4 + ivec4(0, 1, 2, 3);

ivec4 sy4 = gx4 / psc(w);
ivec4 sx4 = gx4 % psc(w);
@@ -93,10 +78,10 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(gy4.r, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(gy4.g, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(gy4.b, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(gy4.a, z, gz));
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(z * 4 + 0, gy, 0));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(z * 4 + 1, gy, 0));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(z * 4 + 2, gy, 0));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(z * 4 + 3, gy, 0));

// sum += v * k
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -121,7 +106,7 @@ void main()
}
#else
int v_offset = gx;
int w_offset = gz * psc(c) * maxk + gy;
int w_offset = gy * psc(c) * 4;

for (int z = 0; z < psc(c); z++)
{
@@ -130,10 +115,10 @@ void main()
afpvec8 v2 = buffer_ld8(bottom_blob_data, v_offset + 2);
afpvec8 v3 = buffer_ld8(bottom_blob_data, v_offset + 3);

afpvec8 k0 = buffer_ld8(weight_data, w_offset * 4 + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset * 4 + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset * 4 + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset * 4 + 3);
afpvec8 k0 = buffer_ld8(weight_data, w_offset + 0);
afpvec8 k1 = buffer_ld8(weight_data, w_offset + 1);
afpvec8 k2 = buffer_ld8(weight_data, w_offset + 2);
afpvec8 k3 = buffer_ld8(weight_data, w_offset + 3);

// sum += v * k
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -157,17 +142,17 @@ void main()
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);

v_offset += psc(cstep);
w_offset += maxk;
w_offset += 4;
}
#endif

#if NCNN_image_shader
image3d_st4(col_blob, ivec3(gx4.r, gy, gz), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, gz), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, gz), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, gz), sum3);
image3d_st4(col_blob, ivec3(gx4.r, gy, 0), sum0);
image3d_st4(col_blob, ivec3(gx4.g, gy, 0), sum1);
image3d_st4(col_blob, ivec3(gx4.b, gy, 0), sum2);
image3d_st4(col_blob, ivec3(gx4.a, gy, 0), sum3);
#else
const int gi = gz * psc(outcstep) + gy * psc(outw) + gx;
const int gi = gy * psc(outw) + gx;

buffer_st4(col_blob_data, gi, sum0);
if (gx + 1 < psc(outw)) buffer_st4(col_blob_data, gi + 1, sum1);


+ 6
- 0
src/net.cpp View File

@@ -1416,6 +1416,9 @@ int Net::load_param(const DataReader& dr)

if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;

// enable local memory optimization on discrete gpu only
if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;

// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}
@@ -1629,6 +1632,9 @@ int Net::load_param_bin(const DataReader& dr)

if (d->vkdev->info.bug_buffer_image_load_zero()) opt.use_image_storage = false;

// enable local memory optimization on discrete gpu only
if (d->vkdev->info.type() != 0) opt.use_shader_local_memory = false;

// fp16a makes no sense when fp16 storage disabled
if (!opt.use_fp16_packed && !opt.use_fp16_storage) opt.use_fp16_arithmetic = false;
}


+ 2
- 0
src/option.cpp View File

@@ -65,6 +65,8 @@ Option::Option()
flush_denormals = 3;

use_local_pool_allocator = true;

use_shader_local_memory = true;
}

} // namespace ncnn

+ 3
- 1
src/option.h View File

@@ -129,7 +129,9 @@ public:

bool use_local_pool_allocator;

bool use_reserved_1;
// enable local memory optimization for gpu inference
bool use_shader_local_memory;

bool use_reserved_2;
bool use_reserved_3;
bool use_reserved_4;


Loading…
Cancel
Save