Browse Source

Use fewer specialization constants for Vulkan conv1x1s1d1 shaders (#3657)

tags/20220420
nihui GitHub 4 years ago
parent
commit
4302f78f55
No known key found for this signature in database GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 520 additions and 596 deletions
  1. +229
    -185
      src/layer/vulkan/convolution_vulkan.cpp
  2. +26
    -38
      src/layer/vulkan/shader/convolution_1x1s1d1.comp
  3. +36
    -47
      src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
  4. +27
    -42
      src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
  5. +45
    -56
      src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
  6. +36
    -47
      src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
  7. +32
    -47
      src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
  8. +32
    -47
      src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
  9. +27
    -42
      src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
  10. +30
    -45
      src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp

+ 229
- 185
src/layer/vulkan/convolution_vulkan.cpp View File

@@ -172,52 +172,6 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
padding->create_pipeline(opt);
}

std::vector<vk_specialization_type> specializations(10 + 10);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6].i = bias_term;
specializations[7].i = activation_type;
specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[10 + 0].i = shape_bordered_packed.dims;
specializations[10 + 1].i = shape_bordered_packed.w;
specializations[10 + 2].i = shape_bordered_packed.h;
specializations[10 + 3].i = shape_bordered_packed.c;
specializations[10 + 4].i = shape_bordered_packed.cstep;
specializations[10 + 5].i = out_shape_packed.dims;
specializations[10 + 6].i = out_shape_packed.w;
specializations[10 + 7].i = out_shape_packed.h;
specializations[10 + 8].i = out_shape_packed.c;
specializations[10 + 9].i = out_shape_packed.cstep;

if (is_conv1x1s1d1)
{
int shader_type_index = -1;
if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;

pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
if (opt.use_shader_local_memory)
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, 8);
}
else
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 1, std::min(8, num_output / out_elempack));
}
pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
}
if (opt.use_winograd_convolution && is_conv3x3s1d1 && num_input >= 16 && num_output >= 16)
{
// winograd43
@@ -477,59 +431,117 @@ int Convolution_vulkan::create_pipeline(const Option& _opt)
opt.use_image_storage = false;
}

std::vector<vk_specialization_type> specializations(10 + 8);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6].i = bias_term;
specializations[7].i = activation_type;
specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[10 + 0].i = shape_bordered_packed.w;
specializations[10 + 1].i = shape_bordered_packed.h;
specializations[10 + 2].i = shape_bordered_packed.c;
specializations[10 + 3].i = shape_bordered_packed.cstep;
specializations[10 + 4].i = out_shape_packed.w;
specializations[10 + 5].i = out_shape_packed.h;
specializations[10 + 6].i = out_shape_packed.c;
specializations[10 + 7].i = out_shape_packed.cstep;

Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
if (out_shape_packed.dims != 0)
{
std::vector<vk_specialization_type> specializations(10 + 8);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6].i = bias_term;
specializations[7].i = activation_type;
specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[10 + 0].i = shape_bordered_packed.w;
specializations[10 + 1].i = shape_bordered_packed.h;
specializations[10 + 2].i = shape_bordered_packed.c;
specializations[10 + 3].i = shape_bordered_packed.cstep;
specializations[10 + 4].i = out_shape_packed.w;
specializations[10 + 5].i = out_shape_packed.h;
specializations[10 + 6].i = out_shape_packed.c;
specializations[10 + 7].i = out_shape_packed.cstep;

Mat local_size_xyz(16, std::min(4, num_output / out_elempack), 1, (void*)0);
if (out_shape_packed.dims != 0)
{
local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
local_size_xyz.h = std::min(4, out_shape_packed.c);
}
local_size_xyz.w = std::min(16, out_shape_packed.w * out_shape_packed.h);
local_size_xyz.h = std::min(4, out_shape_packed.c);
}

int shader_type_index = -1;
if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;

pipeline_convolution_gemm = new Pipeline(vkdev);
if (opt.use_shader_local_memory)
{
pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
}
pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
int shader_type_index = -1;
if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_gemm;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_gemm;
if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_gemm;
if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_gemm;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_gemm;
if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_gemm;
if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_gemm;
if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_gemm;
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_gemm;

pipeline_convolution_gemm = new Pipeline(vkdev);
if (opt.use_shader_local_memory)
{
pipeline_convolution_gemm->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_gemm->set_optimal_local_size_xyz(local_size_xyz);
}
pipeline_convolution_gemm->create(shader_type_index, opt, specializations);
}
if (is_conv1x1s1d1)
{
std::vector<vk_specialization_type> specializations(4 + 8);
specializations[0].i = bias_term;
specializations[1].i = activation_type;
specializations[2].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[3].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[4 + 0].i = shape_bordered_packed.w;
specializations[4 + 1].i = shape_bordered_packed.h;
specializations[4 + 2].i = shape_bordered_packed.c;
specializations[4 + 3].i = shape_bordered_packed.cstep;
specializations[4 + 4].i = out_shape_packed.w;
specializations[4 + 5].i = out_shape_packed.h;
specializations[4 + 6].i = out_shape_packed.c;
specializations[4 + 7].i = out_shape_packed.cstep;

int shader_type_index = -1;
if (elempack == 1 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_1x1s1d1;
if (elempack == 4 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack4_1x1s1d1;
if (elempack == 1 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack1to4_1x1s1d1;
if (elempack == 4 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack4to1_1x1s1d1;
if (elempack == 8 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack8_1x1s1d1;
if (elempack == 1 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack1to8_1x1s1d1;
if (elempack == 8 && out_elempack == 1) shader_type_index = LayerShaderType::convolution_pack8to1_1x1s1d1;
if (elempack == 4 && out_elempack == 8) shader_type_index = LayerShaderType::convolution_pack4to8_1x1s1d1;
if (elempack == 8 && out_elempack == 4) shader_type_index = LayerShaderType::convolution_pack8to4_1x1s1d1;

pipeline_convolution_1x1s1d1 = new Pipeline(vkdev);
if (opt.use_shader_local_memory)
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, 8, 1);
}
else
{
pipeline_convolution_1x1s1d1->set_local_size_xyz(8, std::min(8, num_output / out_elempack), 1);
}
pipeline_convolution_1x1s1d1->create(shader_type_index, opt, specializations);
}
else
{
std::vector<vk_specialization_type> specializations(10 + 10);
specializations[0].i = kernel_w;
specializations[1].i = kernel_h;
specializations[2].i = dilation_w;
specializations[3].i = dilation_h;
specializations[4].i = stride_w;
specializations[5].i = stride_h;
specializations[6].i = bias_term;
specializations[7].i = activation_type;
specializations[8].f = activation_params.w >= 1 ? activation_params[0] : 0.f;
specializations[9].f = activation_params.w == 2 ? activation_params[1] : 0.f;
specializations[10 + 0].i = shape_bordered_packed.dims;
specializations[10 + 1].i = shape_bordered_packed.w;
specializations[10 + 2].i = shape_bordered_packed.h;
specializations[10 + 3].i = shape_bordered_packed.c;
specializations[10 + 4].i = shape_bordered_packed.cstep;
specializations[10 + 5].i = out_shape_packed.dims;
specializations[10 + 6].i = out_shape_packed.w;
specializations[10 + 7].i = out_shape_packed.h;
specializations[10 + 8].i = out_shape_packed.c;
specializations[10 + 9].i = out_shape_packed.cstep;

Mat local_size_xyz(8, 8, std::min(4, (num_output / out_elempack + 1) / 2), (void*)0);
if (out_shape_packed.dims != 0)
{
@@ -1194,34 +1206,63 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
{
// gemm
{
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu;
bindings[3] = bias_data_gpu;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
}
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu;
bindings[3] = bias_data_gpu;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);

return 0;
}
if (is_conv1x1s1d1)
{
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu;
bindings[3] = bias_data_gpu;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = top_blob.cstep;

VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);

return 0;
}
@@ -1248,25 +1289,12 @@ int Convolution_vulkan::forward(const VkMat& bottom_blob, VkMat& top_blob, VkCom
constants[8].i = top_blob.c;
constants[9].i = top_blob.cstep;

// record
if (is_conv1x1s1d1)
{
VkMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
dispatcher.c = top_blob.c;

cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
}
else
{
VkMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = (top_blob.c + 1) / 2;
VkMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = (top_blob.c + 1) / 2;

cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
}
cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);

return 0;
}
@@ -1567,34 +1595,63 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
if (opt.use_sgemm_convolution && !is_conv1x1s1d1 && channels * elempack >= 16 && num_output >= 16)
{
// gemm
{
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0; // bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = 0; // top_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);
}
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0; // bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = 0; // top_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_gemm, bindings, constants, dispatcher);

return 0;
}
if (is_conv1x1s1d1)
{
top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_vkallocator);
if (top_blob.empty())
return -100;

std::vector<VkImageMat> bindings(4);
bindings[0] = bottom_blob_bordered;
bindings[1] = top_blob;
bindings[2] = weight_data_gpu_image;
bindings[3] = bias_data_gpu_image;

std::vector<vk_constant_type> constants(8);
constants[0].i = bottom_blob_bordered.w;
constants[1].i = bottom_blob_bordered.h;
constants[2].i = bottom_blob_bordered.c;
constants[3].i = 0; // bottom_blob_bordered.cstep;
constants[4].i = top_blob.w;
constants[5].i = top_blob.h;
constants[6].i = top_blob.c;
constants[7].i = 0; // top_blob.cstep;

VkImageMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = top_blob.c;
dispatcher.c = 1;

cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);

return 0;
}
@@ -1621,25 +1678,12 @@ int Convolution_vulkan::forward(const VkImageMat& bottom_blob, VkImageMat& top_b
constants[8].i = top_blob.c;
constants[9].i = 0; //top_blob.cstep;

// record
if (is_conv1x1s1d1)
{
VkImageMat dispatcher;
dispatcher.w = (top_blob.w * top_blob.h + 3) / 4;
dispatcher.h = 1;
dispatcher.c = top_blob.c;

cmd.record_pipeline(pipeline_convolution_1x1s1d1, bindings, constants, dispatcher);
}
else
{
VkImageMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = (top_blob.c + 1) / 2;
VkImageMat dispatcher;
dispatcher.w = (top_blob.w + 1) / 2;
dispatcher.h = (top_blob.h + 1) / 2;
dispatcher.c = (top_blob.c + 1) / 2;

cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);
}
cmd.record_pipeline(pipeline_convolution, bindings, constants, dispatcher);

return 0;
}


+ 26
- 38
src/layer/vulkan/shader/convolution_1x1s1d1.comp View File

@@ -21,29 +21,21 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -64,13 +56,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -82,16 +72,14 @@ void main()
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx * 4 >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx * 4 >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -100,9 +88,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
sum = afpvec4(image3d_ld1(bias_blob, ivec3(gz, 0, 0)));
sum = afpvec4(image3d_ld1(bias_blob, ivec3(gy, 0, 0)));
#else
sum = afpvec4(buffer_ld1(bias_data, gz));
sum = afpvec4(buffer_ld1(bias_data, gy));
#endif
}
else
@@ -118,7 +106,7 @@ void main()

for (int z = 0; z < psc(c); z++)
{
afp k = image3d_ld1(weight_blob, ivec3(0, z, gz));
afp k = image3d_ld1(weight_blob, ivec3(0, z, gy));

sum.r += k * image3d_ld1(bottom_blob, ivec3(sx4.r, sy4.r, z));
sum.g += k * image3d_ld1(bottom_blob, ivec3(sx4.g, sy4.g, z));
@@ -126,7 +114,7 @@ void main()
sum.a += k * image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));
}
#else
int w_offset = gz * psc(c);
int w_offset = gy * psc(c);
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -174,12 +162,12 @@ void main()

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum.r);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum.g);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum.b);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum.a);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum.g);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum.b);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum.a);
#else
const int gi = gz * psc(outcstep) / 4 + gx;
const int gi = gy * psc(outcstep) / 4 + gx;

#if NCNN_fp16_packed
top_blob_data[gi] = sum;


+ 36
- 47
src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp View File

@@ -23,29 +23,21 @@

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -83,14 +73,13 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
#if !NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif
#endif
@@ -103,9 +92,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
afpvec4 b = buffer_ld4(bias_data, gz);
afpvec4 b = buffer_ld4(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -133,7 +122,7 @@ void main()
afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));

sum0 += v0 * k;
sum1 += v1 * k;
@@ -141,21 +130,21 @@ void main()
sum3 += v3 * k;
}
#else
int w_offset = gz * psc(c);
int w_offset = gy * psc(c);
int v_offset = gx;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int lz = int(gl_LocalInvocationID.z);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -163,7 +152,7 @@ void main()
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

@@ -176,7 +165,7 @@ void main()
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
@@ -194,11 +183,11 @@ void main()
{
const int remain = psc(c) - z;

if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfp(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -206,7 +195,7 @@ void main()
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

@@ -219,7 +208,7 @@ void main()
afp v2 = lfp2afp(tmp_v[lx][z4][2]);
afp v3 = lfp2afp(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += v0 * k;
sum1 += v1 * k;
@@ -297,17 +286,17 @@ void main()
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
#if NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);


+ 27
- 42
src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp View File

@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -99,9 +84,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
afpvec8 b = buffer_ld8(bias_data, gz);
afpvec8 b = buffer_ld8(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -129,7 +114,7 @@ void main()
afp v2 = image3d_ld1(bottom_blob, ivec3(sx4.b, sy4.b, z));
afp v3 = image3d_ld1(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));

// sum += v * k;
sum0[0] += v0 * k[0];
@@ -145,7 +130,7 @@ void main()
sum3[1] += v3 * k[1];
}
#else
int w_offset = gz * psc(c);
int w_offset = gy * psc(c);
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -248,12 +233,12 @@ void main()
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);


+ 45
- 56
src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp View File

@@ -23,29 +23,21 @@

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -83,14 +73,13 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
#if !NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif
#endif
@@ -103,9 +92,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
afpvec4 b = buffer_ld4(bias_data, gz);
afpvec4 b = buffer_ld4(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -134,10 +123,10 @@ void main()
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpmat4 k = afpmat4(
image3d_ld4(weight_blob, ivec3(0, z, gz)),
image3d_ld4(weight_blob, ivec3(1, z, gz)),
image3d_ld4(weight_blob, ivec3(2, z, gz)),
image3d_ld4(weight_blob, ivec3(3, z, gz))
image3d_ld4(weight_blob, ivec3(0, z, gy)),
image3d_ld4(weight_blob, ivec3(1, z, gy)),
image3d_ld4(weight_blob, ivec3(2, z, gy)),
image3d_ld4(weight_blob, ivec3(3, z, gy))
);

sum0 += v0 * k;
@@ -146,21 +135,21 @@ void main()
sum3 += v3 * k;
}
#else
int w_offset = gz * psc(c) * 4;
int w_offset = gy * psc(c) * 4;
int v_offset = gx;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int lz = int(gl_LocalInvocationID.z);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -168,7 +157,7 @@ void main()
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

@@ -181,10 +170,10 @@ void main()
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

@@ -204,11 +193,11 @@ void main()
{
const int remain = psc(c) - z;

if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -216,7 +205,7 @@ void main()
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[lz][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
tmp_k[ly][z4][lx] = sfp2lfpvec4(weight_data[w_offset + z4 * 4 + lx]);
}
}

@@ -229,10 +218,10 @@ void main()
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k0 = lfp2afpvec4(tmp_k[lz][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[lz][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[lz][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[lz][z4][3]);
afpvec4 k0 = lfp2afpvec4(tmp_k[ly][z4][0]);
afpvec4 k1 = lfp2afpvec4(tmp_k[ly][z4][1]);
afpvec4 k2 = lfp2afpvec4(tmp_k[ly][z4][2]);
afpvec4 k3 = lfp2afpvec4(tmp_k[ly][z4][3]);

afpmat4 k = afpmat4(k0, k1, k2, k3);

@@ -317,17 +306,17 @@ void main()
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
#if NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);


+ 36
- 47
src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp View File

@@ -23,29 +23,21 @@

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -61,13 +53,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -83,14 +73,13 @@ void main()
{
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
#if !NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif
#endif
@@ -103,9 +92,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
#else
afp b = buffer_ld1(bias_data, gz);
afp b = buffer_ld1(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -133,7 +122,7 @@ void main()
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gz));
afpvec4 k = image3d_ld4(weight_blob, ivec3(0, z, gy));

sum0 += dot(v0, k);
sum1 += dot(v1, k);
@@ -141,21 +130,21 @@ void main()
sum3 += dot(v3, k);
}
#else
int w_offset = gz * psc(c);
int w_offset = gy * psc(c);
int v_offset = gx;

#if NCNN_shader_local_memory
const int lx = int(gl_LocalInvocationID.x);
const int lz = int(gl_LocalInvocationID.z);
const int ly = int(gl_LocalInvocationID.y);

int z = 0;
for (; z + (LOCAL_MEMORY_UNROLL_INCH - 1) < psc(c); z += LOCAL_MEMORY_UNROLL_INCH)
{
if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -163,7 +152,7 @@ void main()
{
for (int z4 = 0; z4 < LOCAL_MEMORY_UNROLL_INCH; z4++)
{
tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

@@ -176,7 +165,7 @@ void main()
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
@@ -194,11 +183,11 @@ void main()
{
const int remain = psc(c) - z;

if (lz < 4)
if (ly < 4)
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_v[lx][z4][lz] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + lz]);
tmp_v[lx][z4][ly] = sfp2lfpvec4(bottom_blob_data[v_offset + z4 * psc(cstep) + ly]);
}
}

@@ -206,7 +195,7 @@ void main()
{
for (int z4 = 0; z4 < remain; z4++)
{
tmp_k[lz][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
tmp_k[ly][z4] = sfp2lfpvec4(weight_data[w_offset + z4]);
}
}

@@ -219,7 +208,7 @@ void main()
afpvec4 v2 = lfp2afpvec4(tmp_v[lx][z4][2]);
afpvec4 v3 = lfp2afpvec4(tmp_v[lx][z4][3]);

afpvec4 k = lfp2afpvec4(tmp_k[lz][z4]);
afpvec4 k = lfp2afpvec4(tmp_k[ly][z4]);

sum0 += dot(v0, k);
sum1 += dot(v1, k);
@@ -297,17 +286,17 @@ void main()
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
#if NCNN_shader_local_memory
if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st1(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);


+ 32
- 47
src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp View File

@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -99,9 +84,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
afpvec8 b = buffer_ld8(bias_data, gz);
afpvec8 b = buffer_ld8(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -129,14 +114,14 @@ void main()
afpvec4 v2 = image3d_ld4(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec4 v3 = image3d_ld4(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gz));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gz));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gz));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gz));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gz));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gz));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gz));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gz));
afpvec4 k0 = image3d_ld4(weight_blob, ivec3(0, z, gy));
afpvec4 k1 = image3d_ld4(weight_blob, ivec3(1, z, gy));
afpvec4 k2 = image3d_ld4(weight_blob, ivec3(2, z, gy));
afpvec4 k3 = image3d_ld4(weight_blob, ivec3(3, z, gy));
afpvec4 k4 = image3d_ld4(weight_blob, ivec3(4, z, gy));
afpvec4 k5 = image3d_ld4(weight_blob, ivec3(5, z, gy));
afpvec4 k6 = image3d_ld4(weight_blob, ivec3(6, z, gy));
afpvec4 k7 = image3d_ld4(weight_blob, ivec3(7, z, gy));

// sum += v * k;
sum0[0].r += dot(v0, k0);
@@ -176,7 +161,7 @@ void main()
sum3[1].a += dot(v3, k7);
}
#else
int w_offset = gz * psc(c) * 8;
int w_offset = gy * psc(c) * 8;
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);


+ 32
- 47
src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp View File

@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;
#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec8 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -99,9 +84,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec8 b = image3d_ld8(bias_blob, ivec3(gz, 0, 0));
afpvec8 b = image3d_ld8(bias_blob, ivec3(gy, 0, 0));
#else
afpvec8 b = buffer_ld8(bias_data, gz);
afpvec8 b = buffer_ld8(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -129,14 +114,14 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gz));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gz));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gz));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gz));
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));
afpvec8 k4 = image3d_ld8(weight_blob, ivec3(4, z, gy));
afpvec8 k5 = image3d_ld8(weight_blob, ivec3(5, z, gy));
afpvec8 k6 = image3d_ld8(weight_blob, ivec3(6, z, gy));
afpvec8 k7 = image3d_ld8(weight_blob, ivec3(7, z, gy));

// sum += v * k
sum0[0].r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -176,7 +161,7 @@ void main()
sum3[1].a += dot(v3[0], k7[0]) + dot(v3[1], k7[1]);
}
#else
int w_offset = gz * psc(c) * 8;
int w_offset = gy * psc(c) * 8;
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -310,12 +295,12 @@ void main()
}

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st8(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st8(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st8(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st8(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st8(top_blob_data, gi + 1, sum1);


+ 27
- 42
src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp View File

@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfp bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -99,9 +84,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afp b = image3d_ld1(bias_blob, ivec3(gz, 0, 0));
afp b = image3d_ld1(bias_blob, ivec3(gy, 0, 0));
#else
afp b = buffer_ld1(bias_data, gz);
afp b = buffer_ld1(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -129,7 +114,7 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gz));
afpvec8 k = image3d_ld8(weight_blob, ivec3(0, z, gy));

// sum += dot(v, k);
sum0 += dot(v0[0], k[0]) + dot(v0[1], k[1]);
@@ -138,7 +123,7 @@ void main()
sum3 += dot(v3[0], k[0]) + dot(v3[1], k[1]);
}
#else
int w_offset = gz * psc(c);
int w_offset = gy * psc(c);
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -210,12 +195,12 @@ void main()
}

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st1(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st1(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st1(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st1(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st1(top_blob_data, gi + 1, sum1);


+ 30
- 45
src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp View File

@@ -22,29 +22,21 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
layout (constant_id = 3) const int dilation_h = 1;
layout (constant_id = 4) const int stride_w = 1;
layout (constant_id = 5) const int stride_h = 1;
layout (constant_id = 6) const int bias_term = 0;
layout (constant_id = 7) const int activation_type = 0;
layout (constant_id = 8) const float activation_param_0 = 0;
layout (constant_id = 9) const float activation_param_1 = 0;

#define shape_constant_id_offset 10
layout (constant_id = shape_constant_id_offset + 0) const int dims = 0;
layout (constant_id = shape_constant_id_offset + 1) const int w = 0;
layout (constant_id = shape_constant_id_offset + 2) const int h = 0;
layout (constant_id = shape_constant_id_offset + 3) const int c = 0;
layout (constant_id = shape_constant_id_offset + 4) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 5) const int outdims = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 8) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 9) const int outcstep = 0;
layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
layout (constant_id = 3) const float activation_param_1 = 0;

#define shape_constant_id_offset 4
layout (constant_id = shape_constant_id_offset + 0) const int w = 0;
layout (constant_id = shape_constant_id_offset + 1) const int h = 0;
layout (constant_id = shape_constant_id_offset + 2) const int c = 0;
layout (constant_id = shape_constant_id_offset + 3) const int cstep = 0;

layout (constant_id = shape_constant_id_offset + 4) const int outw = 0;
layout (constant_id = shape_constant_id_offset + 5) const int outh = 0;
layout (constant_id = shape_constant_id_offset + 6) const int outc = 0;
layout (constant_id = shape_constant_id_offset + 7) const int outcstep = 0;

#if NCNN_image_shader
layout (binding = 0) uniform unfp sampler3D bottom_blob;
@@ -60,13 +52,11 @@ layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };

layout (push_constant) uniform parameter
{
int dims;
int w;
int h;
int c;
int cstep;

int outdims;
int outw;
int outh;
int outc;
@@ -75,19 +65,14 @@ layout (push_constant) uniform parameter

void main()
{
#if NCNN_image_shader
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outw) * psc(outh) || gy >= 1 || gz >= psc(outc))
#if NCNN_image_shader
if (gx >= psc(outw) * psc(outh) || gy >= psc(outc))
return;
#else
int gx = int(gl_GlobalInvocationID.x) * 4;
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= psc(outcstep) || gy >= 1 || gz >= psc(outc))
if (gx >= psc(outcstep) || gy >= psc(outc))
return;
#endif

@@ -99,9 +84,9 @@ void main()
if (bias_term == 1)
{
#if NCNN_image_shader
afpvec4 b = image3d_ld4(bias_blob, ivec3(gz, 0, 0));
afpvec4 b = image3d_ld4(bias_blob, ivec3(gy, 0, 0));
#else
afpvec4 b = buffer_ld4(bias_data, gz);
afpvec4 b = buffer_ld4(bias_data, gy);
#endif
sum0 = b;
sum1 = b;
@@ -129,10 +114,10 @@ void main()
afpvec8 v2 = image3d_ld8(bottom_blob, ivec3(sx4.b, sy4.b, z));
afpvec8 v3 = image3d_ld8(bottom_blob, ivec3(sx4.a, sy4.a, z));

afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gz));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gz));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gz));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gz));
afpvec8 k0 = image3d_ld8(weight_blob, ivec3(0, z, gy));
afpvec8 k1 = image3d_ld8(weight_blob, ivec3(1, z, gy));
afpvec8 k2 = image3d_ld8(weight_blob, ivec3(2, z, gy));
afpvec8 k3 = image3d_ld8(weight_blob, ivec3(3, z, gy));

// sum += v * k;
sum0.r += dot(v0[0], k0[0]) + dot(v0[1], k0[1]);
@@ -156,7 +141,7 @@ void main()
sum3.a += dot(v3[0], k3[0]) + dot(v3[1], k3[1]);
}
#else
int w_offset = gz * psc(c) * 4;
int w_offset = gy * psc(c) * 4;
int v_offset = gx;

for (int z = 0; z < psc(c); z++)
@@ -246,12 +231,12 @@ void main()
}

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gz), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gz), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gz), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gz), sum3);
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);
image3d_st4(top_blob, ivec3(sx4.g, sy4.g, gy), sum1);
image3d_st4(top_blob, ivec3(sx4.b, sy4.b, gy), sum2);
image3d_st4(top_blob, ivec3(sx4.a, sy4.a, gy), sum3);
#else
int gi = gz * psc(outcstep) + gx;
int gi = gy * psc(outcstep) + gx;

buffer_st4(top_blob_data, gi + 0, sum0);
if (gx + 1 < psc(outcstep)) buffer_st4(top_blob_data, gi + 1, sum1);


Loading…
Cancel
Save