Browse Source

shader include vulkan activation, workaround for moltenvk tanh half4 issue (#3711)

tags/20220420
nihui GitHub 4 years ago
parent
commit
9826f3dbf8
No known key found for this signature in database. GPG Key ID: 4AEE18F83AFDEB23
100 changed files with 602 additions and 3985 deletions
  1. +3
    -1
      cmake/ncnn_generate_shader_comp_header.cmake
  2. +1
    -0
      src/CMakeLists.txt
  3. +35
    -5
      src/gpu.cpp
  4. +11
    -71
      src/layer/vulkan/shader/convolution.comp
  5. +4
    -30
      src/layer/vulkan/shader/convolution_1x1s1d1.comp
  6. +7
    -47
      src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output.comp
  7. +19
    -119
      src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output.comp
  8. +7
    -47
      src/layer/vulkan/shader/convolution_gemm.comp
  9. +11
    -71
      src/layer/vulkan/shader/convolution_pack1to4.comp
  10. +7
    -47
      src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp
  11. +7
    -47
      src/layer/vulkan/shader/convolution_pack1to4_gemm.comp
  12. +11
    -119
      src/layer/vulkan/shader/convolution_pack1to8.comp
  13. +7
    -71
      src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp
  14. +7
    -71
      src/layer/vulkan/shader/convolution_pack1to8_gemm.comp
  15. +11
    -71
      src/layer/vulkan/shader/convolution_pack4.comp
  16. +7
    -47
      src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp
  17. +7
    -47
      src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp
  18. +7
    -47
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp
  19. +19
    -119
      src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp
  20. +7
    -47
      src/layer/vulkan/shader/convolution_pack4_gemm.comp
  21. +7
    -47
      src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp
  22. +11
    -71
      src/layer/vulkan/shader/convolution_pack4to1.comp
  23. +7
    -47
      src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp
  24. +7
    -47
      src/layer/vulkan/shader/convolution_pack4to1_gemm.comp
  25. +11
    -119
      src/layer/vulkan/shader/convolution_pack4to8.comp
  26. +7
    -71
      src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp
  27. +7
    -71
      src/layer/vulkan/shader/convolution_pack4to8_gemm.comp
  28. +11
    -119
      src/layer/vulkan/shader/convolution_pack8.comp
  29. +7
    -71
      src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp
  30. +7
    -71
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp
  31. +19
    -215
      src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp
  32. +7
    -71
      src/layer/vulkan/shader/convolution_pack8_gemm.comp
  33. +11
    -71
      src/layer/vulkan/shader/convolution_pack8to1.comp
  34. +7
    -47
      src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp
  35. +7
    -47
      src/layer/vulkan/shader/convolution_pack8to1_gemm.comp
  36. +11
    -71
      src/layer/vulkan/shader/convolution_pack8to4.comp
  37. +7
    -47
      src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp
  38. +7
    -47
      src/layer/vulkan/shader/convolution_pack8to4_gemm.comp
  39. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise.comp
  40. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group.comp
  41. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp
  42. +4
    -35
      src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp
  43. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp
  44. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp
  45. +4
    -35
      src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp
  46. +4
    -35
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp
  47. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp
  48. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp
  49. +4
    -29
      src/layer/vulkan/shader/convolutiondepthwise_pack4.comp
  50. +4
    -35
      src/layer/vulkan/shader/convolutiondepthwise_pack8.comp
  51. +4
    -19
      src/layer/vulkan/shader/deconvolution.comp
  52. +4
    -19
      src/layer/vulkan/shader/deconvolution_col2im.comp
  53. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack1to4.comp
  54. +4
    -23
      src/layer/vulkan/shader/deconvolution_pack1to8.comp
  55. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack4.comp
  56. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack4_col2im.comp
  57. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack4to1.comp
  58. +4
    -23
      src/layer/vulkan/shader/deconvolution_pack4to8.comp
  59. +4
    -23
      src/layer/vulkan/shader/deconvolution_pack8.comp
  60. +4
    -23
      src/layer/vulkan/shader/deconvolution_pack8_col2im.comp
  61. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack8to1.comp
  62. +4
    -19
      src/layer/vulkan/shader/deconvolution_pack8to4.comp
  63. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise.comp
  64. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group.comp
  65. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp
  66. +4
    -23
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp
  67. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp
  68. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp
  69. +4
    -23
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp
  70. +4
    -23
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp
  71. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp
  72. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp
  73. +4
    -19
      src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp
  74. +4
    -23
      src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp
  75. +4
    -29
      src/layer/vulkan/shader/innerproduct.comp
  76. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm.comp
  77. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm_wp1to4.comp
  78. +4
    -35
      src/layer/vulkan/shader/innerproduct_gemm_wp1to8.comp
  79. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm_wp4.comp
  80. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm_wp4to1.comp
  81. +4
    -35
      src/layer/vulkan/shader/innerproduct_gemm_wp4to8.comp
  82. +4
    -35
      src/layer/vulkan/shader/innerproduct_gemm_wp8.comp
  83. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm_wp8to1.comp
  84. +4
    -29
      src/layer/vulkan/shader/innerproduct_gemm_wp8to4.comp
  85. +4
    -29
      src/layer/vulkan/shader/innerproduct_pack1to4.comp
  86. +4
    -35
      src/layer/vulkan/shader/innerproduct_pack1to8.comp
  87. +4
    -29
      src/layer/vulkan/shader/innerproduct_pack4.comp
  88. +4
    -29
      src/layer/vulkan/shader/innerproduct_pack4to1.comp
  89. +4
    -35
      src/layer/vulkan/shader/innerproduct_pack4to8.comp
  90. +4
    -35
      src/layer/vulkan/shader/innerproduct_pack8.comp
  91. +4
    -29
      src/layer/vulkan/shader/innerproduct_pack8to1.comp
  92. +4
    -29
      src/layer/vulkan/shader/innerproduct_pack8to4.comp
  93. +4
    -29
      src/layer/vulkan/shader/innerproduct_reduce_sum8.comp
  94. +4
    -29
      src/layer/vulkan/shader/innerproduct_reduce_sum8_pack4.comp
  95. +4
    -35
      src/layer/vulkan/shader/innerproduct_reduce_sum8_pack8.comp
  96. +4
    -0
      src/layer/vulkan/shader/mish.comp
  97. +4
    -0
      src/layer/vulkan/shader/mish_pack4.comp
  98. +5
    -0
      src/layer/vulkan/shader/mish_pack8.comp
  99. +4
    -0
      src/layer/vulkan/shader/tanh.comp
  100. +4
    -0
      src/layer/vulkan/shader/tanh_pack4.comp

+ 3
- 1
cmake/ncnn_generate_shader_comp_header.cmake View File

@@ -5,7 +5,9 @@ file(READ ${SHADER_SRC} comp_data)

# skip leading comment
string(FIND "${comp_data}" "#version" version_start)
string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data)
if(NOT ${version_start} EQUAL -1)
string(SUBSTRING "${comp_data}" ${version_start} -1 comp_data)
endif()

# remove whitespace
string(REGEX REPLACE "\n +" "\n" comp_data "${comp_data}")


+ 1
- 0
src/CMakeLists.txt View File

@@ -157,6 +157,7 @@ ncnn_add_layer(DeconvolutionDepthWise3D)

if(NCNN_VULKAN)
ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/convert_ycbcr.comp)
ncnn_add_shader(${CMAKE_CURRENT_SOURCE_DIR}/layer/vulkan/shader/vulkan_activation.comp)
endif()

add_custom_target(ncnn-generate-spirv DEPENDS ${NCNN_SHADER_SPV_HEX_FILES})


+ 35
- 5
src/gpu.cpp View File

@@ -23,6 +23,8 @@
#include "glslang/SPIRV/GlslangToSpv.h"
#include "glslang/glslang/Public/ShaderLang.h"

#include "vulkan_activation.comp.hex.h"

#include "command.h"
#include "layer.h"
#include "layer/vulkan/packing_vulkan.h"
@@ -1725,7 +1727,7 @@ int VulkanDevicePrivate::create_dummy_buffer_image()
dummy_buffer.create(1, 4u, dummy_allocator);
dummy_image.create(1, 4u, dummy_allocator);
#if __APPLE__
if (vkdev->info.vendor_id() != 0x8086)
if (vkdev->info.type() == 0)
dummy_image_readonly.create(1, 4u, dummy_allocator);
#else
dummy_image_readonly.create(1, 4u, dummy_allocator);
@@ -1736,7 +1738,7 @@ int VulkanDevicePrivate::create_dummy_buffer_image()
cmd.record_dummy(dummy_buffer);
cmd.record_dummy(dummy_image);
#if __APPLE__
if (vkdev->info.vendor_id() != 0x8086)
if (vkdev->info.type() == 0)
cmd.record_dummy_readonly(dummy_image_readonly);
#else
cmd.record_dummy_readonly(dummy_image_readonly);
@@ -1752,7 +1754,7 @@ void VulkanDevicePrivate::destroy_dummy_buffer_image()
dummy_buffer.release();
dummy_image.release();
#if __APPLE__
if (vkdev->info.vendor_id() != 0x8086)
if (vkdev->info.type() == 0)
dummy_image_readonly.release();
#else
dummy_image_readonly.release();
@@ -2818,7 +2820,7 @@ VkImageMat VulkanDevice::get_dummy_image() const
VkImageMat VulkanDevice::get_dummy_image_readonly() const
{
#if __APPLE__
if (info.vendor_id() == 0x8086)
if (info.type() != 0)
return d->dummy_image;
#endif
return d->dummy_image_readonly;
@@ -3245,6 +3247,28 @@ static TBuiltInResource get_default_TBuiltInResource()
return resource;
}

// glslang include handler for shader sources that use
// `#include "vulkan_activation.comp"`.
// Only that single header name is recognised; its contents are served from
// the vulkan_activation_comp_data array (presumably provided by
// vulkan_activation.comp.hex.h -- confirm against the generated header).
class VulkanShaderIncluder : public glslang::TShader::Includer
{
public:
    // Called for a quoted #include.
    // Returns a newly allocated IncludeResult wrapping the embedded header
    // data when headerName is exactly "vulkan_activation.comp", otherwise 0.
    // The includer name and inclusion depth are ignored.
    virtual glslang::TShader::Includer::IncludeResult* includeLocal(const char* headerName, const char* /*includerName*/, size_t /*inclusionDepth*/)
    {
        typedef glslang::TShader::Includer::IncludeResult IncludeResult;

        // any header other than the activation helpers is left unresolved
        if (strcmp(headerName, "vulkan_activation.comp") != 0)
            return 0;

        // the whole array is handed over, so the length is its full sizeof
        return new IncludeResult(headerName, vulkan_activation_comp_data, sizeof(vulkan_activation_comp_data), 0);
    }

    // Frees a result previously produced by includeLocal.
    virtual void releaseInclude(glslang::TShader::Includer::IncludeResult* r)
    {
        delete r;
    }
};

int compile_spirv_module(const char* comp_string, const Option& opt, std::vector<uint32_t>& spirv)
{
// -1 for omitting the tail '\0'
@@ -3699,6 +3723,10 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option
custom_defines.push_back(std::make_pair("NCNN_shader_local_memory", "1"));
}

#if __APPLE__
custom_defines.push_back(std::make_pair("NCNN_moltenvk", "1"));
#endif

std::string preamble;
std::vector<std::string> processes;

@@ -3740,7 +3768,9 @@ int compile_spirv_module(const char* comp_data, int comp_data_size, const Option

TBuiltInResource resources = get_default_TBuiltInResource();

bool pr = s.parse(&resources, 100, false, EShMsgDefault);
VulkanShaderIncluder includer;

bool pr = s.parse(&resources, 100, ENoProfile, false, false, EShMsgDefault, includer);
if (!pr)
{
NCNN_LOGE("compile spir-v module failed");


+ 11
- 71
src/layer/vulkan/shader/convolution.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -195,77 +198,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
sum4 = sum4 < afp(0.f) ? sum4 * slope : sum4;
sum5 = sum5 < afp(0.f) ? sum5 * slope : sum5;
sum6 = sum6 < afp(0.f) ? sum6 * slope : sum6;
sum7 = sum7 < afp(0.f) ? sum7 * slope : sum7;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afp(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afp(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afp(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afp(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 4
- 30
src/layer/vulkan/shader/convolution_1x1s1d1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -130,36 +133,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum.r);


+ 7
- 47
src/layer/vulkan/shader/convolution_3x3s1d1_winograd23_transform_output.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -149,53 +152,10 @@ void main()
v11 = m11 - m12 + m13;
}

if (activation_type == 1)
{
v00 = max(v00, afp(0.f));
v10 = max(v10, afp(0.f));
v01 = max(v01, afp(0.f));
v11 = max(v11, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00 = v00 < afp(0.f) ? v00 * slope : v00;
v10 = v10 < afp(0.f) ? v10 * slope : v10;
v01 = v01 < afp(0.f) ? v01 * slope : v01;
v11 = v11 < afp(0.f) ? v11 * slope : v11;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00 = clamp(v00, const_min, const_max);
v10 = clamp(v10, const_min, const_max);
v01 = clamp(v01, const_min, const_max);
v11 = clamp(v11, const_min, const_max);
}
if (activation_type == 4)
{
v00 = afp(1.f) / (afp(1.f) + exp(-v00));
v10 = afp(1.f) / (afp(1.f) + exp(-v10));
v01 = afp(1.f) / (afp(1.f) + exp(-v01));
v11 = afp(1.f) / (afp(1.f) + exp(-v11));
}
if (activation_type == 5)
{
v00 = v00 * tanh(log(exp(v00) + afp(1.f)));
v01 = v01 * tanh(log(exp(v01) + afp(1.f)));
v10 = v10 * tanh(log(exp(v10) + afp(1.f)));
v11 = v11 * tanh(log(exp(v11) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00 = v00 * clamp(v00 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01 = v01 * clamp(v01 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10 = v10 * clamp(v10 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11 = v11 * clamp(v11 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afp(v00, activation_type, activation_param_0, activation_param_1);
v10 = activation_afp(v10, activation_type, activation_param_0, activation_param_1);
v01 = activation_afp(v01, activation_type, activation_param_0, activation_param_1);
v11 = activation_afp(v11, activation_type, activation_param_0, activation_param_1);

// store 2x2
int x = gx * 2;


+ 19
- 119
src/layer/vulkan/shader/convolution_3x3s1d1_winograd43_transform_output.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -237,125 +240,22 @@ void main()
v33 = bias_value + v33;
}

if (activation_type == 1)
{
v00 = max(v00, afp(0.f));
v01 = max(v01, afp(0.f));
v02 = max(v02, afp(0.f));
v03 = max(v03, afp(0.f));
v10 = max(v10, afp(0.f));
v11 = max(v11, afp(0.f));
v12 = max(v12, afp(0.f));
v13 = max(v13, afp(0.f));
v20 = max(v20, afp(0.f));
v21 = max(v21, afp(0.f));
v22 = max(v22, afp(0.f));
v23 = max(v23, afp(0.f));
v30 = max(v30, afp(0.f));
v31 = max(v31, afp(0.f));
v32 = max(v32, afp(0.f));
v33 = max(v33, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00 = v00 < afp(0.f) ? v00 * slope : v00;
v01 = v01 < afp(0.f) ? v01 * slope : v01;
v02 = v02 < afp(0.f) ? v02 * slope : v02;
v03 = v03 < afp(0.f) ? v03 * slope : v03;
v10 = v10 < afp(0.f) ? v10 * slope : v10;
v11 = v11 < afp(0.f) ? v11 * slope : v11;
v12 = v12 < afp(0.f) ? v12 * slope : v12;
v13 = v13 < afp(0.f) ? v13 * slope : v13;
v20 = v20 < afp(0.f) ? v20 * slope : v20;
v21 = v21 < afp(0.f) ? v21 * slope : v21;
v22 = v22 < afp(0.f) ? v22 * slope : v22;
v23 = v23 < afp(0.f) ? v23 * slope : v23;
v30 = v30 < afp(0.f) ? v30 * slope : v30;
v31 = v31 < afp(0.f) ? v31 * slope : v31;
v32 = v32 < afp(0.f) ? v32 * slope : v32;
v33 = v33 < afp(0.f) ? v33 * slope : v33;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00 = clamp(v00, const_min, const_max);
v01 = clamp(v01, const_min, const_max);
v02 = clamp(v02, const_min, const_max);
v03 = clamp(v03, const_min, const_max);
v10 = clamp(v10, const_min, const_max);
v11 = clamp(v11, const_min, const_max);
v12 = clamp(v12, const_min, const_max);
v13 = clamp(v13, const_min, const_max);
v20 = clamp(v20, const_min, const_max);
v21 = clamp(v21, const_min, const_max);
v22 = clamp(v22, const_min, const_max);
v23 = clamp(v23, const_min, const_max);
v30 = clamp(v30, const_min, const_max);
v31 = clamp(v31, const_min, const_max);
v32 = clamp(v32, const_min, const_max);
v33 = clamp(v33, const_min, const_max);
}
if (activation_type == 4)
{
v00 = afp(1.f) / (afp(1.f) + exp(-v00));
v01 = afp(1.f) / (afp(1.f) + exp(-v01));
v02 = afp(1.f) / (afp(1.f) + exp(-v02));
v03 = afp(1.f) / (afp(1.f) + exp(-v03));
v10 = afp(1.f) / (afp(1.f) + exp(-v10));
v11 = afp(1.f) / (afp(1.f) + exp(-v11));
v12 = afp(1.f) / (afp(1.f) + exp(-v12));
v13 = afp(1.f) / (afp(1.f) + exp(-v13));
v20 = afp(1.f) / (afp(1.f) + exp(-v20));
v21 = afp(1.f) / (afp(1.f) + exp(-v21));
v22 = afp(1.f) / (afp(1.f) + exp(-v22));
v23 = afp(1.f) / (afp(1.f) + exp(-v23));
v30 = afp(1.f) / (afp(1.f) + exp(-v30));
v31 = afp(1.f) / (afp(1.f) + exp(-v31));
v32 = afp(1.f) / (afp(1.f) + exp(-v32));
v33 = afp(1.f) / (afp(1.f) + exp(-v33));
}
if (activation_type == 5)
{
v00 = v00 * tanh(log(exp(v00) + afp(1.f)));
v01 = v01 * tanh(log(exp(v01) + afp(1.f)));
v02 = v02 * tanh(log(exp(v02) + afp(1.f)));
v03 = v03 * tanh(log(exp(v03) + afp(1.f)));
v10 = v10 * tanh(log(exp(v10) + afp(1.f)));
v11 = v11 * tanh(log(exp(v11) + afp(1.f)));
v12 = v12 * tanh(log(exp(v12) + afp(1.f)));
v13 = v13 * tanh(log(exp(v13) + afp(1.f)));
v20 = v20 * tanh(log(exp(v20) + afp(1.f)));
v21 = v21 * tanh(log(exp(v21) + afp(1.f)));
v22 = v22 * tanh(log(exp(v22) + afp(1.f)));
v23 = v23 * tanh(log(exp(v23) + afp(1.f)));
v30 = v30 * tanh(log(exp(v30) + afp(1.f)));
v31 = v31 * tanh(log(exp(v31) + afp(1.f)));
v32 = v32 * tanh(log(exp(v32) + afp(1.f)));
v33 = v33 * tanh(log(exp(v33) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00 = v00 * clamp(v00 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01 = v01 * clamp(v01 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v02 = v02 * clamp(v02 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v03 = v03 * clamp(v03 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10 = v10 * clamp(v10 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11 = v11 * clamp(v11 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v12 = v12 * clamp(v12 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v13 = v13 * clamp(v13 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v20 = v20 * clamp(v20 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v21 = v21 * clamp(v21 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v22 = v22 * clamp(v22 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v23 = v23 * clamp(v23 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v30 = v30 * clamp(v30 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v31 = v31 * clamp(v31 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v32 = v32 * clamp(v32 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v33 = v33 * clamp(v33 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afp(v00, activation_type, activation_param_0, activation_param_1);
v01 = activation_afp(v01, activation_type, activation_param_0, activation_param_1);
v02 = activation_afp(v02, activation_type, activation_param_0, activation_param_1);
v03 = activation_afp(v03, activation_type, activation_param_0, activation_param_1);
v10 = activation_afp(v10, activation_type, activation_param_0, activation_param_1);
v11 = activation_afp(v11, activation_type, activation_param_0, activation_param_1);
v12 = activation_afp(v12, activation_type, activation_param_0, activation_param_1);
v13 = activation_afp(v13, activation_type, activation_param_0, activation_param_1);
v20 = activation_afp(v20, activation_type, activation_param_0, activation_param_1);
v21 = activation_afp(v21, activation_type, activation_param_0, activation_param_1);
v22 = activation_afp(v22, activation_type, activation_param_0, activation_param_1);
v23 = activation_afp(v23, activation_type, activation_param_0, activation_param_1);
v30 = activation_afp(v30, activation_type, activation_param_0, activation_param_1);
v31 = activation_afp(v31, activation_type, activation_param_0, activation_param_1);
v32 = activation_afp(v32, activation_type, activation_param_0, activation_param_1);
v33 = activation_afp(v33, activation_type, activation_param_0, activation_param_1);

// store 4x4
int x = gx * 4;


+ 7
- 47
src/layer/vulkan/shader/convolution_gemm.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
@@ -280,53 +283,10 @@ void main()
return;
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 71
src/layer/vulkan/shader/convolution_pack1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -195,77 +198,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
sum4 = mix(sum4, sum4 * afp(slope), lessThan(sum4, afpvec4(0.f)));
sum5 = mix(sum5, sum5 * afp(slope), lessThan(sum5, afpvec4(0.f)));
sum6 = mix(sum6, sum6 * afp(slope), lessThan(sum6, afpvec4(0.f)));
sum7 = mix(sum7, sum7 * afp(slope), lessThan(sum7, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec4(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec4(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec4(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec4(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack1to4_1x1s1d1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int bias_term = 0;
@@ -237,53 +240,10 @@ void main()
#endif
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack1to4_gemm.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
@@ -280,53 +283,10 @@ void main()
return;
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 119
src/layer/vulkan/shader/convolution_pack1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -214,125 +217,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
sum4[0] = max(sum4[0], afp(0.f));
sum4[1] = max(sum4[1], afp(0.f));
sum5[0] = max(sum5[0], afp(0.f));
sum5[1] = max(sum5[1], afp(0.f));
sum6[0] = max(sum6[0], afp(0.f));
sum6[1] = max(sum6[1], afp(0.f));
sum7[0] = max(sum7[0], afp(0.f));
sum7[1] = max(sum7[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
sum4[0] = mix(sum4[0], sum4[0] * afp(slope), lessThan(sum4[0], afpvec4(0.f)));
sum4[1] = mix(sum4[1], sum4[1] * afp(slope), lessThan(sum4[1], afpvec4(0.f)));
sum5[0] = mix(sum5[0], sum5[0] * afp(slope), lessThan(sum5[0], afpvec4(0.f)));
sum5[1] = mix(sum5[1], sum5[1] * afp(slope), lessThan(sum5[1], afpvec4(0.f)));
sum6[0] = mix(sum6[0], sum6[0] * afp(slope), lessThan(sum6[0], afpvec4(0.f)));
sum6[1] = mix(sum6[1], sum6[1] * afp(slope), lessThan(sum6[1], afpvec4(0.f)));
sum7[0] = mix(sum7[0], sum7[0] * afp(slope), lessThan(sum7[0], afpvec4(0.f)));
sum7[1] = mix(sum7[1], sum7[1] * afp(slope), lessThan(sum7[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
sum4[0] = clamp(sum4[0], const_min, const_max);
sum4[1] = clamp(sum4[1], const_min, const_max);
sum5[0] = clamp(sum5[0], const_min, const_max);
sum5[1] = clamp(sum5[1], const_min, const_max);
sum6[0] = clamp(sum6[0], const_min, const_max);
sum6[1] = clamp(sum6[1], const_min, const_max);
sum7[0] = clamp(sum7[0], const_min, const_max);
sum7[1] = clamp(sum7[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
sum4[0] = afp(1.f) / (afp(1.f) + exp(-sum4[0]));
sum4[1] = afp(1.f) / (afp(1.f) + exp(-sum4[1]));
sum5[0] = afp(1.f) / (afp(1.f) + exp(-sum5[0]));
sum5[1] = afp(1.f) / (afp(1.f) + exp(-sum5[1]));
sum6[0] = afp(1.f) / (afp(1.f) + exp(-sum6[0]));
sum6[1] = afp(1.f) / (afp(1.f) + exp(-sum6[1]));
sum7[0] = afp(1.f) / (afp(1.f) + exp(-sum7[0]));
sum7[1] = afp(1.f) / (afp(1.f) + exp(-sum7[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
sum4[0] = sum4[0] * tanh(log(exp(sum4[0]) + afp(1.f)));
sum4[1] = sum4[1] * tanh(log(exp(sum4[1]) + afp(1.f)));
sum5[0] = sum5[0] * tanh(log(exp(sum5[0]) + afp(1.f)));
sum5[1] = sum5[1] * tanh(log(exp(sum5[1]) + afp(1.f)));
sum6[0] = sum6[0] * tanh(log(exp(sum6[0]) + afp(1.f)));
sum6[1] = sum6[1] * tanh(log(exp(sum6[1]) + afp(1.f)));
sum7[0] = sum7[0] * tanh(log(exp(sum7[0]) + afp(1.f)));
sum7[1] = sum7[1] * tanh(log(exp(sum7[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[0] = sum4[0] * clamp(sum4[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[1] = sum4[1] * clamp(sum4[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[0] = sum5[0] * clamp(sum5[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[1] = sum5[1] * clamp(sum5[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[0] = sum6[0] * clamp(sum6[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[1] = sum6[1] * clamp(sum6[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[0] = sum7[0] * clamp(sum7[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[1] = sum7[1] * clamp(sum7[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec8(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec8(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec8(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec8(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack1to8_1x1s1d1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -160,77 +163,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack1to8_gemm.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -183,77 +186,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 71
src/layer/vulkan/shader/convolution_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -226,77 +229,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
sum4 = mix(sum4, sum4 * afp(slope), lessThan(sum4, afpvec4(0.f)));
sum5 = mix(sum5, sum5 * afp(slope), lessThan(sum5, afpvec4(0.f)));
sum6 = mix(sum6, sum6 * afp(slope), lessThan(sum6, afpvec4(0.f)));
sum7 = mix(sum7, sum7 * afp(slope), lessThan(sum7, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec4(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec4(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec4(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec4(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4_1x1s1d1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int bias_term = 0;
@@ -257,53 +260,10 @@ void main()
#endif
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4_1x1s1d1_cm_16_8_8.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#extension GL_KHR_memory_scope_semantics: require
#extension GL_EXT_shader_explicit_arithmetic_types: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
@@ -238,53 +241,10 @@ void main()
afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y));
afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y));

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16;
{


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd23_transform_output.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -149,53 +152,10 @@ void main()
v11 = m11 - m12 + m13;
}

if (activation_type == 1)
{
v00 = max(v00, afp(0.f));
v10 = max(v10, afp(0.f));
v01 = max(v01, afp(0.f));
v11 = max(v11, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f)));
v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f)));
v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f)));
v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00 = clamp(v00, const_min, const_max);
v10 = clamp(v10, const_min, const_max);
v01 = clamp(v01, const_min, const_max);
v11 = clamp(v11, const_min, const_max);
}
if (activation_type == 4)
{
v00 = afp(1.f) / (afp(1.f) + exp(-v00));
v10 = afp(1.f) / (afp(1.f) + exp(-v10));
v01 = afp(1.f) / (afp(1.f) + exp(-v01));
v11 = afp(1.f) / (afp(1.f) + exp(-v11));
}
if (activation_type == 5)
{
v00 = v00 * tanh(log(exp(v00) + afp(1.f)));
v01 = v01 * tanh(log(exp(v01) + afp(1.f)));
v10 = v10 * tanh(log(exp(v10) + afp(1.f)));
v11 = v11 * tanh(log(exp(v11) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00 = v00 * clamp(v00 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01 = v01 * clamp(v01 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10 = v10 * clamp(v10 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11 = v11 * clamp(v11 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afpvec4(v00, activation_type, activation_param_0, activation_param_1);
v01 = activation_afpvec4(v01, activation_type, activation_param_0, activation_param_1);
v10 = activation_afpvec4(v10, activation_type, activation_param_0, activation_param_1);
v11 = activation_afpvec4(v11, activation_type, activation_param_0, activation_param_1);

// store 2x2
int x = gx * 2;


+ 19
- 119
src/layer/vulkan/shader/convolution_pack4_3x3s1d1_winograd43_transform_output.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -237,125 +240,22 @@ void main()
v33 = bias_value + v33;
}

if (activation_type == 1)
{
v00 = max(v00, afp(0.f));
v01 = max(v01, afp(0.f));
v02 = max(v02, afp(0.f));
v03 = max(v03, afp(0.f));
v10 = max(v10, afp(0.f));
v11 = max(v11, afp(0.f));
v12 = max(v12, afp(0.f));
v13 = max(v13, afp(0.f));
v20 = max(v20, afp(0.f));
v21 = max(v21, afp(0.f));
v22 = max(v22, afp(0.f));
v23 = max(v23, afp(0.f));
v30 = max(v30, afp(0.f));
v31 = max(v31, afp(0.f));
v32 = max(v32, afp(0.f));
v33 = max(v33, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00 = mix(v00, v00 * afp(slope), lessThan(v00, afpvec4(0.f)));
v01 = mix(v01, v01 * afp(slope), lessThan(v01, afpvec4(0.f)));
v02 = mix(v02, v02 * afp(slope), lessThan(v02, afpvec4(0.f)));
v03 = mix(v03, v03 * afp(slope), lessThan(v03, afpvec4(0.f)));
v10 = mix(v10, v10 * afp(slope), lessThan(v10, afpvec4(0.f)));
v11 = mix(v11, v11 * afp(slope), lessThan(v11, afpvec4(0.f)));
v12 = mix(v12, v12 * afp(slope), lessThan(v12, afpvec4(0.f)));
v13 = mix(v13, v13 * afp(slope), lessThan(v13, afpvec4(0.f)));
v20 = mix(v20, v20 * afp(slope), lessThan(v20, afpvec4(0.f)));
v21 = mix(v21, v21 * afp(slope), lessThan(v21, afpvec4(0.f)));
v22 = mix(v22, v22 * afp(slope), lessThan(v22, afpvec4(0.f)));
v23 = mix(v23, v23 * afp(slope), lessThan(v23, afpvec4(0.f)));
v30 = mix(v30, v30 * afp(slope), lessThan(v30, afpvec4(0.f)));
v31 = mix(v31, v31 * afp(slope), lessThan(v31, afpvec4(0.f)));
v32 = mix(v32, v32 * afp(slope), lessThan(v32, afpvec4(0.f)));
v33 = mix(v33, v33 * afp(slope), lessThan(v33, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00 = clamp(v00, const_min, const_max);
v01 = clamp(v01, const_min, const_max);
v02 = clamp(v02, const_min, const_max);
v03 = clamp(v03, const_min, const_max);
v10 = clamp(v10, const_min, const_max);
v11 = clamp(v11, const_min, const_max);
v12 = clamp(v12, const_min, const_max);
v13 = clamp(v13, const_min, const_max);
v20 = clamp(v20, const_min, const_max);
v21 = clamp(v21, const_min, const_max);
v22 = clamp(v22, const_min, const_max);
v23 = clamp(v23, const_min, const_max);
v30 = clamp(v30, const_min, const_max);
v31 = clamp(v31, const_min, const_max);
v32 = clamp(v32, const_min, const_max);
v33 = clamp(v33, const_min, const_max);
}
if (activation_type == 4)
{
v00 = afp(1.f) / (afp(1.f) + exp(-v00));
v01 = afp(1.f) / (afp(1.f) + exp(-v01));
v02 = afp(1.f) / (afp(1.f) + exp(-v02));
v03 = afp(1.f) / (afp(1.f) + exp(-v03));
v10 = afp(1.f) / (afp(1.f) + exp(-v10));
v11 = afp(1.f) / (afp(1.f) + exp(-v11));
v12 = afp(1.f) / (afp(1.f) + exp(-v12));
v13 = afp(1.f) / (afp(1.f) + exp(-v13));
v20 = afp(1.f) / (afp(1.f) + exp(-v20));
v21 = afp(1.f) / (afp(1.f) + exp(-v21));
v22 = afp(1.f) / (afp(1.f) + exp(-v22));
v23 = afp(1.f) / (afp(1.f) + exp(-v23));
v30 = afp(1.f) / (afp(1.f) + exp(-v30));
v31 = afp(1.f) / (afp(1.f) + exp(-v31));
v32 = afp(1.f) / (afp(1.f) + exp(-v32));
v33 = afp(1.f) / (afp(1.f) + exp(-v33));
}
if (activation_type == 5)
{
v00 = v00 * tanh(log(exp(v00) + afp(1.f)));
v01 = v01 * tanh(log(exp(v01) + afp(1.f)));
v02 = v02 * tanh(log(exp(v02) + afp(1.f)));
v03 = v03 * tanh(log(exp(v03) + afp(1.f)));
v10 = v10 * tanh(log(exp(v10) + afp(1.f)));
v11 = v11 * tanh(log(exp(v11) + afp(1.f)));
v12 = v12 * tanh(log(exp(v12) + afp(1.f)));
v13 = v13 * tanh(log(exp(v13) + afp(1.f)));
v20 = v20 * tanh(log(exp(v20) + afp(1.f)));
v21 = v21 * tanh(log(exp(v21) + afp(1.f)));
v22 = v22 * tanh(log(exp(v22) + afp(1.f)));
v23 = v23 * tanh(log(exp(v23) + afp(1.f)));
v30 = v30 * tanh(log(exp(v30) + afp(1.f)));
v31 = v31 * tanh(log(exp(v31) + afp(1.f)));
v32 = v32 * tanh(log(exp(v32) + afp(1.f)));
v33 = v33 * tanh(log(exp(v33) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00 = v00 * clamp(v00 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01 = v01 * clamp(v01 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v02 = v02 * clamp(v02 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v03 = v03 * clamp(v03 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10 = v10 * clamp(v10 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11 = v11 * clamp(v11 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v12 = v12 * clamp(v12 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v13 = v13 * clamp(v13 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v20 = v20 * clamp(v20 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v21 = v21 * clamp(v21 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v22 = v22 * clamp(v22 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v23 = v23 * clamp(v23 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v30 = v30 * clamp(v30 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v31 = v31 * clamp(v31 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v32 = v32 * clamp(v32 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v33 = v33 * clamp(v33 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afpvec4(v00, activation_type, activation_param_0, activation_param_1);
v01 = activation_afpvec4(v01, activation_type, activation_param_0, activation_param_1);
v02 = activation_afpvec4(v02, activation_type, activation_param_0, activation_param_1);
v03 = activation_afpvec4(v03, activation_type, activation_param_0, activation_param_1);
v10 = activation_afpvec4(v10, activation_type, activation_param_0, activation_param_1);
v11 = activation_afpvec4(v11, activation_type, activation_param_0, activation_param_1);
v12 = activation_afpvec4(v12, activation_type, activation_param_0, activation_param_1);
v13 = activation_afpvec4(v13, activation_type, activation_param_0, activation_param_1);
v20 = activation_afpvec4(v20, activation_type, activation_param_0, activation_param_1);
v21 = activation_afpvec4(v21, activation_type, activation_param_0, activation_param_1);
v22 = activation_afpvec4(v22, activation_type, activation_param_0, activation_param_1);
v23 = activation_afpvec4(v23, activation_type, activation_param_0, activation_param_1);
v30 = activation_afpvec4(v30, activation_type, activation_param_0, activation_param_1);
v31 = activation_afpvec4(v31, activation_type, activation_param_0, activation_param_1);
v32 = activation_afpvec4(v32, activation_type, activation_param_0, activation_param_1);
v33 = activation_afpvec4(v33, activation_type, activation_param_0, activation_param_1);

// store 4x4
int x = gx * 4;


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4_gemm.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
@@ -300,53 +303,10 @@ void main()
return;
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4_gemm_cm_16_8_8.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#extension GL_KHR_memory_scope_semantics: require
#extension GL_EXT_shader_explicit_arithmetic_types: require
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
@@ -275,53 +278,10 @@ void main()
afpvec4 sum2 = afpvec4(unpackHalf2x16(sum2_u2.x), unpackHalf2x16(sum2_u2.y));
afpvec4 sum3 = afpvec4(unpackHalf2x16(sum3_u2.x), unpackHalf2x16(sum3_u2.y));

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

int gi = gy * psc(outcstep) + gx + lxd16 * psc(outcstep) + lxm16;
{


+ 11
- 71
src/layer/vulkan/shader/convolution_pack4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -195,77 +198,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
sum4 = sum4 < afp(0.f) ? sum4 * slope : sum4;
sum5 = sum5 < afp(0.f) ? sum5 * slope : sum5;
sum6 = sum6 < afp(0.f) ? sum6 * slope : sum6;
sum7 = sum7 < afp(0.f) ? sum7 * slope : sum7;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afp(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afp(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afp(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afp(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4to1_1x1s1d1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int bias_term = 0;
@@ -237,53 +240,10 @@ void main()
#endif
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack4to1_gemm.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

#define LOCAL_MEMORY_UNROLL_INCH 8

layout (constant_id = 0) const int kernel_w = 1;
@@ -280,53 +283,10 @@ void main()
return;
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 119
src/layer/vulkan/shader/convolution_pack4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -342,125 +345,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
sum4[0] = max(sum4[0], afp(0.f));
sum4[1] = max(sum4[1], afp(0.f));
sum5[0] = max(sum5[0], afp(0.f));
sum5[1] = max(sum5[1], afp(0.f));
sum6[0] = max(sum6[0], afp(0.f));
sum6[1] = max(sum6[1], afp(0.f));
sum7[0] = max(sum7[0], afp(0.f));
sum7[1] = max(sum7[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
sum4[0] = mix(sum4[0], sum4[0] * afp(slope), lessThan(sum4[0], afpvec4(0.f)));
sum4[1] = mix(sum4[1], sum4[1] * afp(slope), lessThan(sum4[1], afpvec4(0.f)));
sum5[0] = mix(sum5[0], sum5[0] * afp(slope), lessThan(sum5[0], afpvec4(0.f)));
sum5[1] = mix(sum5[1], sum5[1] * afp(slope), lessThan(sum5[1], afpvec4(0.f)));
sum6[0] = mix(sum6[0], sum6[0] * afp(slope), lessThan(sum6[0], afpvec4(0.f)));
sum6[1] = mix(sum6[1], sum6[1] * afp(slope), lessThan(sum6[1], afpvec4(0.f)));
sum7[0] = mix(sum7[0], sum7[0] * afp(slope), lessThan(sum7[0], afpvec4(0.f)));
sum7[1] = mix(sum7[1], sum7[1] * afp(slope), lessThan(sum7[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
sum4[0] = clamp(sum4[0], const_min, const_max);
sum4[1] = clamp(sum4[1], const_min, const_max);
sum5[0] = clamp(sum5[0], const_min, const_max);
sum5[1] = clamp(sum5[1], const_min, const_max);
sum6[0] = clamp(sum6[0], const_min, const_max);
sum6[1] = clamp(sum6[1], const_min, const_max);
sum7[0] = clamp(sum7[0], const_min, const_max);
sum7[1] = clamp(sum7[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
sum4[0] = afp(1.f) / (afp(1.f) + exp(-sum4[0]));
sum4[1] = afp(1.f) / (afp(1.f) + exp(-sum4[1]));
sum5[0] = afp(1.f) / (afp(1.f) + exp(-sum5[0]));
sum5[1] = afp(1.f) / (afp(1.f) + exp(-sum5[1]));
sum6[0] = afp(1.f) / (afp(1.f) + exp(-sum6[0]));
sum6[1] = afp(1.f) / (afp(1.f) + exp(-sum6[1]));
sum7[0] = afp(1.f) / (afp(1.f) + exp(-sum7[0]));
sum7[1] = afp(1.f) / (afp(1.f) + exp(-sum7[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
sum4[0] = sum4[0] * tanh(log(exp(sum4[0]) + afp(1.f)));
sum4[1] = sum4[1] * tanh(log(exp(sum4[1]) + afp(1.f)));
sum5[0] = sum5[0] * tanh(log(exp(sum5[0]) + afp(1.f)));
sum5[1] = sum5[1] * tanh(log(exp(sum5[1]) + afp(1.f)));
sum6[0] = sum6[0] * tanh(log(exp(sum6[0]) + afp(1.f)));
sum6[1] = sum6[1] * tanh(log(exp(sum6[1]) + afp(1.f)));
sum7[0] = sum7[0] * tanh(log(exp(sum7[0]) + afp(1.f)));
sum7[1] = sum7[1] * tanh(log(exp(sum7[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[0] = sum4[0] * clamp(sum4[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[1] = sum4[1] * clamp(sum4[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[0] = sum5[0] * clamp(sum5[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[1] = sum5[1] * clamp(sum5[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[0] = sum6[0] * clamp(sum6[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[1] = sum6[1] * clamp(sum6[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[0] = sum7[0] * clamp(sum7[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[1] = sum7[1] * clamp(sum7[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec8(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec8(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec8(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec8(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack4to8_1x1s1d1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -222,77 +225,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack4to8_gemm.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -245,77 +248,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 119
src/layer/vulkan/shader/convolution_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -342,125 +345,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
sum4[0] = max(sum4[0], afp(0.f));
sum4[1] = max(sum4[1], afp(0.f));
sum5[0] = max(sum5[0], afp(0.f));
sum5[1] = max(sum5[1], afp(0.f));
sum6[0] = max(sum6[0], afp(0.f));
sum6[1] = max(sum6[1], afp(0.f));
sum7[0] = max(sum7[0], afp(0.f));
sum7[1] = max(sum7[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
sum4[0] = mix(sum4[0], sum4[0] * afp(slope), lessThan(sum4[0], afpvec4(0.f)));
sum4[1] = mix(sum4[1], sum4[1] * afp(slope), lessThan(sum4[1], afpvec4(0.f)));
sum5[0] = mix(sum5[0], sum5[0] * afp(slope), lessThan(sum5[0], afpvec4(0.f)));
sum5[1] = mix(sum5[1], sum5[1] * afp(slope), lessThan(sum5[1], afpvec4(0.f)));
sum6[0] = mix(sum6[0], sum6[0] * afp(slope), lessThan(sum6[0], afpvec4(0.f)));
sum6[1] = mix(sum6[1], sum6[1] * afp(slope), lessThan(sum6[1], afpvec4(0.f)));
sum7[0] = mix(sum7[0], sum7[0] * afp(slope), lessThan(sum7[0], afpvec4(0.f)));
sum7[1] = mix(sum7[1], sum7[1] * afp(slope), lessThan(sum7[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
sum4[0] = clamp(sum4[0], const_min, const_max);
sum4[1] = clamp(sum4[1], const_min, const_max);
sum5[0] = clamp(sum5[0], const_min, const_max);
sum5[1] = clamp(sum5[1], const_min, const_max);
sum6[0] = clamp(sum6[0], const_min, const_max);
sum6[1] = clamp(sum6[1], const_min, const_max);
sum7[0] = clamp(sum7[0], const_min, const_max);
sum7[1] = clamp(sum7[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
sum4[0] = afp(1.f) / (afp(1.f) + exp(-sum4[0]));
sum4[1] = afp(1.f) / (afp(1.f) + exp(-sum4[1]));
sum5[0] = afp(1.f) / (afp(1.f) + exp(-sum5[0]));
sum5[1] = afp(1.f) / (afp(1.f) + exp(-sum5[1]));
sum6[0] = afp(1.f) / (afp(1.f) + exp(-sum6[0]));
sum6[1] = afp(1.f) / (afp(1.f) + exp(-sum6[1]));
sum7[0] = afp(1.f) / (afp(1.f) + exp(-sum7[0]));
sum7[1] = afp(1.f) / (afp(1.f) + exp(-sum7[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
sum4[0] = sum4[0] * tanh(log(exp(sum4[0]) + afp(1.f)));
sum4[1] = sum4[1] * tanh(log(exp(sum4[1]) + afp(1.f)));
sum5[0] = sum5[0] * tanh(log(exp(sum5[0]) + afp(1.f)));
sum5[1] = sum5[1] * tanh(log(exp(sum5[1]) + afp(1.f)));
sum6[0] = sum6[0] * tanh(log(exp(sum6[0]) + afp(1.f)));
sum6[1] = sum6[1] * tanh(log(exp(sum6[1]) + afp(1.f)));
sum7[0] = sum7[0] * tanh(log(exp(sum7[0]) + afp(1.f)));
sum7[1] = sum7[1] * tanh(log(exp(sum7[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[0] = sum4[0] * clamp(sum4[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4[1] = sum4[1] * clamp(sum4[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[0] = sum5[0] * clamp(sum5[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5[1] = sum5[1] * clamp(sum5[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[0] = sum6[0] * clamp(sum6[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6[1] = sum6[1] * clamp(sum6[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[0] = sum7[0] * clamp(sum7[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7[1] = sum7[1] * clamp(sum7[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec8(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec8(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec8(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec8(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack8_1x1s1d1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -222,77 +225,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 71
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd23_transform_output.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -150,77 +153,10 @@ void main()
v11 = m11 - m12 + m13;
}

if (activation_type == 1)
{
v00[0] = max(v00[0], afp(0.f));
v00[1] = max(v00[1], afp(0.f));
v10[0] = max(v10[0], afp(0.f));
v10[1] = max(v10[1], afp(0.f));
v01[0] = max(v01[0], afp(0.f));
v01[1] = max(v01[1], afp(0.f));
v11[0] = max(v11[0], afp(0.f));
v11[1] = max(v11[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00[0] = mix(v00[0], v00[0] * afp(slope), lessThan(v00[0], afpvec4(0.f)));
v00[1] = mix(v00[1], v00[1] * afp(slope), lessThan(v00[1], afpvec4(0.f)));
v10[0] = mix(v10[0], v10[0] * afp(slope), lessThan(v10[0], afpvec4(0.f)));
v10[1] = mix(v10[1], v10[1] * afp(slope), lessThan(v10[1], afpvec4(0.f)));
v01[0] = mix(v01[0], v01[0] * afp(slope), lessThan(v01[0], afpvec4(0.f)));
v01[1] = mix(v01[1], v01[1] * afp(slope), lessThan(v01[1], afpvec4(0.f)));
v11[0] = mix(v11[0], v11[0] * afp(slope), lessThan(v11[0], afpvec4(0.f)));
v11[1] = mix(v11[1], v11[1] * afp(slope), lessThan(v11[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00[0] = clamp(v00[0], const_min, const_max);
v00[1] = clamp(v00[1], const_min, const_max);
v10[0] = clamp(v10[0], const_min, const_max);
v10[1] = clamp(v10[1], const_min, const_max);
v01[0] = clamp(v01[0], const_min, const_max);
v01[1] = clamp(v01[1], const_min, const_max);
v11[0] = clamp(v11[0], const_min, const_max);
v11[1] = clamp(v11[1], const_min, const_max);
}
if (activation_type == 4)
{
v00[0] = afp(1.f) / (afp(1.f) + exp(-v00[0]));
v00[1] = afp(1.f) / (afp(1.f) + exp(-v00[1]));
v10[0] = afp(1.f) / (afp(1.f) + exp(-v10[0]));
v10[1] = afp(1.f) / (afp(1.f) + exp(-v10[1]));
v01[0] = afp(1.f) / (afp(1.f) + exp(-v01[0]));
v01[1] = afp(1.f) / (afp(1.f) + exp(-v01[1]));
v11[0] = afp(1.f) / (afp(1.f) + exp(-v11[0]));
v11[1] = afp(1.f) / (afp(1.f) + exp(-v11[1]));
}
if (activation_type == 5)
{
v00[0] = v00[0] * tanh(log(exp(v00[0]) + afp(1.f)));
v00[1] = v00[1] * tanh(log(exp(v00[1]) + afp(1.f)));
v10[0] = v10[0] * tanh(log(exp(v10[0]) + afp(1.f)));
v10[1] = v10[1] * tanh(log(exp(v10[1]) + afp(1.f)));
v01[0] = v01[0] * tanh(log(exp(v01[0]) + afp(1.f)));
v01[1] = v01[1] * tanh(log(exp(v01[1]) + afp(1.f)));
v11[0] = v11[0] * tanh(log(exp(v11[0]) + afp(1.f)));
v11[1] = v11[1] * tanh(log(exp(v11[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00[0] = v00[0] * clamp(v00[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v00[1] = v00[1] * clamp(v00[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01[0] = v01[0] * clamp(v01[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01[1] = v01[1] * clamp(v01[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10[0] = v10[0] * clamp(v10[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10[1] = v10[1] * clamp(v10[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11[0] = v11[0] * clamp(v11[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11[1] = v11[1] * clamp(v11[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afpvec8(v00, activation_type, activation_param_0, activation_param_1);
v01 = activation_afpvec8(v01, activation_type, activation_param_0, activation_param_1);
v10 = activation_afpvec8(v10, activation_type, activation_param_0, activation_param_1);
v11 = activation_afpvec8(v11, activation_type, activation_param_0, activation_param_1);

// store 2x2
int x = gx * 2;


+ 19
- 215
src/layer/vulkan/shader/convolution_pack8_3x3s1d1_winograd43_transform_output.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -238,221 +241,22 @@ void main()
v33 = bias_value + v33;
}

if (activation_type == 1)
{
v00[0] = max(v00[0], afp(0.f));
v00[1] = max(v00[1], afp(0.f));
v01[0] = max(v01[0], afp(0.f));
v01[1] = max(v01[1], afp(0.f));
v02[0] = max(v02[0], afp(0.f));
v02[1] = max(v02[1], afp(0.f));
v03[0] = max(v03[0], afp(0.f));
v03[1] = max(v03[1], afp(0.f));
v10[0] = max(v10[0], afp(0.f));
v10[1] = max(v10[1], afp(0.f));
v11[0] = max(v11[0], afp(0.f));
v11[1] = max(v11[1], afp(0.f));
v12[0] = max(v12[0], afp(0.f));
v12[1] = max(v12[1], afp(0.f));
v13[0] = max(v13[0], afp(0.f));
v13[1] = max(v13[1], afp(0.f));
v20[0] = max(v20[0], afp(0.f));
v20[1] = max(v20[1], afp(0.f));
v21[0] = max(v21[0], afp(0.f));
v21[1] = max(v21[1], afp(0.f));
v22[0] = max(v22[0], afp(0.f));
v22[1] = max(v22[1], afp(0.f));
v23[0] = max(v23[0], afp(0.f));
v23[1] = max(v23[1], afp(0.f));
v30[0] = max(v30[0], afp(0.f));
v30[1] = max(v30[1], afp(0.f));
v31[0] = max(v31[0], afp(0.f));
v31[1] = max(v31[1], afp(0.f));
v32[0] = max(v32[0], afp(0.f));
v32[1] = max(v32[1], afp(0.f));
v33[0] = max(v33[0], afp(0.f));
v33[1] = max(v33[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
v00[0] = mix(v00[0], v00[0] * afp(slope), lessThan(v00[0], afpvec4(0.f)));
v00[1] = mix(v00[1], v00[1] * afp(slope), lessThan(v00[1], afpvec4(0.f)));
v01[0] = mix(v01[0], v01[0] * afp(slope), lessThan(v01[0], afpvec4(0.f)));
v01[1] = mix(v01[1], v01[1] * afp(slope), lessThan(v01[1], afpvec4(0.f)));
v02[0] = mix(v02[0], v02[0] * afp(slope), lessThan(v02[0], afpvec4(0.f)));
v02[1] = mix(v02[1], v02[1] * afp(slope), lessThan(v02[1], afpvec4(0.f)));
v03[0] = mix(v03[0], v03[0] * afp(slope), lessThan(v03[0], afpvec4(0.f)));
v03[1] = mix(v03[1], v03[1] * afp(slope), lessThan(v03[1], afpvec4(0.f)));
v10[0] = mix(v10[0], v10[0] * afp(slope), lessThan(v10[0], afpvec4(0.f)));
v10[1] = mix(v10[1], v10[1] * afp(slope), lessThan(v10[1], afpvec4(0.f)));
v11[0] = mix(v11[0], v11[0] * afp(slope), lessThan(v11[0], afpvec4(0.f)));
v11[1] = mix(v11[1], v11[1] * afp(slope), lessThan(v11[1], afpvec4(0.f)));
v12[0] = mix(v12[0], v12[0] * afp(slope), lessThan(v12[0], afpvec4(0.f)));
v12[1] = mix(v12[1], v12[1] * afp(slope), lessThan(v12[1], afpvec4(0.f)));
v13[0] = mix(v13[0], v13[0] * afp(slope), lessThan(v13[0], afpvec4(0.f)));
v13[1] = mix(v13[1], v13[1] * afp(slope), lessThan(v13[1], afpvec4(0.f)));
v20[0] = mix(v20[0], v20[0] * afp(slope), lessThan(v20[0], afpvec4(0.f)));
v20[1] = mix(v20[1], v20[1] * afp(slope), lessThan(v20[1], afpvec4(0.f)));
v21[0] = mix(v21[0], v21[0] * afp(slope), lessThan(v21[0], afpvec4(0.f)));
v21[1] = mix(v21[1], v21[1] * afp(slope), lessThan(v21[1], afpvec4(0.f)));
v22[0] = mix(v22[0], v22[0] * afp(slope), lessThan(v22[0], afpvec4(0.f)));
v22[1] = mix(v22[1], v22[1] * afp(slope), lessThan(v22[1], afpvec4(0.f)));
v23[0] = mix(v23[0], v23[0] * afp(slope), lessThan(v23[0], afpvec4(0.f)));
v23[1] = mix(v23[1], v23[1] * afp(slope), lessThan(v23[1], afpvec4(0.f)));
v30[0] = mix(v30[0], v30[0] * afp(slope), lessThan(v30[0], afpvec4(0.f)));
v30[1] = mix(v30[1], v30[1] * afp(slope), lessThan(v30[1], afpvec4(0.f)));
v31[0] = mix(v31[0], v31[0] * afp(slope), lessThan(v31[0], afpvec4(0.f)));
v31[1] = mix(v31[1], v31[1] * afp(slope), lessThan(v31[1], afpvec4(0.f)));
v32[0] = mix(v32[0], v32[0] * afp(slope), lessThan(v32[0], afpvec4(0.f)));
v32[1] = mix(v32[1], v32[1] * afp(slope), lessThan(v32[1], afpvec4(0.f)));
v33[0] = mix(v33[0], v33[0] * afp(slope), lessThan(v33[0], afpvec4(0.f)));
v33[1] = mix(v33[1], v33[1] * afp(slope), lessThan(v33[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
v00[0] = clamp(v00[0], const_min, const_max);
v00[1] = clamp(v00[1], const_min, const_max);
v01[0] = clamp(v01[0], const_min, const_max);
v01[1] = clamp(v01[1], const_min, const_max);
v02[0] = clamp(v02[0], const_min, const_max);
v02[1] = clamp(v02[1], const_min, const_max);
v03[0] = clamp(v03[0], const_min, const_max);
v03[1] = clamp(v03[1], const_min, const_max);
v10[0] = clamp(v10[0], const_min, const_max);
v10[1] = clamp(v10[1], const_min, const_max);
v11[0] = clamp(v11[0], const_min, const_max);
v11[1] = clamp(v11[1], const_min, const_max);
v12[0] = clamp(v12[0], const_min, const_max);
v12[1] = clamp(v12[1], const_min, const_max);
v13[0] = clamp(v13[0], const_min, const_max);
v13[1] = clamp(v13[1], const_min, const_max);
v20[0] = clamp(v20[0], const_min, const_max);
v20[1] = clamp(v20[1], const_min, const_max);
v21[0] = clamp(v21[0], const_min, const_max);
v21[1] = clamp(v21[1], const_min, const_max);
v22[0] = clamp(v22[0], const_min, const_max);
v22[1] = clamp(v22[1], const_min, const_max);
v23[0] = clamp(v23[0], const_min, const_max);
v23[1] = clamp(v23[1], const_min, const_max);
v30[0] = clamp(v30[0], const_min, const_max);
v30[1] = clamp(v30[1], const_min, const_max);
v31[0] = clamp(v31[0], const_min, const_max);
v31[1] = clamp(v31[1], const_min, const_max);
v32[0] = clamp(v32[0], const_min, const_max);
v32[1] = clamp(v32[1], const_min, const_max);
v33[0] = clamp(v33[0], const_min, const_max);
v33[1] = clamp(v33[1], const_min, const_max);
}
if (activation_type == 4)
{
v00[0] = afp(1.f) / (afp(1.f) + exp(-v00[0]));
v00[1] = afp(1.f) / (afp(1.f) + exp(-v00[1]));
v01[0] = afp(1.f) / (afp(1.f) + exp(-v01[0]));
v01[1] = afp(1.f) / (afp(1.f) + exp(-v01[1]));
v02[0] = afp(1.f) / (afp(1.f) + exp(-v02[0]));
v02[1] = afp(1.f) / (afp(1.f) + exp(-v02[1]));
v03[0] = afp(1.f) / (afp(1.f) + exp(-v03[0]));
v03[1] = afp(1.f) / (afp(1.f) + exp(-v03[1]));
v10[0] = afp(1.f) / (afp(1.f) + exp(-v10[0]));
v10[1] = afp(1.f) / (afp(1.f) + exp(-v10[1]));
v11[0] = afp(1.f) / (afp(1.f) + exp(-v11[0]));
v11[1] = afp(1.f) / (afp(1.f) + exp(-v11[1]));
v12[0] = afp(1.f) / (afp(1.f) + exp(-v12[0]));
v12[1] = afp(1.f) / (afp(1.f) + exp(-v12[1]));
v13[0] = afp(1.f) / (afp(1.f) + exp(-v13[0]));
v13[1] = afp(1.f) / (afp(1.f) + exp(-v13[1]));
v20[0] = afp(1.f) / (afp(1.f) + exp(-v20[0]));
v20[1] = afp(1.f) / (afp(1.f) + exp(-v20[1]));
v21[0] = afp(1.f) / (afp(1.f) + exp(-v21[0]));
v21[1] = afp(1.f) / (afp(1.f) + exp(-v21[1]));
v22[0] = afp(1.f) / (afp(1.f) + exp(-v22[0]));
v22[1] = afp(1.f) / (afp(1.f) + exp(-v22[1]));
v23[0] = afp(1.f) / (afp(1.f) + exp(-v23[0]));
v23[1] = afp(1.f) / (afp(1.f) + exp(-v23[1]));
v30[0] = afp(1.f) / (afp(1.f) + exp(-v30[0]));
v30[1] = afp(1.f) / (afp(1.f) + exp(-v30[1]));
v31[0] = afp(1.f) / (afp(1.f) + exp(-v31[0]));
v31[1] = afp(1.f) / (afp(1.f) + exp(-v31[1]));
v32[0] = afp(1.f) / (afp(1.f) + exp(-v32[0]));
v32[1] = afp(1.f) / (afp(1.f) + exp(-v32[1]));
v33[0] = afp(1.f) / (afp(1.f) + exp(-v33[0]));
v33[1] = afp(1.f) / (afp(1.f) + exp(-v33[1]));
}
if (activation_type == 5)
{
v00[0] = v00[0] * tanh(log(exp(v00[0]) + afp(1.f)));
v00[1] = v00[1] * tanh(log(exp(v00[1]) + afp(1.f)));
v01[0] = v01[0] * tanh(log(exp(v01[0]) + afp(1.f)));
v01[1] = v01[1] * tanh(log(exp(v01[1]) + afp(1.f)));
v02[0] = v02[0] * tanh(log(exp(v02[0]) + afp(1.f)));
v02[1] = v02[1] * tanh(log(exp(v02[1]) + afp(1.f)));
v03[0] = v03[0] * tanh(log(exp(v03[0]) + afp(1.f)));
v03[1] = v03[1] * tanh(log(exp(v03[1]) + afp(1.f)));
v10[0] = v10[0] * tanh(log(exp(v10[0]) + afp(1.f)));
v10[1] = v10[1] * tanh(log(exp(v10[1]) + afp(1.f)));
v11[0] = v11[0] * tanh(log(exp(v11[0]) + afp(1.f)));
v11[1] = v11[1] * tanh(log(exp(v11[1]) + afp(1.f)));
v12[0] = v12[0] * tanh(log(exp(v12[0]) + afp(1.f)));
v12[1] = v12[1] * tanh(log(exp(v12[1]) + afp(1.f)));
v13[0] = v13[0] * tanh(log(exp(v13[0]) + afp(1.f)));
v13[1] = v13[1] * tanh(log(exp(v13[1]) + afp(1.f)));
v20[0] = v20[0] * tanh(log(exp(v20[0]) + afp(1.f)));
v20[1] = v20[1] * tanh(log(exp(v20[1]) + afp(1.f)));
v21[0] = v21[0] * tanh(log(exp(v21[0]) + afp(1.f)));
v21[1] = v21[1] * tanh(log(exp(v21[1]) + afp(1.f)));
v22[0] = v22[0] * tanh(log(exp(v22[0]) + afp(1.f)));
v22[1] = v22[1] * tanh(log(exp(v22[1]) + afp(1.f)));
v23[0] = v23[0] * tanh(log(exp(v23[0]) + afp(1.f)));
v23[1] = v23[1] * tanh(log(exp(v23[1]) + afp(1.f)));
v30[0] = v30[0] * tanh(log(exp(v30[0]) + afp(1.f)));
v30[1] = v30[1] * tanh(log(exp(v30[1]) + afp(1.f)));
v31[0] = v31[0] * tanh(log(exp(v31[0]) + afp(1.f)));
v31[1] = v31[1] * tanh(log(exp(v31[1]) + afp(1.f)));
v32[0] = v32[0] * tanh(log(exp(v32[0]) + afp(1.f)));
v32[1] = v32[1] * tanh(log(exp(v32[1]) + afp(1.f)));
v33[0] = v33[0] * tanh(log(exp(v33[0]) + afp(1.f)));
v33[1] = v33[1] * tanh(log(exp(v33[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
v00[0] = v00[0] * clamp(v00[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v00[1] = v00[1] * clamp(v00[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01[0] = v01[0] * clamp(v01[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v01[1] = v01[1] * clamp(v01[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v02[0] = v02[0] * clamp(v02[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v02[1] = v02[1] * clamp(v02[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v03[0] = v03[0] * clamp(v03[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v03[1] = v03[1] * clamp(v03[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10[0] = v10[0] * clamp(v10[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v10[1] = v10[1] * clamp(v10[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11[0] = v11[0] * clamp(v11[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v11[1] = v11[1] * clamp(v11[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v12[0] = v12[0] * clamp(v12[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v12[1] = v12[1] * clamp(v12[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v13[0] = v13[0] * clamp(v13[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v13[1] = v13[1] * clamp(v13[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v20[0] = v20[0] * clamp(v20[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v20[1] = v20[1] * clamp(v20[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v21[0] = v21[0] * clamp(v21[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v21[1] = v21[1] * clamp(v21[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v22[0] = v22[0] * clamp(v22[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v22[1] = v22[1] * clamp(v22[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v23[0] = v23[0] * clamp(v23[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v23[1] = v23[1] * clamp(v23[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v30[0] = v30[0] * clamp(v30[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v30[1] = v30[1] * clamp(v30[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v31[0] = v31[0] * clamp(v31[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v31[1] = v31[1] * clamp(v31[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v32[0] = v32[0] * clamp(v32[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v32[1] = v32[1] * clamp(v32[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v33[0] = v33[0] * clamp(v33[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
v33[1] = v33[1] * clamp(v33[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
v00 = activation_afpvec8(v00, activation_type, activation_param_0, activation_param_1);
v01 = activation_afpvec8(v01, activation_type, activation_param_0, activation_param_1);
v02 = activation_afpvec8(v02, activation_type, activation_param_0, activation_param_1);
v03 = activation_afpvec8(v03, activation_type, activation_param_0, activation_param_1);
v10 = activation_afpvec8(v10, activation_type, activation_param_0, activation_param_1);
v11 = activation_afpvec8(v11, activation_type, activation_param_0, activation_param_1);
v12 = activation_afpvec8(v12, activation_type, activation_param_0, activation_param_1);
v13 = activation_afpvec8(v13, activation_type, activation_param_0, activation_param_1);
v20 = activation_afpvec8(v20, activation_type, activation_param_0, activation_param_1);
v21 = activation_afpvec8(v21, activation_type, activation_param_0, activation_param_1);
v22 = activation_afpvec8(v22, activation_type, activation_param_0, activation_param_1);
v23 = activation_afpvec8(v23, activation_type, activation_param_0, activation_param_1);
v30 = activation_afpvec8(v30, activation_type, activation_param_0, activation_param_1);
v31 = activation_afpvec8(v31, activation_type, activation_param_0, activation_param_1);
v32 = activation_afpvec8(v32, activation_type, activation_param_0, activation_param_1);
v33 = activation_afpvec8(v33, activation_type, activation_param_0, activation_param_1);

// store 4x4
int x = gx * 4;


+ 7
- 71
src/layer/vulkan/shader/convolution_pack8_gemm.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -245,77 +248,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0[0] = max(sum0[0], afp(0.f));
sum0[1] = max(sum0[1], afp(0.f));
sum1[0] = max(sum1[0], afp(0.f));
sum1[1] = max(sum1[1], afp(0.f));
sum2[0] = max(sum2[0], afp(0.f));
sum2[1] = max(sum2[1], afp(0.f));
sum3[0] = max(sum3[0], afp(0.f));
sum3[1] = max(sum3[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0[0] = mix(sum0[0], sum0[0] * afp(slope), lessThan(sum0[0], afpvec4(0.f)));
sum0[1] = mix(sum0[1], sum0[1] * afp(slope), lessThan(sum0[1], afpvec4(0.f)));
sum1[0] = mix(sum1[0], sum1[0] * afp(slope), lessThan(sum1[0], afpvec4(0.f)));
sum1[1] = mix(sum1[1], sum1[1] * afp(slope), lessThan(sum1[1], afpvec4(0.f)));
sum2[0] = mix(sum2[0], sum2[0] * afp(slope), lessThan(sum2[0], afpvec4(0.f)));
sum2[1] = mix(sum2[1], sum2[1] * afp(slope), lessThan(sum2[1], afpvec4(0.f)));
sum3[0] = mix(sum3[0], sum3[0] * afp(slope), lessThan(sum3[0], afpvec4(0.f)));
sum3[1] = mix(sum3[1], sum3[1] * afp(slope), lessThan(sum3[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0[0] = clamp(sum0[0], const_min, const_max);
sum0[1] = clamp(sum0[1], const_min, const_max);
sum1[0] = clamp(sum1[0], const_min, const_max);
sum1[1] = clamp(sum1[1], const_min, const_max);
sum2[0] = clamp(sum2[0], const_min, const_max);
sum2[1] = clamp(sum2[1], const_min, const_max);
sum3[0] = clamp(sum3[0], const_min, const_max);
sum3[1] = clamp(sum3[1], const_min, const_max);
}
if (activation_type == 4)
{
sum0[0] = afp(1.f) / (afp(1.f) + exp(-sum0[0]));
sum0[1] = afp(1.f) / (afp(1.f) + exp(-sum0[1]));
sum1[0] = afp(1.f) / (afp(1.f) + exp(-sum1[0]));
sum1[1] = afp(1.f) / (afp(1.f) + exp(-sum1[1]));
sum2[0] = afp(1.f) / (afp(1.f) + exp(-sum2[0]));
sum2[1] = afp(1.f) / (afp(1.f) + exp(-sum2[1]));
sum3[0] = afp(1.f) / (afp(1.f) + exp(-sum3[0]));
sum3[1] = afp(1.f) / (afp(1.f) + exp(-sum3[1]));
}
if (activation_type == 5)
{
sum0[0] = sum0[0] * tanh(log(exp(sum0[0]) + afp(1.f)));
sum0[1] = sum0[1] * tanh(log(exp(sum0[1]) + afp(1.f)));
sum1[0] = sum1[0] * tanh(log(exp(sum1[0]) + afp(1.f)));
sum1[1] = sum1[1] * tanh(log(exp(sum1[1]) + afp(1.f)));
sum2[0] = sum2[0] * tanh(log(exp(sum2[0]) + afp(1.f)));
sum2[1] = sum2[1] * tanh(log(exp(sum2[1]) + afp(1.f)));
sum3[0] = sum3[0] * tanh(log(exp(sum3[0]) + afp(1.f)));
sum3[1] = sum3[1] * tanh(log(exp(sum3[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0[0] = sum0[0] * clamp(sum0[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum0[1] = sum0[1] * clamp(sum0[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[0] = sum1[0] * clamp(sum1[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1[1] = sum1[1] * clamp(sum1[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[0] = sum2[0] * clamp(sum2[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2[1] = sum2[1] * clamp(sum2[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[0] = sum3[0] * clamp(sum3[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3[1] = sum3[1] * clamp(sum3[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec8(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec8(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec8(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec8(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 71
src/layer/vulkan/shader/convolution_pack8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -198,77 +201,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
sum4 = sum4 < afp(0.f) ? sum4 * slope : sum4;
sum5 = sum5 < afp(0.f) ? sum5 * slope : sum5;
sum6 = sum6 < afp(0.f) ? sum6 * slope : sum6;
sum7 = sum7 < afp(0.f) ? sum7 * slope : sum7;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afp(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afp(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afp(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afp(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack8to1_1x1s1d1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -146,53 +149,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack8to1_gemm.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -169,53 +172,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = sum0 < afp(0.f) ? sum0 * slope : sum0;
sum1 = sum1 < afp(0.f) ? sum1 * slope : sum1;
sum2 = sum2 < afp(0.f) ? sum2 * slope : sum2;
sum3 = sum3 < afp(0.f) ? sum3 * slope : sum3;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afp(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afp(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afp(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afp(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 11
- 71
src/layer/vulkan/shader/convolution_pack8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -258,77 +261,14 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
sum4 = max(sum4, afp(0.f));
sum5 = max(sum5, afp(0.f));
sum6 = max(sum6, afp(0.f));
sum7 = max(sum7, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
sum4 = mix(sum4, sum4 * afp(slope), lessThan(sum4, afpvec4(0.f)));
sum5 = mix(sum5, sum5 * afp(slope), lessThan(sum5, afpvec4(0.f)));
sum6 = mix(sum6, sum6 * afp(slope), lessThan(sum6, afpvec4(0.f)));
sum7 = mix(sum7, sum7 * afp(slope), lessThan(sum7, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
sum4 = clamp(sum4, const_min, const_max);
sum5 = clamp(sum5, const_min, const_max);
sum6 = clamp(sum6, const_min, const_max);
sum7 = clamp(sum7, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
sum4 = afp(1.f) / (afp(1.f) + exp(-sum4));
sum5 = afp(1.f) / (afp(1.f) + exp(-sum5));
sum6 = afp(1.f) / (afp(1.f) + exp(-sum6));
sum7 = afp(1.f) / (afp(1.f) + exp(-sum7));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
sum4 = sum4 * tanh(log(exp(sum4) + afp(1.f)));
sum5 = sum5 * tanh(log(exp(sum5) + afp(1.f)));
sum6 = sum6 * tanh(log(exp(sum6) + afp(1.f)));
sum7 = sum7 * tanh(log(exp(sum7) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum4 = sum4 * clamp(sum4 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum5 = sum5 * clamp(sum5 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum6 = sum6 * clamp(sum6 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum7 = sum7 * clamp(sum7 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);
sum4 = activation_afpvec4(sum4, activation_type, activation_param_0, activation_param_1);
sum5 = activation_afpvec4(sum5, activation_type, activation_param_0, activation_param_1);
sum6 = activation_afpvec4(sum6, activation_type, activation_param_0, activation_param_1);
sum7 = activation_afpvec4(sum7, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx2.x, gy2.x, gz2.x), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack8to4_1x1s1d1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -182,53 +185,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 7
- 47
src/layer/vulkan/shader/convolution_pack8to4_gemm.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -205,53 +208,10 @@ void main()
}
#endif

if (activation_type == 1)
{
sum0 = max(sum0, afp(0.f));
sum1 = max(sum1, afp(0.f));
sum2 = max(sum2, afp(0.f));
sum3 = max(sum3, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum0 = mix(sum0, sum0 * afp(slope), lessThan(sum0, afpvec4(0.f)));
sum1 = mix(sum1, sum1 * afp(slope), lessThan(sum1, afpvec4(0.f)));
sum2 = mix(sum2, sum2 * afp(slope), lessThan(sum2, afpvec4(0.f)));
sum3 = mix(sum3, sum3 * afp(slope), lessThan(sum3, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum0 = clamp(sum0, const_min, const_max);
sum1 = clamp(sum1, const_min, const_max);
sum2 = clamp(sum2, const_min, const_max);
sum3 = clamp(sum3, const_min, const_max);
}
if (activation_type == 4)
{
sum0 = afp(1.f) / (afp(1.f) + exp(-sum0));
sum1 = afp(1.f) / (afp(1.f) + exp(-sum1));
sum2 = afp(1.f) / (afp(1.f) + exp(-sum2));
sum3 = afp(1.f) / (afp(1.f) + exp(-sum3));
}
if (activation_type == 5)
{
sum0 = sum0 * tanh(log(exp(sum0) + afp(1.f)));
sum1 = sum1 * tanh(log(exp(sum1) + afp(1.f)));
sum2 = sum2 * tanh(log(exp(sum2) + afp(1.f)));
sum3 = sum3 * tanh(log(exp(sum3) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum0 = sum0 * clamp(sum0 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum1 = sum1 * clamp(sum1 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum2 = sum2 * clamp(sum2 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum3 = sum3 * clamp(sum3 * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum0 = activation_afpvec4(sum0, activation_type, activation_param_0, activation_param_1);
sum1 = activation_afpvec4(sum1, activation_type, activation_param_0, activation_param_1);
sum2 = activation_afpvec4(sum2, activation_type, activation_param_0, activation_param_1);
sum3 = activation_afpvec4(sum3, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(sx4.r, sy4.r, gy), sum0);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -132,35 +135,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -152,35 +155,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group_pack1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -160,35 +163,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 35
src/layer/vulkan/shader/convolutiondepthwise_group_pack1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -165,41 +168,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -180,35 +183,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group_pack4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -160,35 +163,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 35
src/layer/vulkan/shader/convolutiondepthwise_group_pack4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -191,41 +194,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 35
src/layer/vulkan/shader/convolutiondepthwise_group_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -191,41 +194,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group_pack8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -163,35 +166,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_group_pack8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -175,35 +178,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/convolutiondepthwise_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -140,35 +143,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 35
src/layer/vulkan/shader/convolutiondepthwise_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -145,41 +148,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -165,25 +168,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_col2im.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -120,25 +123,7 @@ void main()
}
}

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -173,25 +176,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolution_pack1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -178,29 +181,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -193,25 +196,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack4_col2im.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -120,25 +123,7 @@ void main()
}
}

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -173,25 +176,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolution_pack4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -204,29 +207,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolution_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -204,29 +207,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolution_pack8_col2im.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -121,29 +124,7 @@ void main()
}
}

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -176,25 +179,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolution_pack8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -188,25 +191,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -159,25 +162,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -177,25 +180,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -185,25 +188,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -190,29 +193,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -205,25 +208,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -185,25 +188,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -216,29 +219,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -216,29 +219,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -188,25 +191,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_group_pack8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -200,25 +203,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 19
src/layer/vulkan/shader/deconvolutiondepthwise_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -167,25 +170,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 23
src/layer/vulkan/shader/deconvolutiondepthwise_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int kernel_w = 1;
layout (constant_id = 1) const int kernel_h = 1;
layout (constant_id = 2) const int dilation_w = 1;
@@ -172,29 +175,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, gy, gz), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -104,35 +107,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -105,35 +108,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm_wp1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -113,35 +116,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 4 + 0, gy, 0), sum.r);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_gemm_wp1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -118,41 +121,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 8 + 0, gy, 0), sum[0].r);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm_wp4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -141,35 +144,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 4 + 0, gy, 0), sum.r);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm_wp4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -121,35 +124,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, 0), sum);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_gemm_wp4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -152,41 +155,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 8 + 0, gy, 0), sum[0].r);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_gemm_wp8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -160,41 +163,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 8 + 0, gy, 0), sum[0].r);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm_wp8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -132,35 +135,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, gy, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_gemm_wp8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -144,35 +147,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx * 4 + 0, gy, 0), sum.r);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_pack1to4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -112,35 +115,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_pack1to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -119,41 +122,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -135,35 +138,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_pack4to1.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -112,35 +115,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_pack4to8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -147,41 +150,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -147,41 +150,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_pack8to1.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -115,35 +118,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_pack8to4.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -131,35 +134,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_reduce_sum8.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -88,35 +91,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = sum < afp(0.f) ? sum * slope : sum;
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afp(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st1(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 29
src/layer/vulkan/shader/innerproduct_reduce_sum8_pack4.comp View File

@@ -21,6 +21,9 @@
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -88,35 +91,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum = max(sum, afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum = mix(sum, sum * afp(slope), lessThan(sum, afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum = clamp(sum, const_min, const_max);
}
if (activation_type == 4)
{
sum = afp(1.f) / (afp(1.f) + exp(-sum));
}
if (activation_type == 5)
{
sum = sum * tanh(log(exp(sum) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum = sum * clamp(sum * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec4(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st4(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 35
src/layer/vulkan/shader/innerproduct_reduce_sum8_pack8.comp View File

@@ -22,6 +22,9 @@ struct sfpvec8 { f16vec4 abcd; f16vec4 efgh; };
#extension GL_EXT_shader_explicit_arithmetic_types_float16: require
#endif

#extension GL_GOOGLE_include_directive: enable
#include "vulkan_activation.comp"

layout (constant_id = 0) const int bias_term = 0;
layout (constant_id = 1) const int activation_type = 0;
layout (constant_id = 2) const float activation_param_0 = 0;
@@ -89,41 +92,7 @@ void main()
}
#endif

if (activation_type == 1)
{
sum[0] = max(sum[0], afp(0.f));
sum[1] = max(sum[1], afp(0.f));
}
if (activation_type == 2)
{
const afp slope = afp(activation_param_0);
sum[0] = mix(sum[0], sum[0] * afp(slope), lessThan(sum[0], afpvec4(0.f)));
sum[1] = mix(sum[1], sum[1] * afp(slope), lessThan(sum[1], afpvec4(0.f)));
}
if (activation_type == 3)
{
const afp const_min = afp(activation_param_0);
const afp const_max = afp(activation_param_1);
sum[0] = clamp(sum[0], const_min, const_max);
sum[1] = clamp(sum[1], const_min, const_max);
}
if (activation_type == 4)
{
sum[0] = afp(1.f) / (afp(1.f) + exp(-sum[0]));
sum[1] = afp(1.f) / (afp(1.f) + exp(-sum[1]));
}
if (activation_type == 5)
{
sum[0] = sum[0] * tanh(log(exp(sum[0]) + afp(1.f)));
sum[1] = sum[1] * tanh(log(exp(sum[1]) + afp(1.f)));
}
if (activation_type == 6)
{
const afp alpha = afp(activation_param_0);
const afp beta = afp(activation_param_1);
sum[0] = sum[0] * clamp(sum[0] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
sum[1] = sum[1] * clamp(sum[1] * afp(alpha) + afp(beta), afp(0.f), afp(1.f));
}
sum = activation_afpvec8(sum, activation_type, activation_param_0, activation_param_1);

#if NCNN_image_shader
image3d_st8(top_blob, ivec3(gx, 0, 0), sum);


+ 4
- 0
src/layer/vulkan/shader/mish.comp View File

@@ -61,7 +61,11 @@ void main()
afp v = buffer_ld1(bottom_top_blob_data, gi);
#endif

#if NCNN_moltenvk
v = v * afp(tanh(float(log(exp(v) + afp(1.f)))));
#else
v = v * tanh(log(exp(v) + afp(1.f)));
#endif

#if NCNN_image_shader
image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v);


+ 4
- 0
src/layer/vulkan/shader/mish_pack4.comp View File

@@ -61,7 +61,11 @@ void main()
afpvec4 v = buffer_ld4(bottom_top_blob_data, gi);
#endif

#if NCNN_moltenvk
v = v * afpvec4(tanh(vec4(log(exp(v) + afpvec4(1.f)))));
#else
v = v * tanh(log(exp(v) + afpvec4(1.f)));
#endif

#if NCNN_image_shader
image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v);


+ 5
- 0
src/layer/vulkan/shader/mish_pack8.comp View File

@@ -62,8 +62,13 @@ void main()
afpvec8 v = buffer_ld8(bottom_top_blob_data, gi);
#endif

#if NCNN_moltenvk
v[0] = v[0] * afpvec4(tanh(vec4(log(exp(v[0]) + afpvec4(1.f)))));
v[1] = v[1] * afpvec4(tanh(vec4(log(exp(v[1]) + afpvec4(1.f)))));
#else
v[0] = v[0] * tanh(log(exp(v[0]) + afpvec4(1.f)));
v[1] = v[1] * tanh(log(exp(v[1]) + afpvec4(1.f)));
#endif

#if NCNN_image_shader
image3d_st8(top_blob_3d, ivec3(gx, gy, gz), v);


+ 4
- 0
src/layer/vulkan/shader/tanh.comp View File

@@ -61,7 +61,11 @@ void main()
afp v = buffer_ld1(bottom_top_blob_data, gi);
#endif

#if NCNN_moltenvk
v = afp(tanh(float(v)));
#else
v = tanh(v);
#endif

#if NCNN_image_shader
image3d_st1(top_blob_3d, ivec3(gx, gy, gz), v);


+ 4
- 0
src/layer/vulkan/shader/tanh_pack4.comp View File

@@ -61,7 +61,11 @@ void main()
afpvec4 v = buffer_ld4(bottom_top_blob_data, gi);
#endif

#if NCNN_moltenvk
v = afpvec4(tanh(vec4(v)));
#else
v = tanh(v);
#endif

#if NCNN_image_shader
image3d_st4(top_blob_3d, ivec3(gx, gy, gz), v);


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save