Browse Source

Require GL_EXT_shader_16bit_storage only for fp16_storage; add explicit type casts

tags/20190611
nihui 7 years ago
parent
commit
58ed8e437f
94 changed files with 809 additions and 481 deletions
  1. +1
    -1
      src/CMakeLists.txt
  2. +5
    -2
      src/layer/shader/absval.comp
  3. +5
    -2
      src/layer/shader/absval_pack4.comp
  4. +10
    -7
      src/layer/shader/batchnorm.comp
  5. +10
    -7
      src/layer/shader/batchnorm_pack4.comp
  6. +6
    -3
      src/layer/shader/binaryop.comp
  7. +6
    -3
      src/layer/shader/binaryop_pack4.comp
  8. +5
    -2
      src/layer/shader/clip.comp
  9. +5
    -2
      src/layer/shader/clip_pack4.comp
  10. +16
    -18
      src/layer/shader/concat.comp
  11. +16
    -18
      src/layer/shader/concat_pack4.comp
  12. +15
    -12
      src/layer/shader/concat_pack4to1.comp
  13. +5
    -2
      src/layer/shader/convolution.comp
  14. +4
    -1
      src/layer/shader/convolution_1x1s1d1.comp
  15. +7
    -4
      src/layer/shader/convolution_pack1to4.comp
  16. +23
    -5
      src/layer/shader/convolution_pack4.comp
  17. +7
    -4
      src/layer/shader/convolution_pack4to1.comp
  18. +5
    -2
      src/layer/shader/convolutiondepthwise.comp
  19. +5
    -2
      src/layer/shader/convolutiondepthwise_group.comp
  20. +7
    -4
      src/layer/shader/convolutiondepthwise_group_pack1to4.comp
  21. +22
    -4
      src/layer/shader/convolutiondepthwise_group_pack4.comp
  22. +7
    -4
      src/layer/shader/convolutiondepthwise_group_pack4to1.comp
  23. +8
    -5
      src/layer/shader/convolutiondepthwise_pack4.comp
  24. +4
    -1
      src/layer/shader/crop.comp
  25. +4
    -1
      src/layer/shader/crop_pack4.comp
  26. +5
    -2
      src/layer/shader/deconvolution.comp
  27. +7
    -4
      src/layer/shader/deconvolution_pack1to4.comp
  28. +23
    -5
      src/layer/shader/deconvolution_pack4.comp
  29. +7
    -4
      src/layer/shader/deconvolution_pack4to1.comp
  30. +5
    -2
      src/layer/shader/deconvolutiondepthwise.comp
  31. +5
    -2
      src/layer/shader/deconvolutiondepthwise_group.comp
  32. +7
    -4
      src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp
  33. +23
    -5
      src/layer/shader/deconvolutiondepthwise_group_pack4.comp
  34. +7
    -4
      src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp
  35. +7
    -4
      src/layer/shader/deconvolutiondepthwise_pack4.comp
  36. +5
    -2
      src/layer/shader/dropout.comp
  37. +5
    -2
      src/layer/shader/dropout_pack4.comp
  38. +6
    -3
      src/layer/shader/eltwise.comp
  39. +6
    -3
      src/layer/shader/eltwise_pack4.comp
  40. +4
    -1
      src/layer/shader/flatten.comp
  41. +8
    -8
      src/layer/shader/flatten_pack4.comp
  42. +5
    -2
      src/layer/shader/innerproduct.comp
  43. +8
    -5
      src/layer/shader/innerproduct_pack1to4.comp
  44. +24
    -6
      src/layer/shader/innerproduct_pack4.comp
  45. +7
    -4
      src/layer/shader/innerproduct_pack4to1.comp
  46. +10
    -11
      src/layer/shader/interp.comp
  47. +11
    -10
      src/layer/shader/interp_pack4.comp
  48. +5
    -2
      src/layer/shader/lrn_norm.comp
  49. +5
    -2
      src/layer/shader/lrn_norm_across_channel_pack4.comp
  50. +5
    -2
      src/layer/shader/lrn_norm_within_channel_pack4.comp
  51. +6
    -3
      src/layer/shader/lrn_square_pad.comp
  52. +5
    -2
      src/layer/shader/lrn_square_pad_across_channel_pack4.comp
  53. +5
    -2
      src/layer/shader/lrn_square_pad_within_channel_pack4.comp
  54. +9
    -16
      src/layer/shader/packing_1to4.comp
  55. +9
    -6
      src/layer/shader/packing_4to1.comp
  56. +7
    -8
      src/layer/shader/padding.comp
  57. +7
    -8
      src/layer/shader/padding_pack4.comp
  58. +4
    -1
      src/layer/shader/permute.comp
  59. +9
    -6
      src/layer/shader/permute_pack4to1.comp
  60. +4
    -1
      src/layer/shader/pooling.comp
  61. +4
    -1
      src/layer/shader/pooling_global.comp
  62. +4
    -1
      src/layer/shader/pooling_global_pack4.comp
  63. +4
    -1
      src/layer/shader/pooling_pack4.comp
  64. +10
    -7
      src/layer/shader/prelu.comp
  65. +10
    -7
      src/layer/shader/prelu_pack4.comp
  66. +23
    -10
      src/layer/shader/priorbox.comp
  67. +6
    -3
      src/layer/shader/priorbox_mxnet.comp
  68. +5
    -2
      src/layer/shader/relu.comp
  69. +5
    -2
      src/layer/shader/relu_pack4.comp
  70. +5
    -4
      src/layer/shader/reorg.comp
  71. +9
    -8
      src/layer/shader/reorg_pack1to4.comp
  72. +25
    -30
      src/layer/shader/reorg_pack4.comp
  73. +19
    -6
      src/layer/shader/reshape.comp
  74. +22
    -10
      src/layer/shader/reshape_pack1to4.comp
  75. +37
    -32
      src/layer/shader/reshape_pack4.comp
  76. +9
    -6
      src/layer/shader/reshape_pack4to1.comp
  77. +7
    -4
      src/layer/shader/scale.comp
  78. +7
    -4
      src/layer/shader/scale_pack4.comp
  79. +4
    -1
      src/layer/shader/shufflechannel.comp
  80. +25
    -30
      src/layer/shader/shufflechannel_pack4.comp
  81. +5
    -2
      src/layer/shader/sigmoid.comp
  82. +5
    -2
      src/layer/shader/sigmoid_pack4.comp
  83. +16
    -13
      src/layer/shader/softmax_div_sum.comp
  84. +16
    -13
      src/layer/shader/softmax_div_sum_pack4.comp
  85. +4
    -1
      src/layer/shader/softmax_exp_sub_max.comp
  86. +4
    -1
      src/layer/shader/softmax_exp_sub_max_pack4.comp
  87. +4
    -1
      src/layer/shader/softmax_reduce_max.comp
  88. +4
    -1
      src/layer/shader/softmax_reduce_max_pack4.comp
  89. +4
    -1
      src/layer/shader/softmax_reduce_sum.comp
  90. +4
    -1
      src/layer/shader/softmax_reduce_sum_pack4.comp
  91. +5
    -2
      src/layer/shader/tanh.comp
  92. +5
    -2
      src/layer/shader/tanh_pack4.comp
  93. +5
    -2
      src/layer/shader/unaryop.comp
  94. +5
    -2
      src/layer/shader/unaryop_pack4.comp

+ 1
- 1
src/CMakeLists.txt View File

@@ -112,7 +112,7 @@ macro(ncnn_add_layer class)
add_custom_command( add_custom_command(
OUTPUT ${SHADER_fp16s_SPV_HEX_FILE} OUTPUT ${SHADER_fp16s_SPV_HEX_FILE}
COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} COMMAND ${GLSLANGVALIDATOR_EXECUTABLE}
ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpmat4=f16mat4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC}
DEPENDS ${SHADER_SRC} DEPENDS ${SHADER_SRC}
COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv" COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv"
VERBATIM VERBATIM


+ 5
- 2
src/layer/shader/absval.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfp(abs(v)); bottom_top_blob_data[gi] = sfp(abs(v));
} }

+ 5
- 2
src/layer/shader/absval_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfpvec4(abs(v)); bottom_top_blob_data[gi] = sfpvec4(abs(v));
} }

+ 10
- 7
src/layer/shader/batchnorm.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,9 +49,9 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afp v = bottom_top_blob_data[gx];
afp v = afp(bottom_top_blob_data[gx]);


v = b_data[gx] * v + a_data[gx];
v = afp(b_data[gx]) * v + afp(a_data[gx]);


bottom_top_blob_data[gx] = sfp(v); bottom_top_blob_data[gx] = sfp(v);


@@ -59,9 +62,9 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


v = b_data[gy] * v + a_data[gy];
v = afp(b_data[gy]) * v + afp(a_data[gy]);


bottom_top_blob_data[gi] = sfp(v); bottom_top_blob_data[gi] = sfp(v);


@@ -72,9 +75,9 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


v = b_data[gz] * v + a_data[gz];
v = afp(b_data[gz]) * v + afp(a_data[gz]);


bottom_top_blob_data[gi] = sfp(v); bottom_top_blob_data[gi] = sfp(v);




+ 10
- 7
src/layer/shader/batchnorm_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,9 +49,9 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afpvec4 v = bottom_top_blob_data[gx];
afpvec4 v = afpvec4(bottom_top_blob_data[gx]);


v = b_data[gx] * v + a_data[gx];
v = afpvec4(b_data[gx]) * v + afpvec4(a_data[gx]);


bottom_top_blob_data[gx] = sfpvec4(v); bottom_top_blob_data[gx] = sfpvec4(v);


@@ -59,9 +62,9 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


v = b_data[gy] * v + a_data[gy];
v = afpvec4(b_data[gy]) * v + afpvec4(a_data[gy]);


bottom_top_blob_data[gi] = sfpvec4(v); bottom_top_blob_data[gi] = sfpvec4(v);


@@ -72,9 +75,9 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


v = b_data[gz] * v + a_data[gz];
v = afpvec4(b_data[gz]) * v + afpvec4(a_data[gz]);


bottom_top_blob_data[gi] = sfpvec4(v); bottom_top_blob_data[gi] = sfpvec4(v);




+ 6
- 3
src/layer/shader/binaryop.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -62,7 +65,7 @@ void main()


const int gi = gz * p.outcstep + gy * p.outw + gx; const int gi = gz * p.outcstep + gy * p.outw + gx;


afp v1 = a_blob_data[gi];
afp v1 = afp(a_blob_data[gi]);


afp res; afp res;


@@ -87,7 +90,7 @@ void main()


if (p.adims == p.bdims) if (p.adims == p.bdims)
{ {
afp v2 = b_blob_data[gi];
afp v2 = afp(b_blob_data[gi]);


if (op_type == 0) res = v1 + v2; if (op_type == 0) res = v1 + v2;
if (op_type == 1) res = v1 - v2; if (op_type == 1) res = v1 - v2;


+ 6
- 3
src/layer/shader/binaryop_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -62,7 +65,7 @@ void main()


const int gi = gz * p.outcstep + gy * p.outw + gx; const int gi = gz * p.outcstep + gy * p.outw + gx;


afpvec4 v1 = a_blob_data[gi];
afpvec4 v1 = afpvec4(a_blob_data[gi]);


afpvec4 res; afpvec4 res;


@@ -87,7 +90,7 @@ void main()


if (p.adims == p.bdims) if (p.adims == p.bdims)
{ {
afpvec4 v2 = b_blob_data[gi];
afpvec4 v2 = afpvec4(b_blob_data[gi]);


if (op_type == 0) res = v1 + v2; if (op_type == 0) res = v1 + v2;
if (op_type == 1) res = v1 - v2; if (op_type == 1) res = v1 - v2;


+ 5
- 2
src/layer/shader/clip.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -47,7 +50,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfp(clamp(v, afp(const_min), afp(const_max))); bottom_top_blob_data[gi] = sfp(clamp(v, afp(const_min), afp(const_max)));
} }

+ 5
- 2
src/layer/shader/clip_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -47,7 +50,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfpvec4(clamp(v, afp(const_min), afp(const_max))); bottom_top_blob_data[gi] = sfpvec4(clamp(v, afp(const_min), afp(const_max)));
} }

+ 16
- 18
src/layer/shader/concat.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -53,39 +56,34 @@ void main()
if (gx >= p.w || gy >= p.h || gz >= p.c) if (gx >= p.w || gy >= p.h || gz >= p.c)
return; return;


int v_offset;
sfp v;

if (p.dims == 1) // axis == 0 if (p.dims == 1) // axis == 0
{ {
v_offset = gx + p.offset;
v = bottom_blob_data[gx];
int v_offset = gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gx];
} }
else if (p.dims == 2 && axis == 0) else if (p.dims == 2 && axis == 0)
{ {
v_offset = (gy + p.offset) * p.outw + gx;
v = bottom_blob_data[gy * p.w + gx];
int v_offset = (gy + p.offset) * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx];
} }
else if (p.dims == 2 && axis == 1) else if (p.dims == 2 && axis == 1)
{ {
v_offset = gy * p.outw + gx + p.offset;
v = bottom_blob_data[gy * p.w + gx];
int v_offset = gy * p.outw + gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 0) else if (p.dims == 3 && axis == 0)
{ {
v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 1) else if (p.dims == 3 && axis == 1)
{ {
v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 2) else if (p.dims == 3 && axis == 2)
{ {
v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }

top_blob_data[v_offset] = v;
} }

+ 16
- 18
src/layer/shader/concat_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -53,39 +56,34 @@ void main()
if (gx >= p.w || gy >= p.h || gz >= p.c) if (gx >= p.w || gy >= p.h || gz >= p.c)
return; return;


int v_offset;
sfpvec4 v;

if (p.dims == 1) // axis == 0 if (p.dims == 1) // axis == 0
{ {
v_offset = gx + p.offset;
v = bottom_blob_data[gx];
int v_offset = gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gx];
} }
else if (p.dims == 2 && axis == 0) else if (p.dims == 2 && axis == 0)
{ {
v_offset = (gy + p.offset) * p.outw + gx;
v = bottom_blob_data[gy * p.w + gx];
int v_offset = (gy + p.offset) * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx];
} }
else if (p.dims == 2 && axis == 1) else if (p.dims == 2 && axis == 1)
{ {
v_offset = gy * p.outw + gx + p.offset;
v = bottom_blob_data[gy * p.w + gx];
int v_offset = gy * p.outw + gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 0) else if (p.dims == 3 && axis == 0)
{ {
v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 1) else if (p.dims == 3 && axis == 1)
{ {
v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }
else if (p.dims == 3 && axis == 2) else if (p.dims == 3 && axis == 2)
{ {
v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset;
top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
} }

top_blob_data[v_offset] = v;
} }

+ 15
- 12
src/layer/shader/concat_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -54,41 +57,41 @@ void main()
return; return;


ivec4 v_offset; ivec4 v_offset;
sfpvec4 v;
int gi;


if (p.dims == 1) // axis == 0 if (p.dims == 1) // axis == 0
{ {
v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3); v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3);
v = bottom_blob_data[gx];
gi = gx;
} }
else if (p.dims == 2 && axis == 0) else if (p.dims == 2 && axis == 0)
{ {
v_offset = (ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx; v_offset = (ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx;
v = bottom_blob_data[gy * p.w + gx];
gi = gy * p.w + gx;
} }
else if (p.dims == 2 && axis == 1) else if (p.dims == 2 && axis == 1)
{ {
v_offset = (ivec4(gx * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset; v_offset = (ivec4(gx * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset;
v = bottom_blob_data[gy * p.w + gx];
gi = gy * p.w + gx;
} }
else if (p.dims == 3 && axis == 0) else if (p.dims == 3 && axis == 0)
{ {
v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx; v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
gi = gz * p.cstep + gy * p.w + gx;
} }
else if (p.dims == 3 && axis == 1) else if (p.dims == 3 && axis == 1)
{ {
v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx; v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
gi = gz * p.cstep + gy * p.w + gx;
} }
else if (p.dims == 3 && axis == 2) else if (p.dims == 3 && axis == 2)
{ {
v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx + p.offset; v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx + p.offset;
v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
gi = gz * p.cstep + gy * p.w + gx;
} }


top_blob_data[v_offset.r] = v.r;
top_blob_data[v_offset.g] = v.g;
top_blob_data[v_offset.b] = v.b;
top_blob_data[v_offset.a] = v.a;
top_blob_data[v_offset.r] = bottom_blob_data[gi].r;
top_blob_data[v_offset.g] = bottom_blob_data[gi].g;
top_blob_data[v_offset.b] = bottom_blob_data[gi].b;
top_blob_data[v_offset.a] = bottom_blob_data[gi].a;
} }

+ 5
- 2
src/layer/shader/convolution.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 4
- 1
src/layer/shader/convolution_1x1s1d1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 7
- 4
src/layer/shader/convolution_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -80,9 +83,9 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afp v = bottom_blob_data[v_offset + x * dilation_w];
afp v = afp(bottom_blob_data[v_offset + x * dilation_w]);


afpvec4 k = weight_data[w_offset + x];
afpvec4 k = afpvec4(weight_data[w_offset + x]);


sum += v * k; sum += v * k;
} }


+ 23
- 5
src/layer/shader/convolution_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in;


layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };


layout (push_constant) uniform parameter layout (push_constant) uniform parameter
@@ -63,7 +71,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -80,9 +88,19 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afpvec4 v = bottom_blob_data[v_offset + x * dilation_w];

afpmat4 k = weight_data[w_offset + x];
afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]);

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + x) * 4 + 0]),
afpvec4(weight_data[(w_offset + x) * 4 + 1]),
afpvec4(weight_data[(w_offset + x) * 4 + 2]),
afpvec4(weight_data[(w_offset + x) * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset + x]);
#endif


sum += v * k; sum += v * k;
} }


+ 7
- 4
src/layer/shader/convolution_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {
@@ -80,9 +83,9 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afpvec4 v = bottom_blob_data[v_offset + x * dilation_w];
afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]);


afpvec4 k = weight_data[w_offset + x];
afpvec4 k = afpvec4(weight_data[w_offset + x]);


sum += dot(v, k); sum += dot(v, k);
} }


+ 5
- 2
src/layer/shader/convolutiondepthwise.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 5
- 2
src/layer/shader/convolutiondepthwise_group.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 7
- 4
src/layer/shader/convolutiondepthwise_group_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -89,9 +92,9 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afp v = bottom_blob_data[v_offset + x * dilation_w];
afp v = afp(bottom_blob_data[v_offset + x * dilation_w]);


afpvec4 k = weight_data[w_offset + x];
afpvec4 k = afpvec4(weight_data[w_offset + x]);


sum += v * k; sum += v * k;
} }


+ 22
- 4
src/layer/shader/convolutiondepthwise_group_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in;


layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };


layout (push_constant) uniform parameter layout (push_constant) uniform parameter
@@ -64,7 +72,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -89,9 +97,19 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afpvec4 v = bottom_blob_data[v_offset + x * dilation_w];

afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]);

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + x) * 4 + 0]),
afpvec4(weight_data[(w_offset + x) * 4 + 1]),
afpvec4(weight_data[(w_offset + x) * 4 + 2]),
afpvec4(weight_data[(w_offset + x) * 4 + 3])
);
#else
afpmat4 k = weight_data[w_offset + x]; afpmat4 k = weight_data[w_offset + x];
#endif


sum += v * k; sum += v * k;
} }


+ 7
- 4
src/layer/shader/convolutiondepthwise_group_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {
@@ -89,9 +92,9 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afpvec4 v = bottom_blob_data[v_offset + x * dilation_w];
afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]);


afpvec4 k = weight_data[w_offset + x];
afpvec4 k = afpvec4(weight_data[w_offset + x]);


sum += dot(v, k); sum += dot(v, k);
} }


+ 8
- 5
src/layer/shader/convolutiondepthwise_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,11 +67,11 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
sum = afpvec4(0.0);
sum = afpvec4(0.f);
} }


// depth-wise convolution // depth-wise convolution
@@ -79,9 +82,9 @@ void main()
{ {
for (int x = 0; x < kernel_w; x++) for (int x = 0; x < kernel_w; x++)
{ {
afpvec4 v = bottom_blob_data[v_offset + x * dilation_w];
afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]);


afpvec4 k = weight_data[w_offset + x];
afpvec4 k = afpvec4(weight_data[w_offset + x]);


sum += v * k; sum += v * k;
} }


+ 4
- 1
src/layer/shader/crop.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/crop_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 5
- 2
src/layer/shader/deconvolution.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 7
- 4
src/layer/shader/deconvolution_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -100,9 +103,9 @@ void main()


for (int z = 0; z < p.c; z++) for (int z = 0; z < p.c; z++)
{ {
afp v = bottom_blob_data[v_offset];
afp v = afp(bottom_blob_data[v_offset]);


afpvec4 k = weight_data[w_offset];
afpvec4 k = afpvec4(weight_data[w_offset]);


sum += v * k; sum += v * k;




+ 23
- 5
src/layer/shader/deconvolution_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in;


layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };


layout (push_constant) uniform parameter layout (push_constant) uniform parameter
@@ -63,7 +71,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -100,9 +108,19 @@ void main()


for (int z = 0; z < p.c; z++) for (int z = 0; z < p.c; z++)
{ {
afpvec4 v = bottom_blob_data[v_offset];

afpmat4 k = weight_data[w_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[w_offset * 4 + 0]),
afpvec4(weight_data[w_offset * 4 + 1]),
afpvec4(weight_data[w_offset * 4 + 2]),
afpvec4(weight_data[w_offset * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset]);
#endif


sum += v * k; sum += v * k;




+ 7
- 4
src/layer/shader/deconvolution_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {
@@ -100,9 +103,9 @@ void main()


for (int z = 0; z < p.c; z++) for (int z = 0; z < p.c; z++)
{ {
afpvec4 v = bottom_blob_data[v_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);


afpvec4 k = weight_data[w_offset];
afpvec4 k = afpvec4(weight_data[w_offset]);


sum += dot(v, k); sum += dot(v, k);




+ 5
- 2
src/layer/shader/deconvolutiondepthwise.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 5
- 2
src/layer/shader/deconvolutiondepthwise_group.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {


+ 7
- 4
src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -109,9 +112,9 @@ void main()


for (int z = 0; z < channels_g; z++) for (int z = 0; z < channels_g; z++)
{ {
afp v = bottom_blob_data[v_offset];
afp v = afp(bottom_blob_data[v_offset]);


afpvec4 k = weight_data[w_offset];
afpvec4 k = afpvec4(weight_data[w_offset]);


sum += v * k; sum += v * k;




+ 23
- 5
src/layer/shader/deconvolutiondepthwise_group_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in;


layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };


layout (push_constant) uniform parameter layout (push_constant) uniform parameter
@@ -64,7 +72,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -109,9 +117,19 @@ void main()


for (int z = 0; z < channels_g; z++) for (int z = 0; z < channels_g; z++)
{ {
afpvec4 v = bottom_blob_data[v_offset];

afpmat4 k = weight_data[w_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[w_offset * 4 + 0]),
afpvec4(weight_data[w_offset * 4 + 1]),
afpvec4(weight_data[w_offset * 4 + 2]),
afpvec4(weight_data[w_offset * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset]);
#endif


sum += v * k; sum += v * k;




+ 7
- 4
src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afp(bias_data[gz]);
} }
else else
{ {
@@ -109,9 +112,9 @@ void main()


for (int z = 0; z < channels_g; z++) for (int z = 0; z < channels_g; z++)
{ {
afpvec4 v = bottom_blob_data[v_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);


afpvec4 k = weight_data[w_offset];
afpvec4 k = afpvec4(weight_data[w_offset]);


sum += dot(v, k); sum += dot(v, k);




+ 7
- 4
src/layer/shader/deconvolutiondepthwise_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -64,7 +67,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gz];
sum = afpvec4(bias_data[gz]);
} }
else else
{ {
@@ -101,9 +104,9 @@ void main()
int v_offset = v_offset_0 + sy * p.w + sx; int v_offset = v_offset_0 + sy * p.w + sx;
int w_offset = w_offset_0 + y * kernel_w + x; int w_offset = w_offset_0 + y * kernel_w + x;


afpvec4 v = bottom_blob_data[v_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);


afpvec4 k = weight_data[w_offset];
afpvec4 k = afpvec4(weight_data[w_offset]);


sum += v * k; sum += v * k;
} }


+ 5
- 2
src/layer/shader/dropout.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


v *= afp(scale); v *= afp(scale);




+ 5
- 2
src/layer/shader/dropout_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


v *= afp(scale); v *= afp(scale);




+ 6
- 3
src/layer/shader/eltwise.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -52,8 +55,8 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v1 = bottom_blob1_data[gi];
afp v2 = bottom_blob2_data[gi];
afp v1 = afp(bottom_blob1_data[gi]);
afp v2 = afp(bottom_blob2_data[gi]);


afp res; afp res;




+ 6
- 3
src/layer/shader/eltwise_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -52,8 +55,8 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v1 = bottom_blob1_data[gi];
afpvec4 v2 = bottom_blob2_data[gi];
afpvec4 v1 = afpvec4(bottom_blob1_data[gi]);
afpvec4 v2 = afpvec4(bottom_blob2_data[gi]);


afpvec4 res; afpvec4 res;




+ 4
- 1
src/layer/shader/flatten.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 8
- 8
src/layer/shader/flatten_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -74,11 +77,8 @@ void main()
v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4; v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4;
} }


sfpvec4 v;
v.r = bottom_blob_data[v_offset.r];
v.g = bottom_blob_data[v_offset.g];
v.b = bottom_blob_data[v_offset.b];
v.a = bottom_blob_data[v_offset.a];

top_blob_data[gx] = v;
top_blob_data[gx].r = bottom_blob_data[v_offset.r];
top_blob_data[gx].g = bottom_blob_data[v_offset.g];
top_blob_data[gx].b = bottom_blob_data[v_offset.b];
top_blob_data[gx].a = bottom_blob_data[v_offset.a];
} }

+ 5
- 2
src/layer/shader/innerproduct.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -57,7 +60,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gx];
sum = afp(bias_data[gx]);
} }
else else
{ {


+ 8
- 5
src/layer/shader/innerproduct_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -57,20 +60,20 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gx];
sum = afpvec4(bias_data[gx]);
} }
else else
{ {
sum = afpvec4(0.0);
sum = afpvec4(0.f);
} }


int w_offset = gx * p.w; int w_offset = gx * p.w;


for (int i = 0; i < p.w; i++) for (int i = 0; i < p.w; i++)
{ {
afp v = bottom_blob_data[i];
afp v = afp(bottom_blob_data[i]);


afpvec4 k = weight_data[w_offset + i];
afpvec4 k = afpvec4(weight_data[w_offset + i]);


sum += v * k; sum += v * k;
} }


+ 24
- 6
src/layer/shader/innerproduct_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -26,7 +29,12 @@ layout (local_size_z_id = 235) in;


layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; };
layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; };
#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; };
#else
layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; };
#endif
layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; };


layout (push_constant) uniform parameter layout (push_constant) uniform parameter
@@ -57,20 +65,30 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gx];
sum = afpvec4(bias_data[gx]);
} }
else else
{ {
sum = afpvec4(0.0);
sum = afpvec4(0.f);
} }


int w_offset = gx * p.w; int w_offset = gx * p.w;


for (int i = 0; i < p.w; i++) for (int i = 0; i < p.w; i++)
{ {
afpvec4 v = bottom_blob_data[i];

afpmat4 k = weight_data[w_offset + i];
afpvec4 v = afpvec4(bottom_blob_data[i]);

#if NCNN_fp16_storage && !NCNN_fp16_arithmetic
// GL_EXT_shader_16bit_storage does not define f16mat4 type :(
afpmat4 k = afpmat4(
afpvec4(weight_data[(w_offset + i) * 4 + 0]),
afpvec4(weight_data[(w_offset + i) * 4 + 1]),
afpvec4(weight_data[(w_offset + i) * 4 + 2]),
afpvec4(weight_data[(w_offset + i) * 4 + 3])
);
#else
afpmat4 k = afpmat4(weight_data[w_offset + i]);
#endif


sum += v * k; sum += v * k;
} }


+ 7
- 4
src/layer/shader/innerproduct_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -57,7 +60,7 @@ void main()


if (bias_term == 1) if (bias_term == 1)
{ {
sum = bias_data[gx];
sum = afp(bias_data[gx]);
} }
else else
{ {
@@ -68,9 +71,9 @@ void main()


for (int i = 0; i < p.w; i++) for (int i = 0; i < p.w; i++)
{ {
afpvec4 v = bottom_blob_data[i];
afpvec4 v = afpvec4(bottom_blob_data[i]);


afpvec4 k = weight_data[w_offset + i];
afpvec4 k = afpvec4(weight_data[w_offset + i]);


sum += dot(v, k); sum += dot(v, k);
} }


+ 10
- 11
src/layer/shader/interp.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -54,8 +57,6 @@ void main()
if (gx >= p.outw || gy >= p.outh || gz >= p.outc) if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return; return;


afp res;

if (resize_type == 1) // nearest if (resize_type == 1) // nearest
{ {
afpvec2 gxy = afpvec2(gx, gy); afpvec2 gxy = afpvec2(gx, gy);
@@ -67,7 +68,7 @@ void main()


int v_offset = gz * p.cstep + sy * p.w + sx; int v_offset = gz * p.cstep + sy * p.w + sx;


res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }
else if (resize_type == 2) // bilinear else if (resize_type == 2) // bilinear
{ {
@@ -94,17 +95,15 @@ void main()
int v_offset_0 = gz * p.cstep + sy * p.w + sx; int v_offset_0 = gz * p.cstep + sy * p.w + sx;
int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;


afp a0 = bottom_blob_data[v_offset_0];
afp a1 = bottom_blob_data[v_offset_0 + 1];
afp b0 = bottom_blob_data[v_offset_1];
afp b1 = bottom_blob_data[v_offset_1 + 1];
afp a0 = afp(bottom_blob_data[v_offset_0]);
afp a1 = afp(bottom_blob_data[v_offset_0 + 1]);
afp b0 = afp(bottom_blob_data[v_offset_1]);
afp b1 = afp(bottom_blob_data[v_offset_1 + 1]);


afp fx = fxy.r; afp fx = fxy.r;
afp fy = fxy.g; afp fy = fxy.g;


afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx;
res = ab.r * (afp(1.f) - fy) + ab.g * fy;
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(ab.r * (afp(1.f) - fy) + ab.g * fy);
} }

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(res);
} }

+ 11
- 10
src/layer/shader/interp_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -54,7 +57,7 @@ void main()
if (gx >= p.outw || gy >= p.outh || gz >= p.outc) if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return; return;


afpvec4 res;
// afpvec4 res;


if (resize_type == 1) // nearest if (resize_type == 1) // nearest
{ {
@@ -67,7 +70,7 @@ void main()


int v_offset = gz * p.cstep + sy * p.w + sx; int v_offset = gz * p.cstep + sy * p.w + sx;


res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }
else if (resize_type == 2) // bilinear else if (resize_type == 2) // bilinear
{ {
@@ -94,10 +97,10 @@ void main()
int v_offset_0 = gz * p.cstep + sy * p.w + sx; int v_offset_0 = gz * p.cstep + sy * p.w + sx;
int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx;


afpvec4 a0 = bottom_blob_data[v_offset_0];
afpvec4 a1 = bottom_blob_data[v_offset_0 + 1];
afpvec4 b0 = bottom_blob_data[v_offset_1];
afpvec4 b1 = bottom_blob_data[v_offset_1 + 1];
afpvec4 a0 = afpvec4(bottom_blob_data[v_offset_0]);
afpvec4 a1 = afpvec4(bottom_blob_data[v_offset_0 + 1]);
afpvec4 b0 = afpvec4(bottom_blob_data[v_offset_1]);
afpvec4 b1 = afpvec4(bottom_blob_data[v_offset_1 + 1]);


afp fx = fxy.r; afp fx = fxy.r;
afp fy = fxy.g; afp fy = fxy.g;
@@ -105,8 +108,6 @@ void main()
afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx;
afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx;


res = a * (afp(1.f) - fy) + b * fy;
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(a * (afp(1.f) - fy) + b * fy);
} }

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(res);
} }

+ 5
- 2
src/layer/shader/lrn_norm.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -95,7 +98,7 @@ void main()
scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta)); scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta));
} }


afp v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx];
afp v = afp(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]);


v *= scale; v *= scale;




+ 5
- 2
src/layer/shader/lrn_norm_across_channel_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -75,7 +78,7 @@ void main()


afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta));


afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx];
afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]);


v *= scale; v *= scale;




+ 5
- 2
src/layer/shader/lrn_norm_within_channel_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -75,7 +78,7 @@ void main()


afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta));


afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx];
afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]);


v *= scale; v *= scale;




+ 6
- 3
src/layer/shader/lrn_square_pad.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -62,7 +65,7 @@ void main()
if (z >= 0 && z < p.c) if (z >= 0 && z < p.c)
{ {
int v_offset = z * p.cstep + gy * p.w + gx; int v_offset = z * p.cstep + gy * p.w + gx;
afp v = bottom_blob_data[v_offset];
afp v = afp(bottom_blob_data[v_offset]);
res = v * v; res = v * v;
} }
else else
@@ -78,7 +81,7 @@ void main()
if (x >= 0 && x < p.w && y >= 0 && y < p.h) if (x >= 0 && x < p.w && y >= 0 && y < p.h)
{ {
int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
afp v = bottom_blob_data[v_offset];
afp v = afp(bottom_blob_data[v_offset]);
res = v * v; res = v * v;
} }
else else


+ 5
- 2
src/layer/shader/lrn_square_pad_across_channel_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -62,7 +65,7 @@ void main()
if (z >= 0 && z < p.c) if (z >= 0 && z < p.c)
{ {
int v_offset = z * p.cstep + gy * p.w + gx; int v_offset = z * p.cstep + gy * p.w + gx;
afpvec4 v4 = bottom_blob_data[v_offset];
afpvec4 v4 = afpvec4(bottom_blob_data[v_offset]);


int lane = (gz - pad_head) % 4; int lane = (gz - pad_head) % 4;




+ 5
- 2
src/layer/shader/lrn_square_pad_within_channel_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,7 +66,7 @@ void main()
if (x >= 0 && x < p.w && y >= 0 && y < p.h) if (x >= 0 && x < p.w && y >= 0 && y < p.h)
{ {
int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
afpvec4 v = bottom_blob_data[v_offset];
afpvec4 v = afpvec4(bottom_blob_data[v_offset]);
res = v * v; res = v * v;
} }
else else


+ 9
- 16
src/layer/shader/packing_1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -55,35 +58,25 @@ void main()
{ {
ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3);


// prevent out of range access
// x4 = min(x4, p.w - 1);

v_offset = x4; v_offset = x4;
} }
else if (p.dims == 2) else if (p.dims == 2)
{ {
ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3);


// prevent out of range access
// y4 = min(y4, p.h - 1);

v_offset = y4 * p.w + gx; v_offset = y4 * p.w + gx;
} }
else // if (p.dims == 3) else // if (p.dims == 3)
{ {
ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3);


// prevent out of range access
z4 = min(z4, p.c - 1);

v_offset = z4 * p.cstep + ivec4(gy * p.w + gx); v_offset = z4 * p.cstep + ivec4(gy * p.w + gx);
} }


sfpvec4 v;
v.r = bottom_blob_data[v_offset.r];
v.g = bottom_blob_data[v_offset.g];
v.b = bottom_blob_data[v_offset.b];
v.a = bottom_blob_data[v_offset.a];
int gi = gz * p.outcstep + gy * p.outw + gx;


top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
top_blob_data[gi].r = bottom_blob_data[v_offset.r];
top_blob_data[gi].g = bottom_blob_data[v_offset.g];
top_blob_data[gi].b = bottom_blob_data[v_offset.b];
top_blob_data[gi].a = bottom_blob_data[v_offset.a];
} }

+ 9
- 6
src/layer/shader/packing_4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -70,10 +73,10 @@ void main()
v_offset = z4 * p.outcstep + ivec4(gy * p.outw + gx); v_offset = z4 * p.outcstep + ivec4(gy * p.outw + gx);
} }


sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int gi = gz * p.cstep + gy * p.w + gx;


top_blob_data[v_offset.r] = v.r;
top_blob_data[v_offset.g] = v.g;
top_blob_data[v_offset.b] = v.b;
top_blob_data[v_offset.a] = v.a;
top_blob_data[v_offset.r] = bottom_blob_data[gi].r;
top_blob_data[v_offset.g] = bottom_blob_data[gi].g;
top_blob_data[v_offset.b] = bottom_blob_data[gi].b;
top_blob_data[v_offset.a] = bottom_blob_data[gi].a;
} }

+ 7
- 8
src/layer/shader/padding.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -56,8 +59,6 @@ void main()
if (gx >= p.outw || gy >= p.outh || gz >= p.outc) if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return; return;


sfp res;

int x = gx - left; int x = gx - left;
int y = gy - top; int y = gy - top;


@@ -66,11 +67,11 @@ void main()
if (x >= 0 && x < p.w && y >= 0 && y < p.h) if (x >= 0 && x < p.w && y >= 0 && y < p.h)
{ {
int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }
else else
{ {
res = sfp(value);
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(value);
} }
} }
else if (type == 1) else if (type == 1)
@@ -79,8 +80,6 @@ void main()
y = clamp(y, 0, p.h - 1); y = clamp(y, 0, p.h - 1);


int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res;
} }

+ 7
- 8
src/layer/shader/padding_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -56,8 +59,6 @@ void main()
if (gx >= p.outw || gy >= p.outh || gz >= p.outc) if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return; return;


sfpvec4 res;

int x = gx - left; int x = gx - left;
int y = gy - top; int y = gy - top;


@@ -66,11 +67,11 @@ void main()
if (x >= 0 && x < p.w && y >= 0 && y < p.h) if (x >= 0 && x < p.w && y >= 0 && y < p.h)
{ {
int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }
else else
{ {
res = sfpvec4(value);
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value);
} }
} }
else if (type == 1) else if (type == 1)
@@ -79,8 +80,6 @@ void main()
y = clamp(y, 0, p.h - 1); y = clamp(y, 0, p.h - 1);


int v_offset = gz * p.cstep + y * p.w + x; int v_offset = gz * p.cstep + y * p.w + x;
res = bottom_blob_data[v_offset];
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res;
} }

+ 4
- 1
src/layer/shader/permute.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 9
- 6
src/layer/shader/permute_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -104,10 +107,10 @@ void main()
} }
} }


sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int gi = gz * p.cstep + gy * p.w + gx;


top_blob_data[v_offset.r] = v.r;
top_blob_data[v_offset.g] = v.g;
top_blob_data[v_offset.b] = v.b;
top_blob_data[v_offset.a] = v.a;
top_blob_data[v_offset.r] = bottom_blob_data[gi].r;
top_blob_data[v_offset.g] = bottom_blob_data[gi].g;
top_blob_data[v_offset.b] = bottom_blob_data[gi].b;
top_blob_data[v_offset.a] = bottom_blob_data[gi].a;
} }

+ 4
- 1
src/layer/shader/pooling.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/pooling_global.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/pooling_global_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/pooling_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 10
- 7
src/layer/shader/prelu.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -47,9 +50,9 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afp v = bottom_top_blob_data[gx];
afp v = afp(bottom_top_blob_data[gx]);


afp slope = num_slope > 1 ? slope_blob_data[gx] : slope_blob_data[0];
afp slope = num_slope > 1 ? afp(slope_blob_data[gx]) : afp(slope_blob_data[0]);


v = v < afp(0.f) ? v * slope : v; v = v < afp(0.f) ? v * slope : v;


@@ -62,9 +65,9 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


afp slope = num_slope > 1 ? slope_blob_data[gy] : slope_blob_data[0];
afp slope = num_slope > 1 ? afp(slope_blob_data[gy]) : afp(slope_blob_data[0]);


v = v < afp(0.f) ? v * slope : v; v = v < afp(0.f) ? v * slope : v;


@@ -77,9 +80,9 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


afp slope = num_slope > 1 ? slope_blob_data[gz] : slope_blob_data[0];
afp slope = num_slope > 1 ? afp(slope_blob_data[gz]) : afp(slope_blob_data[0]);


v = v < afp(0.f) ? v * slope : v; v = v < afp(0.f) ? v * slope : v;




+ 10
- 7
src/layer/shader/prelu_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -47,9 +50,9 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afpvec4 v = bottom_top_blob_data[gx];
afpvec4 v = afpvec4(bottom_top_blob_data[gx]);


afpvec4 slope = num_slope > 1 ? slope_blob_data[gx] : afpvec4(slope_blob_data[0]);
afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gx]) : afpvec4(slope_blob_data[0]);


v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); v = mix(v, v * slope, lessThan(v, afpvec4(0.f)));


@@ -62,9 +65,9 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


afpvec4 slope = num_slope > 1 ? slope_blob_data[gy] : afpvec4(slope_blob_data[0]);
afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gy]) : afpvec4(slope_blob_data[0]);


v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); v = mix(v, v * slope, lessThan(v, afpvec4(0.f)));


@@ -77,9 +80,9 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


afpvec4 slope = num_slope > 1 ? slope_blob_data[gz] : afpvec4(slope_blob_data[0]);
afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gz]) : afpvec4(slope_blob_data[0]);


v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); v = mix(v, v * slope, lessThan(v, afpvec4(0.f)));




+ 23
- 10
src/layer/shader/priorbox.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -69,14 +72,12 @@ void main()


afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h);


sfpvec4 variance = sfpvec4(variances_0, variances_1, variances_2, variances_3);

afpvec4 box; afpvec4 box;


afp box_w; afp box_w;
afp box_h; afp box_h;


afp min_size = min_sizes_data[gx];
afp min_size = afp(min_sizes_data[gx]);


// min size box // min size box
box_w = box_h = min_size; box_w = box_h = min_size;
@@ -84,14 +85,17 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;


top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box);
top_blob_data[var_offset] = variance;
top_blob_data[var_offset].r = sfp(variances_0);
top_blob_data[var_offset].g = sfp(variances_1);
top_blob_data[var_offset].b = sfp(variances_2);
top_blob_data[var_offset].a = sfp(variances_3);


v_offset += 1; v_offset += 1;
var_offset += 1; var_offset += 1;


if (num_max_size > 0) if (num_max_size > 0)
{ {
afp max_size = max_sizes_data[gx];
afp max_size = afp(max_sizes_data[gx]);


// max size box // max size box
box_w = box_h = sqrt(min_size * max_size); box_w = box_h = sqrt(min_size * max_size);
@@ -99,7 +103,10 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;


top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box);
top_blob_data[var_offset] = variance;
top_blob_data[var_offset].r = sfp(variances_0);
top_blob_data[var_offset].g = sfp(variances_1);
top_blob_data[var_offset].b = sfp(variances_2);
top_blob_data[var_offset].a = sfp(variances_3);


v_offset += 1; v_offset += 1;
var_offset += 1; var_offset += 1;
@@ -108,7 +115,7 @@ void main()
// all aspect_ratios // all aspect_ratios
for (int pi = 0; pi < num_aspect_ratio; pi++) for (int pi = 0; pi < num_aspect_ratio; pi++)
{ {
afp ar = aspect_ratios_data[pi];
afp ar = afp(aspect_ratios_data[pi]);


box_w = min_size * sqrt(ar); box_w = min_size * sqrt(ar);
box_h = min_size / sqrt(ar); box_h = min_size / sqrt(ar);
@@ -116,7 +123,10 @@ void main()
box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm;


top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box);
top_blob_data[var_offset] = variance;
top_blob_data[var_offset].r = sfp(variances_0);
top_blob_data[var_offset].g = sfp(variances_1);
top_blob_data[var_offset].b = sfp(variances_2);
top_blob_data[var_offset].a = sfp(variances_3);


v_offset += 1; v_offset += 1;
var_offset += 1; var_offset += 1;
@@ -126,7 +136,10 @@ void main()
box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm;


top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box);
top_blob_data[var_offset] = variance;
top_blob_data[var_offset].r = sfp(variances_0);
top_blob_data[var_offset].g = sfp(variances_1);
top_blob_data[var_offset].b = sfp(variances_2);
top_blob_data[var_offset].a = sfp(variances_3);


v_offset += 1; v_offset += 1;
var_offset += 1; var_offset += 1;


+ 6
- 3
src/layer/shader/priorbox_mxnet.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -58,7 +61,7 @@ void main()
afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); afpvec4 center = afpvec4(center_x, center_y, center_x, center_y);


// ratio = 1, various sizes // ratio = 1, various sizes
afp size = min_sizes_data[gx];
afp size = afp(min_sizes_data[gx]);
afp cw = size * afp(p.h) / afp(p.w) / afp(2); afp cw = size * afp(p.h) / afp(p.w) / afp(2);
afp ch = size / afp(2); afp ch = size / afp(2);


@@ -69,7 +72,7 @@ void main()
if (gx == num_sizes - 1) if (gx == num_sizes - 1)
{ {
// various ratios, size = min_size = size[0] // various ratios, size = min_size = size[0]
afp size = min_sizes_data[0];
afp size = afp(min_sizes_data[0]);
for (int pi = 1; pi < num_ratios; pi++) for (int pi = 1; pi < num_ratios; pi++)
{ {
afp ratio = sqrt(afp(aspect_ratios_data[pi])); afp ratio = sqrt(afp(aspect_ratios_data[pi]));


+ 5
- 2
src/layer/shader/relu.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


if (slope == 0) if (slope == 0)
v = max(v, afp(0.f)); v = max(v, afp(0.f));


+ 5
- 2
src/layer/shader/relu_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


if (slope == 0) if (slope == 0)
v = max(v, afp(0.f)); v = max(v, afp(0.f));


+ 5
- 4
src/layer/shader/reorg.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -58,7 +61,5 @@ void main()


int v_offset = z * p.cstep + y * p.w + x; int v_offset = z * p.cstep + y * p.w + x;


sfp v = bottom_blob_data[v_offset];

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset];
} }

+ 9
- 8
src/layer/shader/reorg_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -60,12 +63,10 @@ void main()


ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4;


sfpvec4 v;

v.r = bottom_blob_data[v_offset.r];
v.g = bottom_blob_data[v_offset.g];
v.b = bottom_blob_data[v_offset.b];
v.a = bottom_blob_data[v_offset.a];
int gi = gz * p.outcstep + gy * p.outw + gx;


top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
top_blob_data[gi].r = bottom_blob_data[v_offset.r];
top_blob_data[gi].g = bottom_blob_data[v_offset.g];
top_blob_data[gi].b = bottom_blob_data[v_offset.b];
top_blob_data[gi].a = bottom_blob_data[v_offset.a];
} }

+ 25
- 30
src/layer/shader/reorg_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -63,33 +66,25 @@ void main()
ivec4 lane4 = z4 % 4; ivec4 lane4 = z4 % 4;


// v = v4[lane] // v = v4[lane]
sfpvec4 v;

sfpvec4 v4;

v4 = bottom_blob_data[v_offset.r];
if (lane4.r == 0) v.r = v4.r;
else if (lane4.r == 1) v.r = v4.g;
else if (lane4.r == 2) v.r = v4.b;
else /* if (lane4.r == 3) */ v.r = v4.a;

v4 = bottom_blob_data[v_offset.g];
if (lane4.g == 0) v.g = v4.r;
else if (lane4.g == 1) v.g = v4.g;
else if (lane4.g == 2) v.g = v4.b;
else /* if (lane4.g == 3) */ v.g = v4.a;

v4 = bottom_blob_data[v_offset.b];
if (lane4.b == 0) v.b = v4.r;
else if (lane4.b == 1) v.b = v4.g;
else if (lane4.b == 2) v.b = v4.b;
else /* if (lane4.b == 3) */ v.b = v4.a;

v4 = bottom_blob_data[v_offset.a];
if (lane4.a == 0) v.a = v4.r;
else if (lane4.a == 1) v.a = v4.g;
else if (lane4.a == 2) v.a = v4.b;
else /* if (lane4.a == 3) */ v.a = v4.a;

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
int gi = gz * p.outcstep + gy * p.outw + gx;

if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r;
else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g;
else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b;
else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a;

if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r;
else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g;
else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b;
else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a;

if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r;
else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g;
else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b;
else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a;

if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r;
else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g;
else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b;
else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a;
} }

+ 19
- 6
src/layer/shader/reshape.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -65,9 +68,19 @@ void main()


int v_offset = z * p.cstep + y * p.w + x; int v_offset = z * p.cstep + y * p.w + x;


sfp v = bottom_blob_data[v_offset];

if (ndim == 1) top_blob_data[gx] = v;
if (ndim == 2) top_blob_data[gy * p.outw + gx] = v;
if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
int gi;
if (ndim == 1)
{
gi = gx;
}
if (ndim == 2)
{
gi = gy * p.outw + gx;
}
if (ndim == 3)
{
gi = gz * p.outcstep + gy * p.outw + gx;
}

top_blob_data[gi] = bottom_blob_data[v_offset];
} }

+ 22
- 10
src/layer/shader/reshape_pack1to4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -65,13 +68,22 @@ void main()


ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4;


sfpvec4 v;
v.r = bottom_blob_data[v_offset.r];
v.g = bottom_blob_data[v_offset.g];
v.b = bottom_blob_data[v_offset.b];
v.a = bottom_blob_data[v_offset.a];

if (ndim == 1) top_blob_data[gx] = v;
if (ndim == 2) top_blob_data[gy * p.outw + gx] = v;
if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
int gi;
if (ndim == 1)
{
gi = gx;
}
if (ndim == 2)
{
gi = gy * p.outw + gx;
}
if (ndim == 3)
{
gi = gz * p.outcstep + gy * p.outw + gx;
}

top_blob_data[gi].r = bottom_blob_data[v_offset.r];
top_blob_data[gi].g = bottom_blob_data[v_offset.g];
top_blob_data[gi].b = bottom_blob_data[v_offset.b];
top_blob_data[gi].a = bottom_blob_data[v_offset.a];
} }

+ 37
- 32
src/layer/shader/reshape_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -85,35 +88,37 @@ void main()
lane4 = z4 % 4; lane4 = z4 % 4;
} }


sfpvec4 v;

sfpvec4 v4;

v4 = bottom_blob_data[v_offset.r];
if (lane4.r == 0) v.r = v4.r;
else if (lane4.r == 1) v.r = v4.g;
else if (lane4.r == 2) v.r = v4.b;
else /* if (lane4.r == 3) */ v.r = v4.a;

v4 = bottom_blob_data[v_offset.g];
if (lane4.g == 0) v.g = v4.r;
else if (lane4.g == 1) v.g = v4.g;
else if (lane4.g == 2) v.g = v4.b;
else /* if (lane4.g == 3) */ v.g = v4.a;

v4 = bottom_blob_data[v_offset.b];
if (lane4.b == 0) v.b = v4.r;
else if (lane4.b == 1) v.b = v4.g;
else if (lane4.b == 2) v.b = v4.b;
else /* if (lane4.b == 3) */ v.b = v4.a;

v4 = bottom_blob_data[v_offset.a];
if (lane4.a == 0) v.a = v4.r;
else if (lane4.a == 1) v.a = v4.g;
else if (lane4.a == 2) v.a = v4.b;
else /* if (lane4.a == 3) */ v.a = v4.a;

if (ndim == 1) top_blob_data[gx] = v;
if (ndim == 2) top_blob_data[gy * p.outw + gx] = v;
if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
int gi;
if (ndim == 1)
{
gi = gx;
}
if (ndim == 2)
{
gi = gy * p.outw + gx;
}
if (ndim == 3)
{
gi = gz * p.outcstep + gy * p.outw + gx;
}

if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r;
else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g;
else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b;
else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a;

if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r;
else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g;
else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b;
else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a;

if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r;
else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g;
else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b;
else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a;

if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r;
else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g;
else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b;
else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a;
} }

+ 9
- 6
src/layer/shader/reshape_pack4to1.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -81,10 +84,10 @@ void main()
v_offset = z4 * p.outcstep + y4 * p.outw + x4; v_offset = z4 * p.outcstep + y4 * p.outw + x4;
} }


sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx];
int gi = gz * p.cstep + gy * p.w + gx;


top_blob_data[v_offset.r] = v.r;
top_blob_data[v_offset.g] = v.g;
top_blob_data[v_offset.b] = v.b;
top_blob_data[v_offset.a] = v.a;
top_blob_data[v_offset.r] = bottom_blob_data[gi].r;
top_blob_data[v_offset.g] = bottom_blob_data[gi].g;
top_blob_data[v_offset.b] = bottom_blob_data[gi].b;
top_blob_data[v_offset.a] = bottom_blob_data[gi].a;
} }

+ 7
- 4
src/layer/shader/scale.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -48,7 +51,7 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afp v = bottom_top_blob_data[gx];
afp v = afp(bottom_top_blob_data[gx]);


if (bias_term == 1) if (bias_term == 1)
v = afp(scale_blob_data[gx]) * v + afp(bias_blob_data[gx]); v = afp(scale_blob_data[gx]) * v + afp(bias_blob_data[gx]);
@@ -64,7 +67,7 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


if (bias_term == 1) if (bias_term == 1)
v = afp(scale_blob_data[gy]) * v + afp(bias_blob_data[gy]); v = afp(scale_blob_data[gy]) * v + afp(bias_blob_data[gy]);
@@ -80,7 +83,7 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


if (bias_term == 1) if (bias_term == 1)
v = afp(scale_blob_data[gz]) * v + afp(bias_blob_data[gz]); v = afp(scale_blob_data[gz]) * v + afp(bias_blob_data[gz]);


+ 7
- 4
src/layer/shader/scale_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -48,7 +51,7 @@ void main()


if (p.dims == 1) if (p.dims == 1)
{ {
afpvec4 v = bottom_top_blob_data[gx];
afpvec4 v = afpvec4(bottom_top_blob_data[gx]);


if (bias_term == 1) if (bias_term == 1)
v = afpvec4(scale_blob_data[gx]) * v + afpvec4(bias_blob_data[gx]); v = afpvec4(scale_blob_data[gx]) * v + afpvec4(bias_blob_data[gx]);
@@ -64,7 +67,7 @@ void main()
{ {
const int gi = gy * p.w + gx; const int gi = gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


if (bias_term == 1) if (bias_term == 1)
v = afpvec4(scale_blob_data[gy]) * v + afpvec4(bias_blob_data[gy]); v = afpvec4(scale_blob_data[gy]) * v + afpvec4(bias_blob_data[gy]);
@@ -80,7 +83,7 @@ void main()
{ {
const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


if (bias_term == 1) if (bias_term == 1)
v = afpvec4(scale_blob_data[gz]) * v + afpvec4(bias_blob_data[gz]); v = afpvec4(scale_blob_data[gz]) * v + afpvec4(bias_blob_data[gz]);


+ 4
- 1
src/layer/shader/shufflechannel.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 25
- 30
src/layer/shader/shufflechannel_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -62,33 +65,25 @@ void main()
ivec4 lane4 = z4 % 4; ivec4 lane4 = z4 % 4;


// v = v4[lane] // v = v4[lane]
sfpvec4 v;

sfpvec4 v4;

v4 = bottom_blob_data[v_offset.r];
if (lane4.r == 0) v.r = v4.r;
else if (lane4.r == 1) v.r = v4.g;
else if (lane4.r == 2) v.r = v4.b;
else /* if (lane4.r == 3) */ v.r = v4.a;

v4 = bottom_blob_data[v_offset.g];
if (lane4.g == 0) v.g = v4.r;
else if (lane4.g == 1) v.g = v4.g;
else if (lane4.g == 2) v.g = v4.b;
else /* if (lane4.g == 3) */ v.g = v4.a;

v4 = bottom_blob_data[v_offset.b];
if (lane4.b == 0) v.b = v4.r;
else if (lane4.b == 1) v.b = v4.g;
else if (lane4.b == 2) v.b = v4.b;
else /* if (lane4.b == 3) */ v.b = v4.a;

v4 = bottom_blob_data[v_offset.a];
if (lane4.a == 0) v.a = v4.r;
else if (lane4.a == 1) v.a = v4.g;
else if (lane4.a == 2) v.a = v4.b;
else /* if (lane4.a == 3) */ v.a = v4.a;

top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v;
int gi = gz * p.outcstep + gy * p.outw + gx;

if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r;
else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g;
else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b;
else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a;

if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r;
else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g;
else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b;
else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a;

if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r;
else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g;
else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b;
else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a;

if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r;
else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g;
else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b;
else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a;
} }

+ 5
- 2
src/layer/shader/sigmoid.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


v = afp(1.f) / (afp(1.f) + exp(-v)); v = afp(1.f) / (afp(1.f) + exp(-v));




+ 5
- 2
src/layer/shader/sigmoid_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


v = afp(1.f) / (afp(1.f) + exp(-v)); v = afp(1.f) / (afp(1.f) + exp(-v));




+ 16
- 13
src/layer/shader/softmax_div_sum.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -53,8 +56,8 @@ void main()


if (p.dims == 1) // axis == 0 if (p.dims == 1) // axis == 0
{ {
afp sum = sum_workspace_data[0];
afp v = bottom_top_blob_data[gx];
afp sum = afp(sum_workspace_data[0]);
afp v = afp(bottom_top_blob_data[gx]);
bottom_top_blob_data[gx] = sfp(v / sum); bottom_top_blob_data[gx] = sfp(v / sum);
return; return;
} }
@@ -62,8 +65,8 @@ void main()
if (p.dims == 2 && axis == 0) if (p.dims == 2 && axis == 0)
{ {
int gi = gy * p.w + gx; int gi = gy * p.w + gx;
afp sum = sum_workspace_data[gx];
afp v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gx]);
afp v = afp(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfp(v / sum); bottom_top_blob_data[gi] = sfp(v / sum);
return; return;
} }
@@ -71,8 +74,8 @@ void main()
if (p.dims == 2 && axis == 1) if (p.dims == 2 && axis == 1)
{ {
int gi = gy * p.w + gx; int gi = gy * p.w + gx;
afp sum = sum_workspace_data[gy];
afp v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gy]);
afp v = afp(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfp(v / sum); bottom_top_blob_data[gi] = sfp(v / sum);
return; return;
} }
@@ -80,8 +83,8 @@ void main()
if (p.dims == 3 && axis == 0) if (p.dims == 3 && axis == 0)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gy * p.w + gx];
afp v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gy * p.w + gx]);
afp v = afp(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfp(v / sum); bottom_top_blob_data[gi] = sfp(v / sum);
return; return;
} }
@@ -89,8 +92,8 @@ void main()
if (p.dims == 3 && axis == 1) if (p.dims == 3 && axis == 1)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gz * p.w + gx];
afp v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gz * p.w + gx]);
afp v = afp(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfp(v / sum); bottom_top_blob_data[gi] = sfp(v / sum);
return; return;
} }
@@ -98,8 +101,8 @@ void main()
if (p.dims == 3 && axis == 2) if (p.dims == 3 && axis == 2)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gz * p.h + gy];
afp v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gz * p.h + gy]);
afp v = afp(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfp(v / sum); bottom_top_blob_data[gi] = sfp(v / sum);
return; return;
} }


+ 16
- 13
src/layer/shader/softmax_div_sum_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -53,8 +56,8 @@ void main()


if (p.dims == 1) // axis == 0 if (p.dims == 1) // axis == 0
{ {
afp sum = sum_workspace_data[0];
afpvec4 v = bottom_top_blob_data[gx];
afp sum = afp(sum_workspace_data[0]);
afpvec4 v = afpvec4(bottom_top_blob_data[gx]);
bottom_top_blob_data[gx] = sfpvec4(v / sum); bottom_top_blob_data[gx] = sfpvec4(v / sum);
return; return;
} }
@@ -62,8 +65,8 @@ void main()
if (p.dims == 2 && axis == 0) if (p.dims == 2 && axis == 0)
{ {
int gi = gy * p.w + gx; int gi = gy * p.w + gx;
afp sum = sum_workspace_data[gx];
afpvec4 v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum); bottom_top_blob_data[gi] = sfpvec4(v / sum);
return; return;
} }
@@ -71,8 +74,8 @@ void main()
if (p.dims == 2 && axis == 1) if (p.dims == 2 && axis == 1)
{ {
int gi = gy * p.w + gx; int gi = gy * p.w + gx;
afp sum = sum_workspace_data[gy];
afpvec4 v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gy]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum); bottom_top_blob_data[gi] = sfpvec4(v / sum);
return; return;
} }
@@ -80,8 +83,8 @@ void main()
if (p.dims == 3 && axis == 0) if (p.dims == 3 && axis == 0)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gy * p.w + gx];
afpvec4 v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gy * p.w + gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum); bottom_top_blob_data[gi] = sfpvec4(v / sum);
return; return;
} }
@@ -89,8 +92,8 @@ void main()
if (p.dims == 3 && axis == 1) if (p.dims == 3 && axis == 1)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gz * p.w + gx];
afpvec4 v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gz * p.w + gx]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum); bottom_top_blob_data[gi] = sfpvec4(v / sum);
return; return;
} }
@@ -98,8 +101,8 @@ void main()
if (p.dims == 3 && axis == 2) if (p.dims == 3 && axis == 2)
{ {
int gi = gz * p.cstep + gy * p.w + gx; int gi = gz * p.cstep + gy * p.w + gx;
afp sum = sum_workspace_data[gz * p.h + gy];
afpvec4 v = bottom_top_blob_data[gi];
afp sum = afp(sum_workspace_data[gz * p.h + gy]);
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);
bottom_top_blob_data[gi] = sfpvec4(v / sum); bottom_top_blob_data[gi] = sfpvec4(v / sum);
return; return;
} }


+ 4
- 1
src/layer/shader/softmax_exp_sub_max.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/softmax_exp_sub_max_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/softmax_reduce_max.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/softmax_reduce_max_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/softmax_reduce_sum.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 4
- 1
src/layer/shader/softmax_reduce_sum_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif




+ 5
- 2
src/layer/shader/tanh.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfp(tanh(v)); bottom_top_blob_data[gi] = sfp(tanh(v));
} }

+ 5
- 2
src/layer/shader/tanh_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -44,7 +47,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


bottom_top_blob_data[gi] = sfpvec4(tanh(v)); bottom_top_blob_data[gi] = sfpvec4(tanh(v));
} }

+ 5
- 2
src/layer/shader/unaryop.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afp v = bottom_top_blob_data[gi];
afp v = afp(bottom_top_blob_data[gi]);


afp res; afp res;




+ 5
- 2
src/layer/shader/unaryop_pack4.comp View File

@@ -14,7 +14,10 @@


#version 450 #version 450


#if NCNN_fp16_storage || NCNN_fp16_arithmetic
#if NCNN_fp16_storage
#extension GL_EXT_shader_16bit_storage: require
#endif
#if NCNN_fp16_arithmetic
#extension GL_AMD_gpu_shader_half_float: require #extension GL_AMD_gpu_shader_half_float: require
#endif #endif


@@ -46,7 +49,7 @@ void main()


const int gi = gz * p.cstep + gy * p.w + gx; const int gi = gz * p.cstep + gy * p.w + gx;


afpvec4 v = bottom_top_blob_data[gi];
afpvec4 v = afpvec4(bottom_top_blob_data[gi]);


afpvec4 res; afpvec4 res;




Loading…
Cancel
Save