From 58ed8e437f4e3689708f1941d6f736f70809cee2 Mon Sep 17 00:00:00 2001 From: nihui Date: Sat, 23 Mar 2019 14:04:20 +0800 Subject: [PATCH] require GL_EXT_shader_16bit_storage only for fp16_storage, explicit type cast --- src/CMakeLists.txt | 2 +- src/layer/shader/absval.comp | 7 +- src/layer/shader/absval_pack4.comp | 7 +- src/layer/shader/batchnorm.comp | 17 +++-- src/layer/shader/batchnorm_pack4.comp | 17 +++-- src/layer/shader/binaryop.comp | 9 ++- src/layer/shader/binaryop_pack4.comp | 9 ++- src/layer/shader/clip.comp | 7 +- src/layer/shader/clip_pack4.comp | 7 +- src/layer/shader/concat.comp | 34 +++++---- src/layer/shader/concat_pack4.comp | 34 +++++---- src/layer/shader/concat_pack4to1.comp | 27 ++++---- src/layer/shader/convolution.comp | 7 +- src/layer/shader/convolution_1x1s1d1.comp | 5 +- src/layer/shader/convolution_pack1to4.comp | 11 +-- src/layer/shader/convolution_pack4.comp | 28 ++++++-- src/layer/shader/convolution_pack4to1.comp | 11 +-- src/layer/shader/convolutiondepthwise.comp | 7 +- .../shader/convolutiondepthwise_group.comp | 7 +- .../convolutiondepthwise_group_pack1to4.comp | 11 +-- .../convolutiondepthwise_group_pack4.comp | 26 +++++-- .../convolutiondepthwise_group_pack4to1.comp | 11 +-- .../shader/convolutiondepthwise_pack4.comp | 13 ++-- src/layer/shader/crop.comp | 5 +- src/layer/shader/crop_pack4.comp | 5 +- src/layer/shader/deconvolution.comp | 7 +- src/layer/shader/deconvolution_pack1to4.comp | 11 +-- src/layer/shader/deconvolution_pack4.comp | 28 ++++++-- src/layer/shader/deconvolution_pack4to1.comp | 11 +-- src/layer/shader/deconvolutiondepthwise.comp | 7 +- .../shader/deconvolutiondepthwise_group.comp | 7 +- ...deconvolutiondepthwise_group_pack1to4.comp | 11 +-- .../deconvolutiondepthwise_group_pack4.comp | 28 ++++++-- ...deconvolutiondepthwise_group_pack4to1.comp | 11 +-- .../shader/deconvolutiondepthwise_pack4.comp | 11 +-- src/layer/shader/dropout.comp | 7 +- src/layer/shader/dropout_pack4.comp | 7 +- 
src/layer/shader/eltwise.comp | 9 ++- src/layer/shader/eltwise_pack4.comp | 9 ++- src/layer/shader/flatten.comp | 5 +- src/layer/shader/flatten_pack4.comp | 16 ++--- src/layer/shader/innerproduct.comp | 7 +- src/layer/shader/innerproduct_pack1to4.comp | 13 ++-- src/layer/shader/innerproduct_pack4.comp | 30 ++++++-- src/layer/shader/innerproduct_pack4to1.comp | 11 +-- src/layer/shader/interp.comp | 21 +++--- src/layer/shader/interp_pack4.comp | 21 +++--- src/layer/shader/lrn_norm.comp | 7 +- .../shader/lrn_norm_across_channel_pack4.comp | 7 +- .../shader/lrn_norm_within_channel_pack4.comp | 7 +- src/layer/shader/lrn_square_pad.comp | 9 ++- .../lrn_square_pad_across_channel_pack4.comp | 7 +- .../lrn_square_pad_within_channel_pack4.comp | 7 +- src/layer/shader/packing_1to4.comp | 25 +++---- src/layer/shader/packing_4to1.comp | 15 ++-- src/layer/shader/padding.comp | 15 ++-- src/layer/shader/padding_pack4.comp | 15 ++-- src/layer/shader/permute.comp | 5 +- src/layer/shader/permute_pack4to1.comp | 15 ++-- src/layer/shader/pooling.comp | 5 +- src/layer/shader/pooling_global.comp | 5 +- src/layer/shader/pooling_global_pack4.comp | 5 +- src/layer/shader/pooling_pack4.comp | 5 +- src/layer/shader/prelu.comp | 17 +++-- src/layer/shader/prelu_pack4.comp | 17 +++-- src/layer/shader/priorbox.comp | 33 ++++++--- src/layer/shader/priorbox_mxnet.comp | 9 ++- src/layer/shader/relu.comp | 7 +- src/layer/shader/relu_pack4.comp | 7 +- src/layer/shader/reorg.comp | 9 +-- src/layer/shader/reorg_pack1to4.comp | 17 ++--- src/layer/shader/reorg_pack4.comp | 55 +++++++-------- src/layer/shader/reshape.comp | 25 +++++-- src/layer/shader/reshape_pack1to4.comp | 32 ++++++--- src/layer/shader/reshape_pack4.comp | 69 ++++++++++--------- src/layer/shader/reshape_pack4to1.comp | 15 ++-- src/layer/shader/scale.comp | 11 +-- src/layer/shader/scale_pack4.comp | 11 +-- src/layer/shader/shufflechannel.comp | 5 +- src/layer/shader/shufflechannel_pack4.comp | 55 +++++++-------- 
src/layer/shader/sigmoid.comp | 7 +- src/layer/shader/sigmoid_pack4.comp | 7 +- src/layer/shader/softmax_div_sum.comp | 29 ++++---- src/layer/shader/softmax_div_sum_pack4.comp | 29 ++++---- src/layer/shader/softmax_exp_sub_max.comp | 5 +- .../shader/softmax_exp_sub_max_pack4.comp | 5 +- src/layer/shader/softmax_reduce_max.comp | 5 +- .../shader/softmax_reduce_max_pack4.comp | 5 +- src/layer/shader/softmax_reduce_sum.comp | 5 +- .../shader/softmax_reduce_sum_pack4.comp | 5 +- src/layer/shader/tanh.comp | 7 +- src/layer/shader/tanh_pack4.comp | 7 +- src/layer/shader/unaryop.comp | 7 +- src/layer/shader/unaryop_pack4.comp | 7 +- 94 files changed, 809 insertions(+), 481 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 148fd11e9..d675f7cba 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -112,7 +112,7 @@ macro(ncnn_add_layer class) add_custom_command( OUTPUT ${SHADER_fp16s_SPV_HEX_FILE} COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} - ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpmat4=f16mat4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC} + ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC} DEPENDS ${SHADER_SRC} COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv" VERBATIM diff --git a/src/layer/shader/absval.comp b/src/layer/shader/absval.comp index 64b2aa30a..9cb8f246f 100644 --- a/src/layer/shader/absval.comp +++ b/src/layer/shader/absval.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif 
@@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(abs(v)); } diff --git a/src/layer/shader/absval_pack4.comp b/src/layer/shader/absval_pack4.comp index 4bfadd8de..c6bef96c0 100644 --- a/src/layer/shader/absval_pack4.comp +++ b/src/layer/shader/absval_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(abs(v)); } diff --git a/src/layer/shader/batchnorm.comp b/src/layer/shader/batchnorm.comp index 4466b4261..12f7d8362 100644 --- a/src/layer/shader/batchnorm.comp +++ b/src/layer/shader/batchnorm.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,9 +49,9 @@ void main() if (p.dims == 1) { - afp v = bottom_top_blob_data[gx]; + afp v = afp(bottom_top_blob_data[gx]); - v = b_data[gx] * v + a_data[gx]; + v = afp(b_data[gx]) * v + afp(a_data[gx]); bottom_top_blob_data[gx] = sfp(v); @@ -59,9 +62,9 @@ void main() { const int gi = gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); - v = b_data[gy] * v + a_data[gy]; + v = afp(b_data[gy]) * v + afp(a_data[gy]); bottom_top_blob_data[gi] = sfp(v); @@ -72,9 +75,9 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); - v = b_data[gz] * v + a_data[gz]; + v = afp(b_data[gz]) * v + 
afp(a_data[gz]); bottom_top_blob_data[gi] = sfp(v); diff --git a/src/layer/shader/batchnorm_pack4.comp b/src/layer/shader/batchnorm_pack4.comp index 302dcb436..00a7dc7b9 100644 --- a/src/layer/shader/batchnorm_pack4.comp +++ b/src/layer/shader/batchnorm_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,9 +49,9 @@ void main() if (p.dims == 1) { - afpvec4 v = bottom_top_blob_data[gx]; + afpvec4 v = afpvec4(bottom_top_blob_data[gx]); - v = b_data[gx] * v + a_data[gx]; + v = afpvec4(b_data[gx]) * v + afpvec4(a_data[gx]); bottom_top_blob_data[gx] = sfpvec4(v); @@ -59,9 +62,9 @@ void main() { const int gi = gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - v = b_data[gy] * v + a_data[gy]; + v = afpvec4(b_data[gy]) * v + afpvec4(a_data[gy]); bottom_top_blob_data[gi] = sfpvec4(v); @@ -72,9 +75,9 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - v = b_data[gz] * v + a_data[gz]; + v = afpvec4(b_data[gz]) * v + afpvec4(a_data[gz]); bottom_top_blob_data[gi] = sfpvec4(v); diff --git a/src/layer/shader/binaryop.comp b/src/layer/shader/binaryop.comp index 4dabda3c8..518f85b0a 100644 --- a/src/layer/shader/binaryop.comp +++ b/src/layer/shader/binaryop.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -62,7 +65,7 @@ void main() const int gi = gz * p.outcstep + gy * p.outw + gx; - afp v1 = a_blob_data[gi]; + afp v1 = afp(a_blob_data[gi]); afp res; @@ -87,7 +90,7 @@ void main() if (p.adims == p.bdims) { - afp v2 = 
b_blob_data[gi]; + afp v2 = afp(b_blob_data[gi]); if (op_type == 0) res = v1 + v2; if (op_type == 1) res = v1 - v2; diff --git a/src/layer/shader/binaryop_pack4.comp b/src/layer/shader/binaryop_pack4.comp index 6d070eab4..e85195e9c 100644 --- a/src/layer/shader/binaryop_pack4.comp +++ b/src/layer/shader/binaryop_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -62,7 +65,7 @@ void main() const int gi = gz * p.outcstep + gy * p.outw + gx; - afpvec4 v1 = a_blob_data[gi]; + afpvec4 v1 = afpvec4(a_blob_data[gi]); afpvec4 res; @@ -87,7 +90,7 @@ void main() if (p.adims == p.bdims) { - afpvec4 v2 = b_blob_data[gi]; + afpvec4 v2 = afpvec4(b_blob_data[gi]); if (op_type == 0) res = v1 + v2; if (op_type == 1) res = v1 - v2; diff --git a/src/layer/shader/clip.comp b/src/layer/shader/clip.comp index e7ba10938..7b8ed94a4 100644 --- a/src/layer/shader/clip.comp +++ b/src/layer/shader/clip.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -47,7 +50,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(clamp(v, afp(const_min), afp(const_max))); } diff --git a/src/layer/shader/clip_pack4.comp b/src/layer/shader/clip_pack4.comp index 27043e657..980e7ddc5 100644 --- a/src/layer/shader/clip_pack4.comp +++ b/src/layer/shader/clip_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif 
@@ -47,7 +50,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(clamp(v, afp(const_min), afp(const_max))); } diff --git a/src/layer/shader/concat.comp b/src/layer/shader/concat.comp index 09535f648..c7e38d4fe 100644 --- a/src/layer/shader/concat.comp +++ b/src/layer/shader/concat.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -53,39 +56,34 @@ void main() if (gx >= p.w || gy >= p.h || gz >= p.c) return; - int v_offset; - sfp v; - if (p.dims == 1) // axis == 0 { - v_offset = gx + p.offset; - v = bottom_blob_data[gx]; + int v_offset = gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gx]; } else if (p.dims == 2 && axis == 0) { - v_offset = (gy + p.offset) * p.outw + gx; - v = bottom_blob_data[gy * p.w + gx]; + int v_offset = (gy + p.offset) * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; } else if (p.dims == 2 && axis == 1) { - v_offset = gy * p.outw + gx + p.offset; - v = bottom_blob_data[gy * p.w + gx]; + int v_offset = gy * p.outw + gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; } else if (p.dims == 3 && axis == 0) { - v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } else if (p.dims == 3 && axis == 1) { - v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } else 
if (p.dims == 3 && axis == 2) { - v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } - - top_blob_data[v_offset] = v; } diff --git a/src/layer/shader/concat_pack4.comp b/src/layer/shader/concat_pack4.comp index fe4642fcb..27cecb2cb 100644 --- a/src/layer/shader/concat_pack4.comp +++ b/src/layer/shader/concat_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -53,39 +56,34 @@ void main() if (gx >= p.w || gy >= p.h || gz >= p.c) return; - int v_offset; - sfpvec4 v; - if (p.dims == 1) // axis == 0 { - v_offset = gx + p.offset; - v = bottom_blob_data[gx]; + int v_offset = gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gx]; } else if (p.dims == 2 && axis == 0) { - v_offset = (gy + p.offset) * p.outw + gx; - v = bottom_blob_data[gy * p.w + gx]; + int v_offset = (gy + p.offset) * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; } else if (p.dims == 2 && axis == 1) { - v_offset = gy * p.outw + gx + p.offset; - v = bottom_blob_data[gy * p.w + gx]; + int v_offset = gy * p.outw + gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; } else if (p.dims == 3 && axis == 0) { - v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } else if (p.dims == 3 && axis == 1) { - v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = gz * p.outcstep + (gy + 
p.offset) * p.outw + gx; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } else if (p.dims == 3 && axis == 2) { - v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; + top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; } - - top_blob_data[v_offset] = v; } diff --git a/src/layer/shader/concat_pack4to1.comp b/src/layer/shader/concat_pack4to1.comp index 703c36fcd..e801d91fb 100644 --- a/src/layer/shader/concat_pack4to1.comp +++ b/src/layer/shader/concat_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -54,41 +57,41 @@ void main() return; ivec4 v_offset; - sfpvec4 v; + int gi; if (p.dims == 1) // axis == 0 { v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3); - v = bottom_blob_data[gx]; + gi = gx; } else if (p.dims == 2 && axis == 0) { v_offset = (ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx; - v = bottom_blob_data[gy * p.w + gx]; + gi = gy * p.w + gx; } else if (p.dims == 2 && axis == 1) { v_offset = (ivec4(gx * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset; - v = bottom_blob_data[gy * p.w + gx]; + gi = gy * p.w + gx; } else if (p.dims == 3 && axis == 0) { v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + gi = gz * p.cstep + gy * p.w + gx; } else if (p.dims == 3 && axis == 1) { v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + gi = gz * p.cstep + gy * p.w + gx; } else if (p.dims == 3 && axis == 2) { v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx 
+ p.offset; - v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + gi = gz * p.cstep + gy * p.w + gx; } - top_blob_data[v_offset.r] = v.r; - top_blob_data[v_offset.g] = v.g; - top_blob_data[v_offset.b] = v.b; - top_blob_data[v_offset.a] = v.a; + top_blob_data[v_offset.r] = bottom_blob_data[gi].r; + top_blob_data[v_offset.g] = bottom_blob_data[gi].g; + top_blob_data[v_offset.b] = bottom_blob_data[gi].b; + top_blob_data[v_offset.a] = bottom_blob_data[gi].a; } diff --git a/src/layer/shader/convolution.comp b/src/layer/shader/convolution.comp index ece8f8c54..fe8799e65 100644 --- a/src/layer/shader/convolution.comp +++ b/src/layer/shader/convolution.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/convolution_1x1s1d1.comp b/src/layer/shader/convolution_1x1s1d1.comp index d789fbdfd..541150bbd 100644 --- a/src/layer/shader/convolution_1x1s1d1.comp +++ b/src/layer/shader/convolution_1x1s1d1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/convolution_pack1to4.comp b/src/layer/shader/convolution_pack1to4.comp index e74a2c4af..35243e7d6 100644 --- a/src/layer/shader/convolution_pack1to4.comp +++ b/src/layer/shader/convolution_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if 
(bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -80,9 +83,9 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afp v = bottom_blob_data[v_offset + x * dilation_w]; + afp v = afp(bottom_blob_data[v_offset + x * dilation_w]); - afpvec4 k = weight_data[w_offset + x]; + afpvec4 k = afpvec4(weight_data[w_offset + x]); sum += v * k; } diff --git a/src/layer/shader/convolution_pack4.comp b/src/layer/shader/convolution_pack4.comp index d6e8060af..0417bd970 100644 --- a/src/layer/shader/convolution_pack4.comp +++ b/src/layer/shader/convolution_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter @@ -63,7 +71,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -80,9 +88,19 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; - - afpmat4 k = weight_data[w_offset + x]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); + +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + x) * 4 + 0]), + 
afpvec4(weight_data[(w_offset + x) * 4 + 1]), + afpvec4(weight_data[(w_offset + x) * 4 + 2]), + afpvec4(weight_data[(w_offset + x) * 4 + 3]) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + x]); +#endif sum += v * k; } diff --git a/src/layer/shader/convolution_pack4to1.comp b/src/layer/shader/convolution_pack4to1.comp index 8476bbc51..eaee7014c 100644 --- a/src/layer/shader/convolution_pack4to1.comp +++ b/src/layer/shader/convolution_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { @@ -80,9 +83,9 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); - afpvec4 k = weight_data[w_offset + x]; + afpvec4 k = afpvec4(weight_data[w_offset + x]); sum += dot(v, k); } diff --git a/src/layer/shader/convolutiondepthwise.comp b/src/layer/shader/convolutiondepthwise.comp index 4db971af9..06de34e17 100644 --- a/src/layer/shader/convolutiondepthwise.comp +++ b/src/layer/shader/convolutiondepthwise.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/convolutiondepthwise_group.comp b/src/layer/shader/convolutiondepthwise_group.comp index 9736d6b17..880ef7d95 100644 --- a/src/layer/shader/convolutiondepthwise_group.comp +++ b/src/layer/shader/convolutiondepthwise_group.comp @@ -14,7 +14,10 @@ #version 450 -#if 
NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/convolutiondepthwise_group_pack1to4.comp b/src/layer/shader/convolutiondepthwise_group_pack1to4.comp index ad8644eee..92a8904a6 100644 --- a/src/layer/shader/convolutiondepthwise_group_pack1to4.comp +++ b/src/layer/shader/convolutiondepthwise_group_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -89,9 +92,9 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afp v = bottom_blob_data[v_offset + x * dilation_w]; + afp v = afp(bottom_blob_data[v_offset + x * dilation_w]); - afpvec4 k = weight_data[w_offset + x]; + afpvec4 k = afpvec4(weight_data[w_offset + x]); sum += v * k; } diff --git a/src/layer/shader/convolutiondepthwise_group_pack4.comp b/src/layer/shader/convolutiondepthwise_group_pack4.comp index 8a92033e8..6e7946e3f 100644 --- a/src/layer/shader/convolutiondepthwise_group_pack4.comp +++ b/src/layer/shader/convolutiondepthwise_group_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if 
NCNN_fp16_storage && !NCNN_fp16_arithmetic +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter @@ -64,7 +72,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -89,9 +97,19 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; - + afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); + +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + x) * 4 + 0]), + afpvec4(weight_data[(w_offset + x) * 4 + 1]), + afpvec4(weight_data[(w_offset + x) * 4 + 2]), + afpvec4(weight_data[(w_offset + x) * 4 + 3]) + ); +#else afpmat4 k = weight_data[w_offset + x]; +#endif sum += v * k; } diff --git a/src/layer/shader/convolutiondepthwise_group_pack4to1.comp b/src/layer/shader/convolutiondepthwise_group_pack4to1.comp index 2b112c462..94e136eed 100644 --- a/src/layer/shader/convolutiondepthwise_group_pack4to1.comp +++ b/src/layer/shader/convolutiondepthwise_group_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { @@ -89,9 +92,9 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); - afpvec4 k = weight_data[w_offset + x]; + 
afpvec4 k = afpvec4(weight_data[w_offset + x]); sum += dot(v, k); } diff --git a/src/layer/shader/convolutiondepthwise_pack4.comp b/src/layer/shader/convolutiondepthwise_pack4.comp index 382eb960c..3dc2295a4 100644 --- a/src/layer/shader/convolutiondepthwise_pack4.comp +++ b/src/layer/shader/convolutiondepthwise_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,11 +67,11 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { - sum = afpvec4(0.0); + sum = afpvec4(0.f); } // depth-wise convolution @@ -79,9 +82,9 @@ void main() { for (int x = 0; x < kernel_w; x++) { - afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); - afpvec4 k = weight_data[w_offset + x]; + afpvec4 k = afpvec4(weight_data[w_offset + x]); sum += v * k; } diff --git a/src/layer/shader/crop.comp b/src/layer/shader/crop.comp index d74126d79..7cd867aea 100644 --- a/src/layer/shader/crop.comp +++ b/src/layer/shader/crop.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/crop_pack4.comp b/src/layer/shader/crop_pack4.comp index 3c357e08a..1a723d63f 100644 --- a/src/layer/shader/crop_pack4.comp +++ b/src/layer/shader/crop_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/deconvolution.comp b/src/layer/shader/deconvolution.comp index 
88dac7dbf..0487bccaa 100644 --- a/src/layer/shader/deconvolution.comp +++ b/src/layer/shader/deconvolution.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/deconvolution_pack1to4.comp b/src/layer/shader/deconvolution_pack1to4.comp index e2263b6a5..8e4a369ee 100644 --- a/src/layer/shader/deconvolution_pack1to4.comp +++ b/src/layer/shader/deconvolution_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -100,9 +103,9 @@ void main() for (int z = 0; z < p.c; z++) { - afp v = bottom_blob_data[v_offset]; + afp v = afp(bottom_blob_data[v_offset]); - afpvec4 k = weight_data[w_offset]; + afpvec4 k = afpvec4(weight_data[w_offset]); sum += v * k; diff --git a/src/layer/shader/deconvolution_pack4.comp b/src/layer/shader/deconvolution_pack4.comp index fb69ad456..c79aca157 100644 --- a/src/layer/shader/deconvolution_pack4.comp +++ b/src/layer/shader/deconvolution_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if 
NCNN_fp16_storage && !NCNN_fp16_arithmetic +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter @@ -63,7 +71,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -100,9 +108,19 @@ void main() for (int z = 0; z < p.c; z++) { - afpvec4 v = bottom_blob_data[v_offset]; - - afpmat4 k = weight_data[w_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); + +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( so matrix w_offset is assembled from the 4 consecutive sfpvec4 entries at w_offset * 4 + 0..3, matching weight_data[w_offset] in the #else path + afpmat4 k = afpmat4( + afpvec4(weight_data[w_offset * 4 + 0]), + afpvec4(weight_data[w_offset * 4 + 1]), + afpvec4(weight_data[w_offset * 4 + 2]), + afpvec4(weight_data[w_offset * 4 + 3]) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset]); +#endif sum += v * k; diff --git a/src/layer/shader/deconvolution_pack4to1.comp b/src/layer/shader/deconvolution_pack4to1.comp index 3909743df..24a3f7785 100644 --- a/src/layer/shader/deconvolution_pack4to1.comp +++ b/src/layer/shader/deconvolution_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { @@ -100,9 +103,9 @@ void main() for (int z = 0; z < p.c; z++) { - afpvec4 v = bottom_blob_data[v_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); - afpvec4 k = weight_data[w_offset]; + afpvec4 k = afpvec4(weight_data[w_offset]); sum += dot(v, k); diff --git
a/src/layer/shader/deconvolutiondepthwise.comp b/src/layer/shader/deconvolutiondepthwise.comp index 13f70cc58..dfcf78640 100644 --- a/src/layer/shader/deconvolutiondepthwise.comp +++ b/src/layer/shader/deconvolutiondepthwise.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/deconvolutiondepthwise_group.comp b/src/layer/shader/deconvolutiondepthwise_group.comp index 089ac0106..713418138 100644 --- a/src/layer/shader/deconvolutiondepthwise_group.comp +++ b/src/layer/shader/deconvolutiondepthwise_group.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp index bf79221bd..572fd58c2 100644 --- a/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp +++ b/src/layer/shader/deconvolutiondepthwise_group_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -109,9 +112,9 @@ void main() for (int z = 0; z < channels_g; z++) { - afp v = bottom_blob_data[v_offset]; + afp v = 
afp(bottom_blob_data[v_offset]); - afpvec4 k = weight_data[w_offset]; + afpvec4 k = afpvec4(weight_data[w_offset]); sum += v * k; diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp index 540a02726..6fb091fe6 100644 --- a/src/layer/shader/deconvolutiondepthwise_group_pack4.comp +++ b/src/layer/shader/deconvolutiondepthwise_group_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic +// GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter @@ -64,7 +72,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -109,9 +117,19 @@ void main() for (int z = 0; z < channels_g; z++) { - afpvec4 v = bottom_blob_data[v_offset]; - - afpmat4 k = weight_data[w_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); + +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[w_offset * 4 + 0]), + afpvec4(weight_data[w_offset * 4 + 1]), + afpvec4(weight_data[w_offset * 4 + 2]), + afpvec4(weight_data[w_offset * 4 + 3]) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset]); +#endif sum += v * k;
diff --git a/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp index de57041ec..c8de0db29 100644 --- a/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp +++ b/src/layer/shader/deconvolutiondepthwise_group_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afp(bias_data[gz]); } else { @@ -109,9 +112,9 @@ void main() for (int z = 0; z < channels_g; z++) { - afpvec4 v = bottom_blob_data[v_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); - afpvec4 k = weight_data[w_offset]; + afpvec4 k = afpvec4(weight_data[w_offset]); sum += dot(v, k); diff --git a/src/layer/shader/deconvolutiondepthwise_pack4.comp b/src/layer/shader/deconvolutiondepthwise_pack4.comp index 1f6cb0fa3..959107e94 100644 --- a/src/layer/shader/deconvolutiondepthwise_pack4.comp +++ b/src/layer/shader/deconvolutiondepthwise_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -64,7 +67,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gz]; + sum = afpvec4(bias_data[gz]); } else { @@ -101,9 +104,9 @@ void main() int v_offset = v_offset_0 + sy * p.w + sx; int w_offset = w_offset_0 + y * kernel_w + x; - afpvec4 v = bottom_blob_data[v_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); - afpvec4 k = weight_data[w_offset]; + afpvec4 k = afpvec4(weight_data[w_offset]); sum += v * k; } diff --git a/src/layer/shader/dropout.comp b/src/layer/shader/dropout.comp index 076e86c5f..4bfda0304 100644 --- a/src/layer/shader/dropout.comp 
+++ b/src/layer/shader/dropout.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); v *= afp(scale); diff --git a/src/layer/shader/dropout_pack4.comp b/src/layer/shader/dropout_pack4.comp index b2610d0a4..94f2ca085 100644 --- a/src/layer/shader/dropout_pack4.comp +++ b/src/layer/shader/dropout_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); v *= afp(scale); diff --git a/src/layer/shader/eltwise.comp b/src/layer/shader/eltwise.comp index 5631837ed..c21a034f5 100644 --- a/src/layer/shader/eltwise.comp +++ b/src/layer/shader/eltwise.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -52,8 +55,8 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v1 = bottom_blob1_data[gi]; - afp v2 = bottom_blob2_data[gi]; + afp v1 = afp(bottom_blob1_data[gi]); + afp v2 = afp(bottom_blob2_data[gi]); afp res; diff --git a/src/layer/shader/eltwise_pack4.comp b/src/layer/shader/eltwise_pack4.comp index 8e0a4af41..05165104e 100644 --- a/src/layer/shader/eltwise_pack4.comp +++ b/src/layer/shader/eltwise_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || 
NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -52,8 +55,8 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v1 = bottom_blob1_data[gi]; - afpvec4 v2 = bottom_blob2_data[gi]; + afpvec4 v1 = afpvec4(bottom_blob1_data[gi]); + afpvec4 v2 = afpvec4(bottom_blob2_data[gi]); afpvec4 res; diff --git a/src/layer/shader/flatten.comp b/src/layer/shader/flatten.comp index ef80622a0..68c728b80 100644 --- a/src/layer/shader/flatten.comp +++ b/src/layer/shader/flatten.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/flatten_pack4.comp b/src/layer/shader/flatten_pack4.comp index 7f02cdccb..c214c5706 100644 --- a/src/layer/shader/flatten_pack4.comp +++ b/src/layer/shader/flatten_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -74,11 +77,8 @@ void main() v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4; } - sfpvec4 v; - v.r = bottom_blob_data[v_offset.r]; - v.g = bottom_blob_data[v_offset.g]; - v.b = bottom_blob_data[v_offset.b]; - v.a = bottom_blob_data[v_offset.a]; - - top_blob_data[gx] = v; + top_blob_data[gx].r = bottom_blob_data[v_offset.r]; + top_blob_data[gx].g = bottom_blob_data[v_offset.g]; + top_blob_data[gx].b = bottom_blob_data[v_offset.b]; + top_blob_data[gx].a = bottom_blob_data[v_offset.a]; } diff --git a/src/layer/shader/innerproduct.comp b/src/layer/shader/innerproduct.comp index 97242efb8..006cbd9fa 100644 --- a/src/layer/shader/innerproduct.comp +++ 
b/src/layer/shader/innerproduct.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -57,7 +60,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gx]; + sum = afp(bias_data[gx]); } else { diff --git a/src/layer/shader/innerproduct_pack1to4.comp b/src/layer/shader/innerproduct_pack1to4.comp index 8fe75bcac..a27ae22cb 100644 --- a/src/layer/shader/innerproduct_pack1to4.comp +++ b/src/layer/shader/innerproduct_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -57,20 +60,20 @@ void main() if (bias_term == 1) { - sum = bias_data[gx]; + sum = afpvec4(bias_data[gx]); } else { - sum = afpvec4(0.0); + sum = afpvec4(0.f); } int w_offset = gx * p.w; for (int i = 0; i < p.w; i++) { - afp v = bottom_blob_data[i]; + afp v = afp(bottom_blob_data[i]); - afpvec4 k = weight_data[w_offset + i]; + afpvec4 k = afpvec4(weight_data[w_offset + i]); sum += v * k; } diff --git a/src/layer/shader/innerproduct_pack4.comp b/src/layer/shader/innerproduct_pack4.comp index 4973543e8..17cf7869e 100644 --- a/src/layer/shader/innerproduct_pack4.comp +++ b/src/layer/shader/innerproduct_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -26,7 +29,12 @@ layout (local_size_z_id = 235) in; layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic +// 
GL_EXT_shader_16bit_storage does not define f16mat4 type :( +layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; +#else layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; +#endif layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; layout (push_constant) uniform parameter @@ -57,20 +65,30 @@ void main() if (bias_term == 1) { - sum = bias_data[gx]; + sum = afpvec4(bias_data[gx]); } else { - sum = afpvec4(0.0); + sum = afpvec4(0.f); } int w_offset = gx * p.w; for (int i = 0; i < p.w; i++) { - afpvec4 v = bottom_blob_data[i]; - - afpmat4 k = weight_data[w_offset + i]; + afpvec4 v = afpvec4(bottom_blob_data[i]); + +#if NCNN_fp16_storage && !NCNN_fp16_arithmetic + // GL_EXT_shader_16bit_storage does not define f16mat4 type :( + afpmat4 k = afpmat4( + afpvec4(weight_data[(w_offset + i) * 4 + 0]), + afpvec4(weight_data[(w_offset + i) * 4 + 1]), + afpvec4(weight_data[(w_offset + i) * 4 + 2]), + afpvec4(weight_data[(w_offset + i) * 4 + 3]) + ); +#else + afpmat4 k = afpmat4(weight_data[w_offset + i]); +#endif sum += v * k; } diff --git a/src/layer/shader/innerproduct_pack4to1.comp b/src/layer/shader/innerproduct_pack4to1.comp index 70d2aa7f4..a93464439 100644 --- a/src/layer/shader/innerproduct_pack4to1.comp +++ b/src/layer/shader/innerproduct_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -57,7 +60,7 @@ void main() if (bias_term == 1) { - sum = bias_data[gx]; + sum = afp(bias_data[gx]); } else { @@ -68,9 +71,9 @@ void main() for (int i = 0; i < p.w; i++) { - afpvec4 v = bottom_blob_data[i]; + afpvec4 v = afpvec4(bottom_blob_data[i]); - afpvec4 k = weight_data[w_offset + i]; + afpvec4 k = afpvec4(weight_data[w_offset + i]); sum += dot(v, k); } diff --git a/src/layer/shader/interp.comp 
b/src/layer/shader/interp.comp index 482f76482..0c6ecd3d0 100644 --- a/src/layer/shader/interp.comp +++ b/src/layer/shader/interp.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -54,8 +57,6 @@ void main() if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; - afp res; - if (resize_type == 1) // nearest { afpvec2 gxy = afpvec2(gx, gy); @@ -67,7 +68,7 @@ void main() int v_offset = gz * p.cstep + sy * p.w + sx; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } else if (resize_type == 2) // bilinear { @@ -94,17 +95,15 @@ void main() int v_offset_0 = gz * p.cstep + sy * p.w + sx; int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; - afp a0 = bottom_blob_data[v_offset_0]; - afp a1 = bottom_blob_data[v_offset_0 + 1]; - afp b0 = bottom_blob_data[v_offset_1]; - afp b1 = bottom_blob_data[v_offset_1 + 1]; + afp a0 = afp(bottom_blob_data[v_offset_0]); + afp a1 = afp(bottom_blob_data[v_offset_0 + 1]); + afp b0 = afp(bottom_blob_data[v_offset_1]); + afp b1 = afp(bottom_blob_data[v_offset_1 + 1]); afp fx = fxy.r; afp fy = fxy.g; afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; - res = ab.r * (afp(1.f) - fy) + ab.g * fy; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(ab.r * (afp(1.f) - fy) + ab.g * fy); } - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(res); } diff --git a/src/layer/shader/interp_pack4.comp b/src/layer/shader/interp_pack4.comp index add7fd951..ada930107 100644 --- a/src/layer/shader/interp_pack4.comp +++ b/src/layer/shader/interp_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension 
GL_AMD_gpu_shader_half_float: require #endif @@ -54,7 +57,7 @@ void main() if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; - afpvec4 res; +// afpvec4 res; if (resize_type == 1) // nearest { @@ -67,7 +70,7 @@ void main() int v_offset = gz * p.cstep + sy * p.w + sx; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } else if (resize_type == 2) // bilinear { @@ -94,10 +97,10 @@ void main() int v_offset_0 = gz * p.cstep + sy * p.w + sx; int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; - afpvec4 a0 = bottom_blob_data[v_offset_0]; - afpvec4 a1 = bottom_blob_data[v_offset_0 + 1]; - afpvec4 b0 = bottom_blob_data[v_offset_1]; - afpvec4 b1 = bottom_blob_data[v_offset_1 + 1]; + afpvec4 a0 = afpvec4(bottom_blob_data[v_offset_0]); + afpvec4 a1 = afpvec4(bottom_blob_data[v_offset_0 + 1]); + afpvec4 b0 = afpvec4(bottom_blob_data[v_offset_1]); + afpvec4 b1 = afpvec4(bottom_blob_data[v_offset_1 + 1]); afp fx = fxy.r; afp fy = fxy.g; @@ -105,8 +108,6 @@ void main() afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; - res = a * (afp(1.f) - fy) + b * fy; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(a * (afp(1.f) - fy) + b * fy); } - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(res); } diff --git a/src/layer/shader/lrn_norm.comp b/src/layer/shader/lrn_norm.comp index 74dcf07d4..45e2bc2e1 100644 --- a/src/layer/shader/lrn_norm.comp +++ b/src/layer/shader/lrn_norm.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -95,7 +98,7 @@ void main() scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta)); } - afp v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; + afp v = afp(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + 
gx]); v *= scale; diff --git a/src/layer/shader/lrn_norm_across_channel_pack4.comp b/src/layer/shader/lrn_norm_across_channel_pack4.comp index d866f9c61..7089f6ddb 100644 --- a/src/layer/shader/lrn_norm_across_channel_pack4.comp +++ b/src/layer/shader/lrn_norm_across_channel_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -75,7 +78,7 @@ void main() afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); - afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; + afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]); v *= scale; diff --git a/src/layer/shader/lrn_norm_within_channel_pack4.comp b/src/layer/shader/lrn_norm_within_channel_pack4.comp index 80cf44aa5..9b1fe0ad5 100644 --- a/src/layer/shader/lrn_norm_within_channel_pack4.comp +++ b/src/layer/shader/lrn_norm_within_channel_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -75,7 +78,7 @@ void main() afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); - afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; + afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]); v *= scale; diff --git a/src/layer/shader/lrn_square_pad.comp b/src/layer/shader/lrn_square_pad.comp index d7ab5730f..1b951aa9b 100644 --- a/src/layer/shader/lrn_square_pad.comp +++ b/src/layer/shader/lrn_square_pad.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension 
GL_AMD_gpu_shader_half_float: require #endif @@ -62,7 +65,7 @@ void main() if (z >= 0 && z < p.c) { int v_offset = z * p.cstep + gy * p.w + gx; - afp v = bottom_blob_data[v_offset]; + afp v = afp(bottom_blob_data[v_offset]); res = v * v; } else @@ -78,7 +81,7 @@ void main() if (x >= 0 && x < p.w && y >= 0 && y < p.h) { int v_offset = gz * p.cstep + y * p.w + x; - afp v = bottom_blob_data[v_offset]; + afp v = afp(bottom_blob_data[v_offset]); res = v * v; } else diff --git a/src/layer/shader/lrn_square_pad_across_channel_pack4.comp b/src/layer/shader/lrn_square_pad_across_channel_pack4.comp index 95b7fb42f..45a8a69fb 100644 --- a/src/layer/shader/lrn_square_pad_across_channel_pack4.comp +++ b/src/layer/shader/lrn_square_pad_across_channel_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -62,7 +65,7 @@ void main() if (z >= 0 && z < p.c) { int v_offset = z * p.cstep + gy * p.w + gx; - afpvec4 v4 = bottom_blob_data[v_offset]; + afpvec4 v4 = afpvec4(bottom_blob_data[v_offset]); int lane = (gz - pad_head) % 4; diff --git a/src/layer/shader/lrn_square_pad_within_channel_pack4.comp b/src/layer/shader/lrn_square_pad_within_channel_pack4.comp index 6a0cddd1b..4f8900bcf 100644 --- a/src/layer/shader/lrn_square_pad_within_channel_pack4.comp +++ b/src/layer/shader/lrn_square_pad_within_channel_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,7 +66,7 @@ void main() if (x >= 0 && x < p.w && y >= 0 && y < p.h) { int v_offset = gz * p.cstep + y * p.w + x; - afpvec4 v = bottom_blob_data[v_offset]; + afpvec4 v = afpvec4(bottom_blob_data[v_offset]); res = v * v; } else diff 
--git a/src/layer/shader/packing_1to4.comp b/src/layer/shader/packing_1to4.comp index b02f855c3..4416b0e25 100644 --- a/src/layer/shader/packing_1to4.comp +++ b/src/layer/shader/packing_1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -55,35 +58,25 @@ void main() { ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); - // prevent out of range access -// x4 = min(x4, p.w - 1); - v_offset = x4; } else if (p.dims == 2) { ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); - // prevent out of range access -// y4 = min(y4, p.h - 1); - v_offset = y4 * p.w + gx; } else // if (p.dims == 3) { ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); - // prevent out of range access - z4 = min(z4, p.c - 1); - v_offset = z4 * p.cstep + ivec4(gy * p.w + gx); } - sfpvec4 v; - v.r = bottom_blob_data[v_offset.r]; - v.g = bottom_blob_data[v_offset.g]; - v.b = bottom_blob_data[v_offset.b]; - v.a = bottom_blob_data[v_offset.a]; + int gi = gz * p.outcstep + gy * p.outw + gx; - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + top_blob_data[gi].r = bottom_blob_data[v_offset.r]; + top_blob_data[gi].g = bottom_blob_data[v_offset.g]; + top_blob_data[gi].b = bottom_blob_data[v_offset.b]; + top_blob_data[gi].a = bottom_blob_data[v_offset.a]; } diff --git a/src/layer/shader/packing_4to1.comp b/src/layer/shader/packing_4to1.comp index 6341263a9..eeae3a590 100644 --- a/src/layer/shader/packing_4to1.comp +++ b/src/layer/shader/packing_4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -70,10 +73,10 @@ void main() v_offset = z4 * p.outcstep + ivec4(gy * p.outw + gx); } - sfpvec4 v = bottom_blob_data[gz * p.cstep 
+ gy * p.w + gx]; + int gi = gz * p.cstep + gy * p.w + gx; - top_blob_data[v_offset.r] = v.r; - top_blob_data[v_offset.g] = v.g; - top_blob_data[v_offset.b] = v.b; - top_blob_data[v_offset.a] = v.a; + top_blob_data[v_offset.r] = bottom_blob_data[gi].r; + top_blob_data[v_offset.g] = bottom_blob_data[gi].g; + top_blob_data[v_offset.b] = bottom_blob_data[gi].b; + top_blob_data[v_offset.a] = bottom_blob_data[gi].a; } diff --git a/src/layer/shader/padding.comp b/src/layer/shader/padding.comp index 2ee723580..91030807d 100644 --- a/src/layer/shader/padding.comp +++ b/src/layer/shader/padding.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -56,8 +59,6 @@ void main() if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; - sfp res; - int x = gx - left; int y = gy - top; @@ -66,11 +67,11 @@ void main() if (x >= 0 && x < p.w && y >= 0 && y < p.h) { int v_offset = gz * p.cstep + y * p.w + x; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } else { - res = sfp(value); + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(value); } } else if (type == 1) @@ -79,8 +80,6 @@ void main() y = clamp(y, 0, p.h - 1); int v_offset = gz * p.cstep + y * p.w + x; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res; } diff --git a/src/layer/shader/padding_pack4.comp b/src/layer/shader/padding_pack4.comp index 8aea3b2b1..e74e0cffd 100644 --- a/src/layer/shader/padding_pack4.comp +++ b/src/layer/shader/padding_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if 
NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -56,8 +59,6 @@ void main() if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; - sfpvec4 res; - int x = gx - left; int y = gy - top; @@ -66,11 +67,11 @@ void main() if (x >= 0 && x < p.w && y >= 0 && y < p.h) { int v_offset = gz * p.cstep + y * p.w + x; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } else { - res = sfpvec4(value); + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value); } } else if (type == 1) @@ -79,8 +80,6 @@ void main() y = clamp(y, 0, p.h - 1); int v_offset = gz * p.cstep + y * p.w + x; - res = bottom_blob_data[v_offset]; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res; } diff --git a/src/layer/shader/permute.comp b/src/layer/shader/permute.comp index 166f2dd6d..cb9d7a3db 100644 --- a/src/layer/shader/permute.comp +++ b/src/layer/shader/permute.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/permute_pack4to1.comp b/src/layer/shader/permute_pack4to1.comp index 3ca3c371c..b3cadbb43 100644 --- a/src/layer/shader/permute_pack4to1.comp +++ b/src/layer/shader/permute_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -104,10 +107,10 @@ void main() } } - sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int gi = gz * p.cstep + gy * p.w + gx; - top_blob_data[v_offset.r] = v.r; - top_blob_data[v_offset.g] = v.g; - top_blob_data[v_offset.b] = v.b; - 
top_blob_data[v_offset.a] = v.a; + top_blob_data[v_offset.r] = bottom_blob_data[gi].r; + top_blob_data[v_offset.g] = bottom_blob_data[gi].g; + top_blob_data[v_offset.b] = bottom_blob_data[gi].b; + top_blob_data[v_offset.a] = bottom_blob_data[gi].a; } diff --git a/src/layer/shader/pooling.comp b/src/layer/shader/pooling.comp index 31c8dd900..3fbd3830f 100644 --- a/src/layer/shader/pooling.comp +++ b/src/layer/shader/pooling.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/pooling_global.comp b/src/layer/shader/pooling_global.comp index 2777e9a86..8a4d3d566 100644 --- a/src/layer/shader/pooling_global.comp +++ b/src/layer/shader/pooling_global.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/pooling_global_pack4.comp b/src/layer/shader/pooling_global_pack4.comp index eafad171d..34582a535 100644 --- a/src/layer/shader/pooling_global_pack4.comp +++ b/src/layer/shader/pooling_global_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/pooling_pack4.comp b/src/layer/shader/pooling_pack4.comp index c9c30d7a3..166860385 100644 --- a/src/layer/shader/pooling_pack4.comp +++ b/src/layer/shader/pooling_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension 
GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/prelu.comp b/src/layer/shader/prelu.comp index fc33a2a92..11376bd5b 100644 --- a/src/layer/shader/prelu.comp +++ b/src/layer/shader/prelu.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -47,9 +50,9 @@ void main() if (p.dims == 1) { - afp v = bottom_top_blob_data[gx]; + afp v = afp(bottom_top_blob_data[gx]); - afp slope = num_slope > 1 ? slope_blob_data[gx] : slope_blob_data[0]; + afp slope = num_slope > 1 ? afp(slope_blob_data[gx]) : afp(slope_blob_data[0]); v = v < afp(0.f) ? v * slope : v; @@ -62,9 +65,9 @@ void main() { const int gi = gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); - afp slope = num_slope > 1 ? slope_blob_data[gy] : slope_blob_data[0]; + afp slope = num_slope > 1 ? afp(slope_blob_data[gy]) : afp(slope_blob_data[0]); v = v < afp(0.f) ? v * slope : v; @@ -77,9 +80,9 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); - afp slope = num_slope > 1 ? slope_blob_data[gz] : slope_blob_data[0]; + afp slope = num_slope > 1 ? afp(slope_blob_data[gz]) : afp(slope_blob_data[0]); v = v < afp(0.f) ? 
v * slope : v; diff --git a/src/layer/shader/prelu_pack4.comp b/src/layer/shader/prelu_pack4.comp index 73294d4fb..85b9f372b 100644 --- a/src/layer/shader/prelu_pack4.comp +++ b/src/layer/shader/prelu_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -47,9 +50,9 @@ void main() if (p.dims == 1) { - afpvec4 v = bottom_top_blob_data[gx]; + afpvec4 v = afpvec4(bottom_top_blob_data[gx]); - afpvec4 slope = num_slope > 1 ? slope_blob_data[gx] : afpvec4(slope_blob_data[0]); + afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gx]) : afpvec4(slope_blob_data[0]); v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); @@ -62,9 +65,9 @@ void main() { const int gi = gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - afpvec4 slope = num_slope > 1 ? slope_blob_data[gy] : afpvec4(slope_blob_data[0]); + afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gy]) : afpvec4(slope_blob_data[0]); v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); @@ -77,9 +80,9 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); - afpvec4 slope = num_slope > 1 ? slope_blob_data[gz] : afpvec4(slope_blob_data[0]); + afpvec4 slope = num_slope > 1 ? 
afpvec4(slope_blob_data[gz]) : afpvec4(slope_blob_data[0]); v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); diff --git a/src/layer/shader/priorbox.comp b/src/layer/shader/priorbox.comp index 30da063dc..29f7c4946 100644 --- a/src/layer/shader/priorbox.comp +++ b/src/layer/shader/priorbox.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -69,14 +72,12 @@ void main() afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); - sfpvec4 variance = sfpvec4(variances_0, variances_1, variances_2, variances_3); - afpvec4 box; afp box_w; afp box_h; - afp min_size = min_sizes_data[gx]; + afp min_size = afp(min_sizes_data[gx]); // min size box box_w = box_h = min_size; @@ -84,14 +85,17 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); - top_blob_data[var_offset] = variance; + top_blob_data[var_offset].r = sfp(variances_0); + top_blob_data[var_offset].g = sfp(variances_1); + top_blob_data[var_offset].b = sfp(variances_2); + top_blob_data[var_offset].a = sfp(variances_3); v_offset += 1; var_offset += 1; if (num_max_size > 0) { - afp max_size = max_sizes_data[gx]; + afp max_size = afp(max_sizes_data[gx]); // max size box box_w = box_h = sqrt(min_size * max_size); @@ -99,7 +103,10 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; top_blob_data[v_offset] = clip == 1 ? 
sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); - top_blob_data[var_offset] = variance; + top_blob_data[var_offset].r = sfp(variances_0); + top_blob_data[var_offset].g = sfp(variances_1); + top_blob_data[var_offset].b = sfp(variances_2); + top_blob_data[var_offset].a = sfp(variances_3); v_offset += 1; var_offset += 1; @@ -108,7 +115,7 @@ void main() // all aspect_ratios for (int pi = 0; pi < num_aspect_ratio; pi++) { - afp ar = aspect_ratios_data[pi]; + afp ar = afp(aspect_ratios_data[pi]); box_w = min_size * sqrt(ar); box_h = min_size / sqrt(ar); @@ -116,7 +123,10 @@ void main() box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); - top_blob_data[var_offset] = variance; + top_blob_data[var_offset].r = sfp(variances_0); + top_blob_data[var_offset].g = sfp(variances_1); + top_blob_data[var_offset].b = sfp(variances_2); + top_blob_data[var_offset].a = sfp(variances_3); v_offset += 1; var_offset += 1; @@ -126,7 +136,10 @@ void main() box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; top_blob_data[v_offset] = clip == 1 ? 
sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); - top_blob_data[var_offset] = variance; + top_blob_data[var_offset].r = sfp(variances_0); + top_blob_data[var_offset].g = sfp(variances_1); + top_blob_data[var_offset].b = sfp(variances_2); + top_blob_data[var_offset].a = sfp(variances_3); v_offset += 1; var_offset += 1; diff --git a/src/layer/shader/priorbox_mxnet.comp b/src/layer/shader/priorbox_mxnet.comp index 8738eceb6..3dc04b46e 100644 --- a/src/layer/shader/priorbox_mxnet.comp +++ b/src/layer/shader/priorbox_mxnet.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -58,7 +61,7 @@ void main() afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); // ratio = 1, various sizes - afp size = min_sizes_data[gx]; + afp size = afp(min_sizes_data[gx]); afp cw = size * afp(p.h) / afp(p.w) / afp(2); afp ch = size / afp(2); @@ -69,7 +72,7 @@ void main() if (gx == num_sizes - 1) { // various ratios, size = min_size = size[0] - afp size = min_sizes_data[0]; + afp size = afp(min_sizes_data[0]); for (int pi = 1; pi < num_ratios; pi++) { afp ratio = sqrt(afp(aspect_ratios_data[pi])); diff --git a/src/layer/shader/relu.comp b/src/layer/shader/relu.comp index 1660b67c1..613f1efad 100644 --- a/src/layer/shader/relu.comp +++ b/src/layer/shader/relu.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); if (slope == 0) v = max(v, afp(0.f)); diff --git a/src/layer/shader/relu_pack4.comp b/src/layer/shader/relu_pack4.comp index 
3832a2392..c065f3a27 100644 --- a/src/layer/shader/relu_pack4.comp +++ b/src/layer/shader/relu_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); if (slope == 0) v = max(v, afp(0.f)); diff --git a/src/layer/shader/reorg.comp b/src/layer/shader/reorg.comp index 498fa4aae..350cc4844 100644 --- a/src/layer/shader/reorg.comp +++ b/src/layer/shader/reorg.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -58,7 +61,5 @@ void main() int v_offset = z * p.cstep + y * p.w + x; - sfp v = bottom_blob_data[v_offset]; - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; } diff --git a/src/layer/shader/reorg_pack1to4.comp b/src/layer/shader/reorg_pack1to4.comp index 137afb6fc..06a0fcc97 100644 --- a/src/layer/shader/reorg_pack1to4.comp +++ b/src/layer/shader/reorg_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -60,12 +63,10 @@ void main() ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; - sfpvec4 v; - - v.r = bottom_blob_data[v_offset.r]; - v.g = bottom_blob_data[v_offset.g]; - v.b = bottom_blob_data[v_offset.b]; - v.a = bottom_blob_data[v_offset.a]; + int gi = gz * p.outcstep + gy * p.outw + gx; - top_blob_data[gz * p.outcstep + gy * 
p.outw + gx] = v; + top_blob_data[gi].r = bottom_blob_data[v_offset.r]; + top_blob_data[gi].g = bottom_blob_data[v_offset.g]; + top_blob_data[gi].b = bottom_blob_data[v_offset.b]; + top_blob_data[gi].a = bottom_blob_data[v_offset.a]; } diff --git a/src/layer/shader/reorg_pack4.comp b/src/layer/shader/reorg_pack4.comp index 297ede079..d36e56b9c 100644 --- a/src/layer/shader/reorg_pack4.comp +++ b/src/layer/shader/reorg_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -63,33 +66,25 @@ void main() ivec4 lane4 = z4 % 4; // v = v4[lane] - sfpvec4 v; - - sfpvec4 v4; - - v4 = bottom_blob_data[v_offset.r]; - if (lane4.r == 0) v.r = v4.r; - else if (lane4.r == 1) v.r = v4.g; - else if (lane4.r == 2) v.r = v4.b; - else /* if (lane4.r == 3) */ v.r = v4.a; - - v4 = bottom_blob_data[v_offset.g]; - if (lane4.g == 0) v.g = v4.r; - else if (lane4.g == 1) v.g = v4.g; - else if (lane4.g == 2) v.g = v4.b; - else /* if (lane4.g == 3) */ v.g = v4.a; - - v4 = bottom_blob_data[v_offset.b]; - if (lane4.b == 0) v.b = v4.r; - else if (lane4.b == 1) v.b = v4.g; - else if (lane4.b == 2) v.b = v4.b; - else /* if (lane4.b == 3) */ v.b = v4.a; - - v4 = bottom_blob_data[v_offset.a]; - if (lane4.a == 0) v.a = v4.r; - else if (lane4.a == 1) v.a = v4.g; - else if (lane4.a == 2) v.a = v4.b; - else /* if (lane4.a == 3) */ v.a = v4.a; - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + int gi = gz * p.outcstep + gy * p.outw + gx; + + if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; + else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; + else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; + else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; + + if (lane4.g == 0) top_blob_data[gi].g = 
bottom_blob_data[v_offset.g].r; + else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; + else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; + else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; + + if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; + else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; + else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b; + else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; + + if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; + else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; + else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; + else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a; } diff --git a/src/layer/shader/reshape.comp b/src/layer/shader/reshape.comp index 5b3edf3f8..ebcea9bbe 100644 --- a/src/layer/shader/reshape.comp +++ b/src/layer/shader/reshape.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -65,9 +68,19 @@ void main() int v_offset = z * p.cstep + y * p.w + x; - sfp v = bottom_blob_data[v_offset]; - - if (ndim == 1) top_blob_data[gx] = v; - if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; - if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * p.outw + gx; + } + if (ndim == 3) + { + gi = gz * p.outcstep + gy * p.outw + gx; + } + + top_blob_data[gi] = bottom_blob_data[v_offset]; } diff --git a/src/layer/shader/reshape_pack1to4.comp b/src/layer/shader/reshape_pack1to4.comp index 49385174f..f11a0a815 100644 --- 
a/src/layer/shader/reshape_pack1to4.comp +++ b/src/layer/shader/reshape_pack1to4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -65,13 +68,22 @@ void main() ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; - sfpvec4 v; - v.r = bottom_blob_data[v_offset.r]; - v.g = bottom_blob_data[v_offset.g]; - v.b = bottom_blob_data[v_offset.b]; - v.a = bottom_blob_data[v_offset.a]; - - if (ndim == 1) top_blob_data[gx] = v; - if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; - if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * p.outw + gx; + } + if (ndim == 3) + { + gi = gz * p.outcstep + gy * p.outw + gx; + } + + top_blob_data[gi].r = bottom_blob_data[v_offset.r]; + top_blob_data[gi].g = bottom_blob_data[v_offset.g]; + top_blob_data[gi].b = bottom_blob_data[v_offset.b]; + top_blob_data[gi].a = bottom_blob_data[v_offset.a]; } diff --git a/src/layer/shader/reshape_pack4.comp b/src/layer/shader/reshape_pack4.comp index 002097d94..d308a40d5 100644 --- a/src/layer/shader/reshape_pack4.comp +++ b/src/layer/shader/reshape_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -85,35 +88,37 @@ void main() lane4 = z4 % 4; } - sfpvec4 v; - - sfpvec4 v4; - - v4 = bottom_blob_data[v_offset.r]; - if (lane4.r == 0) v.r = v4.r; - else if (lane4.r == 1) v.r = v4.g; - else if (lane4.r == 2) v.r = v4.b; - else /* if (lane4.r == 3) */ v.r = v4.a; - - v4 = bottom_blob_data[v_offset.g]; - if (lane4.g == 0) v.g = v4.r; - else if (lane4.g == 1) v.g = v4.g; - else if (lane4.g == 2) v.g = v4.b; - else /* if 
(lane4.g == 3) */ v.g = v4.a; - - v4 = bottom_blob_data[v_offset.b]; - if (lane4.b == 0) v.b = v4.r; - else if (lane4.b == 1) v.b = v4.g; - else if (lane4.b == 2) v.b = v4.b; - else /* if (lane4.b == 3) */ v.b = v4.a; - - v4 = bottom_blob_data[v_offset.a]; - if (lane4.a == 0) v.a = v4.r; - else if (lane4.a == 1) v.a = v4.g; - else if (lane4.a == 2) v.a = v4.b; - else /* if (lane4.a == 3) */ v.a = v4.a; - - if (ndim == 1) top_blob_data[gx] = v; - if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; - if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + int gi; + if (ndim == 1) + { + gi = gx; + } + if (ndim == 2) + { + gi = gy * p.outw + gx; + } + if (ndim == 3) + { + gi = gz * p.outcstep + gy * p.outw + gx; + } + + if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; + else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; + else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; + else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; + + if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r; + else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; + else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; + else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; + + if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; + else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; + else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b; + else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; + + if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; + else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; + else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; + else /* if (lane4.a == 3) */ top_blob_data[gi].a = 
bottom_blob_data[v_offset.a].a; } diff --git a/src/layer/shader/reshape_pack4to1.comp b/src/layer/shader/reshape_pack4to1.comp index f561c2924..d7094391f 100644 --- a/src/layer/shader/reshape_pack4to1.comp +++ b/src/layer/shader/reshape_pack4to1.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -81,10 +84,10 @@ void main() v_offset = z4 * p.outcstep + y4 * p.outw + x4; } - sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; + int gi = gz * p.cstep + gy * p.w + gx; - top_blob_data[v_offset.r] = v.r; - top_blob_data[v_offset.g] = v.g; - top_blob_data[v_offset.b] = v.b; - top_blob_data[v_offset.a] = v.a; + top_blob_data[v_offset.r] = bottom_blob_data[gi].r; + top_blob_data[v_offset.g] = bottom_blob_data[gi].g; + top_blob_data[v_offset.b] = bottom_blob_data[gi].b; + top_blob_data[v_offset.a] = bottom_blob_data[gi].a; } diff --git a/src/layer/shader/scale.comp b/src/layer/shader/scale.comp index f5bcad2e7..a9373d162 100644 --- a/src/layer/shader/scale.comp +++ b/src/layer/shader/scale.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -48,7 +51,7 @@ void main() if (p.dims == 1) { - afp v = bottom_top_blob_data[gx]; + afp v = afp(bottom_top_blob_data[gx]); if (bias_term == 1) v = afp(scale_blob_data[gx]) * v + afp(bias_blob_data[gx]); @@ -64,7 +67,7 @@ void main() { const int gi = gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); if (bias_term == 1) v = afp(scale_blob_data[gy]) * v + afp(bias_blob_data[gy]); @@ -80,7 +83,7 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = 
afp(bottom_top_blob_data[gi]); if (bias_term == 1) v = afp(scale_blob_data[gz]) * v + afp(bias_blob_data[gz]); diff --git a/src/layer/shader/scale_pack4.comp b/src/layer/shader/scale_pack4.comp index 08a6c6f40..aecdeb3d1 100644 --- a/src/layer/shader/scale_pack4.comp +++ b/src/layer/shader/scale_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -48,7 +51,7 @@ void main() if (p.dims == 1) { - afpvec4 v = bottom_top_blob_data[gx]; + afpvec4 v = afpvec4(bottom_top_blob_data[gx]); if (bias_term == 1) v = afpvec4(scale_blob_data[gx]) * v + afpvec4(bias_blob_data[gx]); @@ -64,7 +67,7 @@ void main() { const int gi = gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); if (bias_term == 1) v = afpvec4(scale_blob_data[gy]) * v + afpvec4(bias_blob_data[gy]); @@ -80,7 +83,7 @@ void main() { const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); if (bias_term == 1) v = afpvec4(scale_blob_data[gz]) * v + afpvec4(bias_blob_data[gz]); diff --git a/src/layer/shader/shufflechannel.comp b/src/layer/shader/shufflechannel.comp index 3a672c423..ec851a488 100644 --- a/src/layer/shader/shufflechannel.comp +++ b/src/layer/shader/shufflechannel.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/shufflechannel_pack4.comp b/src/layer/shader/shufflechannel_pack4.comp index c6afb92e9..ef8c76b11 100644 --- a/src/layer/shader/shufflechannel_pack4.comp +++ b/src/layer/shader/shufflechannel_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if 
NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -62,33 +65,25 @@ void main() ivec4 lane4 = z4 % 4; // v = v4[lane] - sfpvec4 v; - - sfpvec4 v4; - - v4 = bottom_blob_data[v_offset.r]; - if (lane4.r == 0) v.r = v4.r; - else if (lane4.r == 1) v.r = v4.g; - else if (lane4.r == 2) v.r = v4.b; - else /* if (lane4.r == 3) */ v.r = v4.a; - - v4 = bottom_blob_data[v_offset.g]; - if (lane4.g == 0) v.g = v4.r; - else if (lane4.g == 1) v.g = v4.g; - else if (lane4.g == 2) v.g = v4.b; - else /* if (lane4.g == 3) */ v.g = v4.a; - - v4 = bottom_blob_data[v_offset.b]; - if (lane4.b == 0) v.b = v4.r; - else if (lane4.b == 1) v.b = v4.g; - else if (lane4.b == 2) v.b = v4.b; - else /* if (lane4.b == 3) */ v.b = v4.a; - - v4 = bottom_blob_data[v_offset.a]; - if (lane4.a == 0) v.a = v4.r; - else if (lane4.a == 1) v.a = v4.g; - else if (lane4.a == 2) v.a = v4.b; - else /* if (lane4.a == 3) */ v.a = v4.a; - - top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; + int gi = gz * p.outcstep + gy * p.outw + gx; + + if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; + else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; + else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; + else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; + + if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r; + else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; + else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; + else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; + + if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; + else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; + else if (lane4.b == 2) top_blob_data[gi].b 
= bottom_blob_data[v_offset.b].b; + else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; + + if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; + else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; + else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; + else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a; } diff --git a/src/layer/shader/sigmoid.comp b/src/layer/shader/sigmoid.comp index 4db64499d..6351a0514 100644 --- a/src/layer/shader/sigmoid.comp +++ b/src/layer/shader/sigmoid.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); v = afp(1.f) / (afp(1.f) + exp(-v)); diff --git a/src/layer/shader/sigmoid_pack4.comp b/src/layer/shader/sigmoid_pack4.comp index 7e73a4db1..8c779178a 100644 --- a/src/layer/shader/sigmoid_pack4.comp +++ b/src/layer/shader/sigmoid_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); v = afp(1.f) / (afp(1.f) + exp(-v)); diff --git a/src/layer/shader/softmax_div_sum.comp b/src/layer/shader/softmax_div_sum.comp index 116b4592f..d6cf2d922 100644 --- a/src/layer/shader/softmax_div_sum.comp +++ b/src/layer/shader/softmax_div_sum.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if 
NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -53,8 +56,8 @@ void main() if (p.dims == 1) // axis == 0 { - afp sum = sum_workspace_data[0]; - afp v = bottom_top_blob_data[gx]; + afp sum = afp(sum_workspace_data[0]); + afp v = afp(bottom_top_blob_data[gx]); bottom_top_blob_data[gx] = sfp(v / sum); return; } @@ -62,8 +65,8 @@ void main() if (p.dims == 2 && axis == 0) { int gi = gy * p.w + gx; - afp sum = sum_workspace_data[gx]; - afp v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gx]); + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(v / sum); return; } @@ -71,8 +74,8 @@ void main() if (p.dims == 2 && axis == 1) { int gi = gy * p.w + gx; - afp sum = sum_workspace_data[gy]; - afp v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gy]); + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(v / sum); return; } @@ -80,8 +83,8 @@ void main() if (p.dims == 3 && axis == 0) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gy * p.w + gx]; - afp v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gy * p.w + gx]); + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(v / sum); return; } @@ -89,8 +92,8 @@ void main() if (p.dims == 3 && axis == 1) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gz * p.w + gx]; - afp v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gz * p.w + gx]); + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(v / sum); return; } @@ -98,8 +101,8 @@ void main() if (p.dims == 3 && axis == 2) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gz * p.h + gy]; - afp v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gz * p.h + gy]); + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(v / sum); return; } diff --git 
a/src/layer/shader/softmax_div_sum_pack4.comp b/src/layer/shader/softmax_div_sum_pack4.comp index 05a0fe846..14aeef06a 100644 --- a/src/layer/shader/softmax_div_sum_pack4.comp +++ b/src/layer/shader/softmax_div_sum_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -53,8 +56,8 @@ void main() if (p.dims == 1) // axis == 0 { - afp sum = sum_workspace_data[0]; - afpvec4 v = bottom_top_blob_data[gx]; + afp sum = afp(sum_workspace_data[0]); + afpvec4 v = afpvec4(bottom_top_blob_data[gx]); bottom_top_blob_data[gx] = sfpvec4(v / sum); return; } @@ -62,8 +65,8 @@ void main() if (p.dims == 2 && axis == 0) { int gi = gy * p.w + gx; - afp sum = sum_workspace_data[gx]; - afpvec4 v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gx]); + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(v / sum); return; } @@ -71,8 +74,8 @@ void main() if (p.dims == 2 && axis == 1) { int gi = gy * p.w + gx; - afp sum = sum_workspace_data[gy]; - afpvec4 v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gy]); + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(v / sum); return; } @@ -80,8 +83,8 @@ void main() if (p.dims == 3 && axis == 0) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gy * p.w + gx]; - afpvec4 v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gy * p.w + gx]); + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(v / sum); return; } @@ -89,8 +92,8 @@ void main() if (p.dims == 3 && axis == 1) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gz * p.w + gx]; - afpvec4 v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gz * p.w + gx]); + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); 
bottom_top_blob_data[gi] = sfpvec4(v / sum); return; } @@ -98,8 +101,8 @@ void main() if (p.dims == 3 && axis == 2) { int gi = gz * p.cstep + gy * p.w + gx; - afp sum = sum_workspace_data[gz * p.h + gy]; - afpvec4 v = bottom_top_blob_data[gi]; + afp sum = afp(sum_workspace_data[gz * p.h + gy]); + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(v / sum); return; } diff --git a/src/layer/shader/softmax_exp_sub_max.comp b/src/layer/shader/softmax_exp_sub_max.comp index 81f8ef620..fd06a817e 100644 --- a/src/layer/shader/softmax_exp_sub_max.comp +++ b/src/layer/shader/softmax_exp_sub_max.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/softmax_exp_sub_max_pack4.comp b/src/layer/shader/softmax_exp_sub_max_pack4.comp index 2d0a4e410..9ff016169 100644 --- a/src/layer/shader/softmax_exp_sub_max_pack4.comp +++ b/src/layer/shader/softmax_exp_sub_max_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/softmax_reduce_max.comp b/src/layer/shader/softmax_reduce_max.comp index af122ff30..de48ba897 100644 --- a/src/layer/shader/softmax_reduce_max.comp +++ b/src/layer/shader/softmax_reduce_max.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/softmax_reduce_max_pack4.comp b/src/layer/shader/softmax_reduce_max_pack4.comp index a67a7b2fc..9dc074ae6 100644 --- 
a/src/layer/shader/softmax_reduce_max_pack4.comp +++ b/src/layer/shader/softmax_reduce_max_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/softmax_reduce_sum.comp b/src/layer/shader/softmax_reduce_sum.comp index e9bed6f36..cd55483c3 100644 --- a/src/layer/shader/softmax_reduce_sum.comp +++ b/src/layer/shader/softmax_reduce_sum.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/softmax_reduce_sum_pack4.comp b/src/layer/shader/softmax_reduce_sum_pack4.comp index d11430d72..4b3f0826b 100644 --- a/src/layer/shader/softmax_reduce_sum_pack4.comp +++ b/src/layer/shader/softmax_reduce_sum_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif diff --git a/src/layer/shader/tanh.comp b/src/layer/shader/tanh.comp index 8c8d6fb2a..f178f6c6e 100644 --- a/src/layer/shader/tanh.comp +++ b/src/layer/shader/tanh.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfp(tanh(v)); } diff --git a/src/layer/shader/tanh_pack4.comp b/src/layer/shader/tanh_pack4.comp index 
052e702f8..d3771af71 100644 --- a/src/layer/shader/tanh_pack4.comp +++ b/src/layer/shader/tanh_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -44,7 +47,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); bottom_top_blob_data[gi] = sfpvec4(tanh(v)); } diff --git a/src/layer/shader/unaryop.comp b/src/layer/shader/unaryop.comp index 7a9512ffb..8f4ff91b2 100644 --- a/src/layer/shader/unaryop.comp +++ b/src/layer/shader/unaryop.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afp v = bottom_top_blob_data[gi]; + afp v = afp(bottom_top_blob_data[gi]); afp res; diff --git a/src/layer/shader/unaryop_pack4.comp b/src/layer/shader/unaryop_pack4.comp index 5ac0258c7..3fa124b09 100644 --- a/src/layer/shader/unaryop_pack4.comp +++ b/src/layer/shader/unaryop_pack4.comp @@ -14,7 +14,10 @@ #version 450 -#if NCNN_fp16_storage || NCNN_fp16_arithmetic +#if NCNN_fp16_storage +#extension GL_EXT_shader_16bit_storage: require +#endif +#if NCNN_fp16_arithmetic #extension GL_AMD_gpu_shader_half_float: require #endif @@ -46,7 +49,7 @@ void main() const int gi = gz * p.cstep + gy * p.w + gx; - afpvec4 v = bottom_top_blob_data[gi]; + afpvec4 v = afpvec4(bottom_top_blob_data[gi]); afpvec4 res;