| @@ -112,7 +112,7 @@ macro(ncnn_add_layer class) | |||||
| add_custom_command( | add_custom_command( | ||||
| OUTPUT ${SHADER_fp16s_SPV_HEX_FILE} | OUTPUT ${SHADER_fp16s_SPV_HEX_FILE} | ||||
| COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | COMMAND ${GLSLANGVALIDATOR_EXECUTABLE} | ||||
| ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dsfpmat4=f16mat4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC} | |||||
| ARGS -Dsfp=float16_t -Dsfpvec2=f16vec2 -Dsfpvec4=f16vec4 -Dafp=float -Dafpvec2=vec2 -Dafpvec4=vec4 -Dafpmat4=mat4 -DNCNN_fp16_storage=1 -V -s -e ${SHADER_fp16s_SRC_NAME_WE} --source-entrypoint main -x -o ${SHADER_fp16s_SPV_HEX_FILE} ${SHADER_SRC} | |||||
| DEPENDS ${SHADER_SRC} | DEPENDS ${SHADER_SRC} | ||||
| COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv" | COMMENT "Building SPIR-V module ${SHADER_fp16s_SRC_NAME_WE}.spv" | ||||
| VERBATIM | VERBATIM | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(abs(v)); | bottom_top_blob_data[gi] = sfp(abs(v)); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(abs(v)); | bottom_top_blob_data[gi] = sfpvec4(abs(v)); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,9 +49,9 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afp v = bottom_top_blob_data[gx]; | |||||
| afp v = afp(bottom_top_blob_data[gx]); | |||||
| v = b_data[gx] * v + a_data[gx]; | |||||
| v = afp(b_data[gx]) * v + afp(a_data[gx]); | |||||
| bottom_top_blob_data[gx] = sfp(v); | bottom_top_blob_data[gx] = sfp(v); | ||||
| @@ -59,9 +62,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| v = b_data[gy] * v + a_data[gy]; | |||||
| v = afp(b_data[gy]) * v + afp(a_data[gy]); | |||||
| bottom_top_blob_data[gi] = sfp(v); | bottom_top_blob_data[gi] = sfp(v); | ||||
| @@ -72,9 +75,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| v = b_data[gz] * v + a_data[gz]; | |||||
| v = afp(b_data[gz]) * v + afp(a_data[gz]); | |||||
| bottom_top_blob_data[gi] = sfp(v); | bottom_top_blob_data[gi] = sfp(v); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,9 +49,9 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afpvec4 v = bottom_top_blob_data[gx]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gx]); | |||||
| v = b_data[gx] * v + a_data[gx]; | |||||
| v = afpvec4(b_data[gx]) * v + afpvec4(a_data[gx]); | |||||
| bottom_top_blob_data[gx] = sfpvec4(v); | bottom_top_blob_data[gx] = sfpvec4(v); | ||||
| @@ -59,9 +62,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| v = b_data[gy] * v + a_data[gy]; | |||||
| v = afpvec4(b_data[gy]) * v + afpvec4(a_data[gy]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v); | bottom_top_blob_data[gi] = sfpvec4(v); | ||||
| @@ -72,9 +75,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| v = b_data[gz] * v + a_data[gz]; | |||||
| v = afpvec4(b_data[gz]) * v + afpvec4(a_data[gz]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v); | bottom_top_blob_data[gi] = sfpvec4(v); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -62,7 +65,7 @@ void main() | |||||
| const int gi = gz * p.outcstep + gy * p.outw + gx; | const int gi = gz * p.outcstep + gy * p.outw + gx; | ||||
| afp v1 = a_blob_data[gi]; | |||||
| afp v1 = afp(a_blob_data[gi]); | |||||
| afp res; | afp res; | ||||
| @@ -87,7 +90,7 @@ void main() | |||||
| if (p.adims == p.bdims) | if (p.adims == p.bdims) | ||||
| { | { | ||||
| afp v2 = b_blob_data[gi]; | |||||
| afp v2 = afp(b_blob_data[gi]); | |||||
| if (op_type == 0) res = v1 + v2; | if (op_type == 0) res = v1 + v2; | ||||
| if (op_type == 1) res = v1 - v2; | if (op_type == 1) res = v1 - v2; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -62,7 +65,7 @@ void main() | |||||
| const int gi = gz * p.outcstep + gy * p.outw + gx; | const int gi = gz * p.outcstep + gy * p.outw + gx; | ||||
| afpvec4 v1 = a_blob_data[gi]; | |||||
| afpvec4 v1 = afpvec4(a_blob_data[gi]); | |||||
| afpvec4 res; | afpvec4 res; | ||||
| @@ -87,7 +90,7 @@ void main() | |||||
| if (p.adims == p.bdims) | if (p.adims == p.bdims) | ||||
| { | { | ||||
| afpvec4 v2 = b_blob_data[gi]; | |||||
| afpvec4 v2 = afpvec4(b_blob_data[gi]); | |||||
| if (op_type == 0) res = v1 + v2; | if (op_type == 0) res = v1 + v2; | ||||
| if (op_type == 1) res = v1 - v2; | if (op_type == 1) res = v1 - v2; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -47,7 +50,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(clamp(v, afp(const_min), afp(const_max))); | bottom_top_blob_data[gi] = sfp(clamp(v, afp(const_min), afp(const_max))); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -47,7 +50,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(clamp(v, afp(const_min), afp(const_max))); | bottom_top_blob_data[gi] = sfpvec4(clamp(v, afp(const_min), afp(const_max))); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -53,39 +56,34 @@ void main() | |||||
| if (gx >= p.w || gy >= p.h || gz >= p.c) | if (gx >= p.w || gy >= p.h || gz >= p.c) | ||||
| return; | return; | ||||
| int v_offset; | |||||
| sfp v; | |||||
| if (p.dims == 1) // axis == 0 | if (p.dims == 1) // axis == 0 | ||||
| { | { | ||||
| v_offset = gx + p.offset; | |||||
| v = bottom_blob_data[gx]; | |||||
| int v_offset = gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gx]; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 0) | else if (p.dims == 2 && axis == 0) | ||||
| { | { | ||||
| v_offset = (gy + p.offset) * p.outw + gx; | |||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| int v_offset = (gy + p.offset) * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 1) | else if (p.dims == 2 && axis == 1) | ||||
| { | { | ||||
| v_offset = gy * p.outw + gx + p.offset; | |||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| int v_offset = gy * p.outw + gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 0) | else if (p.dims == 3 && axis == 0) | ||||
| { | { | ||||
| v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 1) | else if (p.dims == 3 && axis == 1) | ||||
| { | { | ||||
| v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 2) | else if (p.dims == 3 && axis == 2) | ||||
| { | { | ||||
| v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| top_blob_data[v_offset] = v; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -53,39 +56,34 @@ void main() | |||||
| if (gx >= p.w || gy >= p.h || gz >= p.c) | if (gx >= p.w || gy >= p.h || gz >= p.c) | ||||
| return; | return; | ||||
| int v_offset; | |||||
| sfpvec4 v; | |||||
| if (p.dims == 1) // axis == 0 | if (p.dims == 1) // axis == 0 | ||||
| { | { | ||||
| v_offset = gx + p.offset; | |||||
| v = bottom_blob_data[gx]; | |||||
| int v_offset = gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gx]; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 0) | else if (p.dims == 2 && axis == 0) | ||||
| { | { | ||||
| v_offset = (gy + p.offset) * p.outw + gx; | |||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| int v_offset = (gy + p.offset) * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 1) | else if (p.dims == 2 && axis == 1) | ||||
| { | { | ||||
| v_offset = gy * p.outw + gx + p.offset; | |||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| int v_offset = gy * p.outw + gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 0) | else if (p.dims == 3 && axis == 0) | ||||
| { | { | ||||
| v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = (gz + p.offset) * p.outcstep + gy * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 1) | else if (p.dims == 3 && axis == 1) | ||||
| { | { | ||||
| v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = gz * p.outcstep + (gy + p.offset) * p.outw + gx; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 2) | else if (p.dims == 3 && axis == 2) | ||||
| { | { | ||||
| v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; | |||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int v_offset = gz * p.outcstep + gy * p.outw + gx + p.offset; | |||||
| top_blob_data[v_offset] = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| } | } | ||||
| top_blob_data[v_offset] = v; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -54,41 +57,41 @@ void main() | |||||
| return; | return; | ||||
| ivec4 v_offset; | ivec4 v_offset; | ||||
| sfpvec4 v; | |||||
| int gi; | |||||
| if (p.dims == 1) // axis == 0 | if (p.dims == 1) // axis == 0 | ||||
| { | { | ||||
| v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3); | v_offset = ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3); | ||||
| v = bottom_blob_data[gx]; | |||||
| gi = gx; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 0) | else if (p.dims == 2 && axis == 0) | ||||
| { | { | ||||
| v_offset = (ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx; | v_offset = (ivec4(gx * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outw + gx; | ||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| gi = gy * p.w + gx; | |||||
| } | } | ||||
| else if (p.dims == 2 && axis == 1) | else if (p.dims == 2 && axis == 1) | ||||
| { | { | ||||
| v_offset = (ivec4(gx * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset; | v_offset = (ivec4(gx * 4) + ivec4(0, 1, 2, 3)) * p.outw + gx + p.offset; | ||||
| v = bottom_blob_data[gy * p.w + gx]; | |||||
| gi = gy * p.w + gx; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 0) | else if (p.dims == 3 && axis == 0) | ||||
| { | { | ||||
| v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx; | v_offset = (ivec4(gz * 4 + p.offset) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx; | ||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| gi = gz * p.cstep + gy * p.w + gx; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 1) | else if (p.dims == 3 && axis == 1) | ||||
| { | { | ||||
| v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx; | v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + (gy + p.offset) * p.outw + gx; | ||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| gi = gz * p.cstep + gy * p.w + gx; | |||||
| } | } | ||||
| else if (p.dims == 3 && axis == 2) | else if (p.dims == 3 && axis == 2) | ||||
| { | { | ||||
| v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx + p.offset; | v_offset = (ivec4(gz * 4) + ivec4(0, 1, 2, 3)) * p.outcstep + gy * p.outw + gx + p.offset; | ||||
| v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| gi = gz * p.cstep + gy * p.w + gx; | |||||
| } | } | ||||
| top_blob_data[v_offset.r] = v.r; | |||||
| top_blob_data[v_offset.g] = v.g; | |||||
| top_blob_data[v_offset.b] = v.b; | |||||
| top_blob_data[v_offset.a] = v.a; | |||||
| top_blob_data[v_offset.r] = bottom_blob_data[gi].r; | |||||
| top_blob_data[v_offset.g] = bottom_blob_data[gi].g; | |||||
| top_blob_data[v_offset.b] = bottom_blob_data[gi].b; | |||||
| top_blob_data[v_offset.a] = bottom_blob_data[gi].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -80,9 +83,9 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afp v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afp v = afp(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| afpvec4 k = weight_data[w_offset + x]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + x]); | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in; | |||||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | ||||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | ||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||||
| #else | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | ||||
| #endif | |||||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | ||||
| layout (push_constant) uniform parameter | layout (push_constant) uniform parameter | ||||
| @@ -63,7 +71,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -80,9 +88,19 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afpmat4 k = weight_data[w_offset + x]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| afpmat4 k = afpmat4( | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 0]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 1]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 2]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 3]) | |||||
| ); | |||||
| #else | |||||
| afpmat4 k = afpmat4(weight_data[w_offset + x]); | |||||
| #endif | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -80,9 +83,9 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| afpvec4 k = weight_data[w_offset + x]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + x]); | |||||
| sum += dot(v, k); | sum += dot(v, k); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -89,9 +92,9 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afp v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afp v = afp(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| afpvec4 k = weight_data[w_offset + x]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + x]); | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in; | |||||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | ||||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | ||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||||
| #else | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | ||||
| #endif | |||||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | ||||
| layout (push_constant) uniform parameter | layout (push_constant) uniform parameter | ||||
| @@ -64,7 +72,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -89,9 +97,19 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| afpmat4 k = afpmat4( | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 0]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 1]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 2]), | |||||
| afpvec4(weight_data[(w_offset + x) * 4 + 3]) | |||||
| ); | |||||
| #else | |||||
| afpmat4 k = weight_data[w_offset + x]; | afpmat4 k = weight_data[w_offset + x]; | ||||
| #endif | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -89,9 +92,9 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| afpvec4 k = weight_data[w_offset + x]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + x]); | |||||
| sum += dot(v, k); | sum += dot(v, k); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,11 +67,11 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| sum = afpvec4(0.0); | |||||
| sum = afpvec4(0.f); | |||||
| } | } | ||||
| // depth-wise convolution | // depth-wise convolution | ||||
| @@ -79,9 +82,9 @@ void main() | |||||
| { | { | ||||
| for (int x = 0; x < kernel_w; x++) | for (int x = 0; x < kernel_w; x++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset + x * dilation_w]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset + x * dilation_w]); | |||||
| afpvec4 k = weight_data[w_offset + x]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + x]); | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -100,9 +103,9 @@ void main() | |||||
| for (int z = 0; z < p.c; z++) | for (int z = 0; z < p.c; z++) | ||||
| { | { | ||||
| afp v = bottom_blob_data[v_offset]; | |||||
| afp v = afp(bottom_blob_data[v_offset]); | |||||
| afpvec4 k = weight_data[w_offset]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset]); | |||||
| sum += v * k; | sum += v * k; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -32,7 +35,12 @@ layout (local_size_z_id = 235) in; | |||||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | ||||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | ||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||||
| #else | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | ||||
| #endif | |||||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | ||||
| layout (push_constant) uniform parameter | layout (push_constant) uniform parameter | ||||
| @@ -63,7 +71,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -100,9 +108,19 @@ void main() | |||||
| for (int z = 0; z < p.c; z++) | for (int z = 0; z < p.c; z++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpmat4 k = weight_data[w_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| // In this configuration weight_data is declared as sfpvec4[], so the matrix | |||||
| // that the #else branch reads as weight_data[w_offset] is assembled from the | |||||
| // four consecutive vec4 entries starting at w_offset * 4. | |||||
| afpmat4 k = afpmat4( | |||||
| afpvec4(weight_data[w_offset * 4 + 0]), | |||||
| afpvec4(weight_data[w_offset * 4 + 1]), | |||||
| afpvec4(weight_data[w_offset * 4 + 2]), | |||||
| afpvec4(weight_data[w_offset * 4 + 3]) | |||||
| ); | |||||
| #else | |||||
| afpmat4 k = afpmat4(weight_data[w_offset]); | |||||
| #endif | |||||
| sum += v * k; | sum += v * k; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -100,9 +103,9 @@ void main() | |||||
| for (int z = 0; z < p.c; z++) | for (int z = 0; z < p.c; z++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| afpvec4 k = weight_data[w_offset]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset]); | |||||
| sum += dot(v, k); | sum += dot(v, k); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -109,9 +112,9 @@ void main() | |||||
| for (int z = 0; z < channels_g; z++) | for (int z = 0; z < channels_g; z++) | ||||
| { | { | ||||
| afp v = bottom_blob_data[v_offset]; | |||||
| afp v = afp(bottom_blob_data[v_offset]); | |||||
| afpvec4 k = weight_data[w_offset]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset]); | |||||
| sum += v * k; | sum += v * k; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -33,7 +36,12 @@ layout (local_size_z_id = 235) in; | |||||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | ||||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | ||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||||
| #else | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | ||||
| #endif | |||||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | ||||
| layout (push_constant) uniform parameter | layout (push_constant) uniform parameter | ||||
| @@ -64,7 +72,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -109,9 +117,19 @@ void main() | |||||
| for (int z = 0; z < channels_g; z++) | for (int z = 0; z < channels_g; z++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpmat4 k = weight_data[w_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| // In this configuration weight_data is declared as sfpvec4[], so the matrix | |||||
| // that the #else branch reads as weight_data[w_offset] is assembled from the | |||||
| // four consecutive vec4 entries starting at w_offset * 4. | |||||
| afpmat4 k = afpmat4( | |||||
| afpvec4(weight_data[w_offset * 4 + 0]), | |||||
| afpvec4(weight_data[w_offset * 4 + 1]), | |||||
| afpvec4(weight_data[w_offset * 4 + 2]), | |||||
| afpvec4(weight_data[w_offset * 4 + 3]) | |||||
| ); | |||||
| #else | |||||
| afpmat4 k = afpmat4(weight_data[w_offset]); | |||||
| #endif | |||||
| sum += v * k; | sum += v * k; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afp(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -109,9 +112,9 @@ void main() | |||||
| for (int z = 0; z < channels_g; z++) | for (int z = 0; z < channels_g; z++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| afpvec4 k = weight_data[w_offset]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset]); | |||||
| sum += dot(v, k); | sum += dot(v, k); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gz]; | |||||
| sum = afpvec4(bias_data[gz]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -101,9 +104,9 @@ void main() | |||||
| int v_offset = v_offset_0 + sy * p.w + sx; | int v_offset = v_offset_0 + sy * p.w + sx; | ||||
| int w_offset = w_offset_0 + y * kernel_w + x; | int w_offset = w_offset_0 + y * kernel_w + x; | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| afpvec4 k = weight_data[w_offset]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset]); | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| v *= afp(scale); | v *= afp(scale); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| v *= afp(scale); | v *= afp(scale); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -52,8 +55,8 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v1 = bottom_blob1_data[gi]; | |||||
| afp v2 = bottom_blob2_data[gi]; | |||||
| afp v1 = afp(bottom_blob1_data[gi]); | |||||
| afp v2 = afp(bottom_blob2_data[gi]); | |||||
| afp res; | afp res; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -52,8 +55,8 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v1 = bottom_blob1_data[gi]; | |||||
| afpvec4 v2 = bottom_blob2_data[gi]; | |||||
| afpvec4 v1 = afpvec4(bottom_blob1_data[gi]); | |||||
| afpvec4 v2 = afpvec4(bottom_blob2_data[gi]); | |||||
| afpvec4 res; | afpvec4 res; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -74,11 +77,8 @@ void main() | |||||
| v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4; | v_offset = ivec4(z * 4 * p.cstep) + si4 * 4 + k4; | ||||
| } | } | ||||
| sfpvec4 v; | |||||
| v.r = bottom_blob_data[v_offset.r]; | |||||
| v.g = bottom_blob_data[v_offset.g]; | |||||
| v.b = bottom_blob_data[v_offset.b]; | |||||
| v.a = bottom_blob_data[v_offset.a]; | |||||
| top_blob_data[gx] = v; | |||||
| top_blob_data[gx].r = bottom_blob_data[v_offset.r]; | |||||
| top_blob_data[gx].g = bottom_blob_data[v_offset.g]; | |||||
| top_blob_data[gx].b = bottom_blob_data[v_offset.b]; | |||||
| top_blob_data[gx].a = bottom_blob_data[v_offset.a]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -57,7 +60,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gx]; | |||||
| sum = afp(bias_data[gx]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -57,20 +60,20 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gx]; | |||||
| sum = afpvec4(bias_data[gx]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| sum = afpvec4(0.0); | |||||
| sum = afpvec4(0.f); | |||||
| } | } | ||||
| int w_offset = gx * p.w; | int w_offset = gx * p.w; | ||||
| for (int i = 0; i < p.w; i++) | for (int i = 0; i < p.w; i++) | ||||
| { | { | ||||
| afp v = bottom_blob_data[i]; | |||||
| afp v = afp(bottom_blob_data[i]); | |||||
| afpvec4 k = weight_data[w_offset + i]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + i]); | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -26,7 +29,12 @@ layout (local_size_z_id = 235) in; | |||||
| layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | layout (binding = 0) readonly buffer bottom_blob { sfpvec4 bottom_blob_data[]; }; | ||||
| layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | layout (binding = 1) writeonly buffer top_blob { sfpvec4 top_blob_data[]; }; | ||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpvec4 weight_data[]; }; | |||||
| #else | |||||
| layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | layout (binding = 2) readonly buffer weight_blob { sfpmat4 weight_data[]; }; | ||||
| #endif | |||||
| layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | layout (binding = 3) readonly buffer bias_blob { sfpvec4 bias_data[]; }; | ||||
| layout (push_constant) uniform parameter | layout (push_constant) uniform parameter | ||||
| @@ -57,20 +65,30 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gx]; | |||||
| sum = afpvec4(bias_data[gx]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| sum = afpvec4(0.0); | |||||
| sum = afpvec4(0.f); | |||||
| } | } | ||||
| int w_offset = gx * p.w; | int w_offset = gx * p.w; | ||||
| for (int i = 0; i < p.w; i++) | for (int i = 0; i < p.w; i++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[i]; | |||||
| afpmat4 k = weight_data[w_offset + i]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[i]); | |||||
| #if NCNN_fp16_storage && !NCNN_fp16_arithmetic | |||||
| // GL_EXT_shader_16bit_storage does not define f16mat4 type :( | |||||
| afpmat4 k = afpmat4( | |||||
| afpvec4(weight_data[(w_offset + i) * 4 + 0]), | |||||
| afpvec4(weight_data[(w_offset + i) * 4 + 1]), | |||||
| afpvec4(weight_data[(w_offset + i) * 4 + 2]), | |||||
| afpvec4(weight_data[(w_offset + i) * 4 + 3]) | |||||
| ); | |||||
| #else | |||||
| afpmat4 k = afpmat4(weight_data[w_offset + i]); | |||||
| #endif | |||||
| sum += v * k; | sum += v * k; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -57,7 +60,7 @@ void main() | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| { | { | ||||
| sum = bias_data[gx]; | |||||
| sum = afp(bias_data[gx]); | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| @@ -68,9 +71,9 @@ void main() | |||||
| for (int i = 0; i < p.w; i++) | for (int i = 0; i < p.w; i++) | ||||
| { | { | ||||
| afpvec4 v = bottom_blob_data[i]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[i]); | |||||
| afpvec4 k = weight_data[w_offset + i]; | |||||
| afpvec4 k = afpvec4(weight_data[w_offset + i]); | |||||
| sum += dot(v, k); | sum += dot(v, k); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -54,8 +57,6 @@ void main() | |||||
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | ||||
| return; | return; | ||||
| afp res; | |||||
| if (resize_type == 1) // nearest | if (resize_type == 1) // nearest | ||||
| { | { | ||||
| afpvec2 gxy = afpvec2(gx, gy); | afpvec2 gxy = afpvec2(gx, gy); | ||||
| @@ -67,7 +68,7 @@ void main() | |||||
| int v_offset = gz * p.cstep + sy * p.w + sx; | int v_offset = gz * p.cstep + sy * p.w + sx; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| else if (resize_type == 2) // bilinear | else if (resize_type == 2) // bilinear | ||||
| { | { | ||||
| @@ -94,17 +95,15 @@ void main() | |||||
| int v_offset_0 = gz * p.cstep + sy * p.w + sx; | int v_offset_0 = gz * p.cstep + sy * p.w + sx; | ||||
| int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; | int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; | ||||
| afp a0 = bottom_blob_data[v_offset_0]; | |||||
| afp a1 = bottom_blob_data[v_offset_0 + 1]; | |||||
| afp b0 = bottom_blob_data[v_offset_1]; | |||||
| afp b1 = bottom_blob_data[v_offset_1 + 1]; | |||||
| afp a0 = afp(bottom_blob_data[v_offset_0]); | |||||
| afp a1 = afp(bottom_blob_data[v_offset_0 + 1]); | |||||
| afp b0 = afp(bottom_blob_data[v_offset_1]); | |||||
| afp b1 = afp(bottom_blob_data[v_offset_1 + 1]); | |||||
| afp fx = fxy.r; | afp fx = fxy.r; | ||||
| afp fy = fxy.g; | afp fy = fxy.g; | ||||
| afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; | afpvec2 ab = afpvec2(a0, b0) * (afp(1.f) - fx) + afpvec2(a1, b1) * fx; | ||||
| res = ab.r * (afp(1.f) - fy) + ab.g * fy; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(ab.r * (afp(1.f) - fy) + ab.g * fy); | |||||
| } | } | ||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(res); | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -54,7 +57,7 @@ void main() | |||||
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | ||||
| return; | return; | ||||
| afpvec4 res; | |||||
| // afpvec4 res; | |||||
| if (resize_type == 1) // nearest | if (resize_type == 1) // nearest | ||||
| { | { | ||||
| @@ -67,7 +70,7 @@ void main() | |||||
| int v_offset = gz * p.cstep + sy * p.w + sx; | int v_offset = gz * p.cstep + sy * p.w + sx; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| else if (resize_type == 2) // bilinear | else if (resize_type == 2) // bilinear | ||||
| { | { | ||||
| @@ -94,10 +97,10 @@ void main() | |||||
| int v_offset_0 = gz * p.cstep + sy * p.w + sx; | int v_offset_0 = gz * p.cstep + sy * p.w + sx; | ||||
| int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; | int v_offset_1 = gz * p.cstep + (sy + 1) * p.w + sx; | ||||
| afpvec4 a0 = bottom_blob_data[v_offset_0]; | |||||
| afpvec4 a1 = bottom_blob_data[v_offset_0 + 1]; | |||||
| afpvec4 b0 = bottom_blob_data[v_offset_1]; | |||||
| afpvec4 b1 = bottom_blob_data[v_offset_1 + 1]; | |||||
| afpvec4 a0 = afpvec4(bottom_blob_data[v_offset_0]); | |||||
| afpvec4 a1 = afpvec4(bottom_blob_data[v_offset_0 + 1]); | |||||
| afpvec4 b0 = afpvec4(bottom_blob_data[v_offset_1]); | |||||
| afpvec4 b1 = afpvec4(bottom_blob_data[v_offset_1 + 1]); | |||||
| afp fx = fxy.r; | afp fx = fxy.r; | ||||
| afp fy = fxy.g; | afp fy = fxy.g; | ||||
| @@ -105,8 +108,6 @@ void main() | |||||
| afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; | afpvec4 a = a0 * (afp(1.f) - fx) + a1 * fx; | ||||
| afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; | afpvec4 b = b0 * (afp(1.f) - fx) + b1 * fx; | ||||
| res = a * (afp(1.f) - fy) + b * fy; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(a * (afp(1.f) - fy) + b * fy); | |||||
| } | } | ||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(res); | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -95,7 +98,7 @@ void main() | |||||
| scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta)); | scale = pow(afp(bias_constant) + alpha_div_size * sum, afp(-beta)); | ||||
| } | } | ||||
| afp v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; | |||||
| afp v = afp(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]); | |||||
| v *= scale; | v *= scale; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -75,7 +78,7 @@ void main() | |||||
| afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); | afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); | ||||
| afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]); | |||||
| v *= scale; | v *= scale; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -75,7 +78,7 @@ void main() | |||||
| afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); | afpvec4 scale = pow(afp(bias_constant) + alpha_div_size * sum, afpvec4(-beta)); | ||||
| afpvec4 v = bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gz * p.outcstep + gy * p.outw + gx]); | |||||
| v *= scale; | v *= scale; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -62,7 +65,7 @@ void main() | |||||
| if (z >= 0 && z < p.c) | if (z >= 0 && z < p.c) | ||||
| { | { | ||||
| int v_offset = z * p.cstep + gy * p.w + gx; | int v_offset = z * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_blob_data[v_offset]; | |||||
| afp v = afp(bottom_blob_data[v_offset]); | |||||
| res = v * v; | res = v * v; | ||||
| } | } | ||||
| else | else | ||||
| @@ -78,7 +81,7 @@ void main() | |||||
| if (x >= 0 && x < p.w && y >= 0 && y < p.h) | if (x >= 0 && x < p.w && y >= 0 && y < p.h) | ||||
| { | { | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| afp v = bottom_blob_data[v_offset]; | |||||
| afp v = afp(bottom_blob_data[v_offset]); | |||||
| res = v * v; | res = v * v; | ||||
| } | } | ||||
| else | else | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -62,7 +65,7 @@ void main() | |||||
| if (z >= 0 && z < p.c) | if (z >= 0 && z < p.c) | ||||
| { | { | ||||
| int v_offset = z * p.cstep + gy * p.w + gx; | int v_offset = z * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v4 = bottom_blob_data[v_offset]; | |||||
| afpvec4 v4 = afpvec4(bottom_blob_data[v_offset]); | |||||
| int lane = (gz - pad_head) % 4; | int lane = (gz - pad_head) % 4; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,7 +66,7 @@ void main() | |||||
| if (x >= 0 && x < p.w && y >= 0 && y < p.h) | if (x >= 0 && x < p.w && y >= 0 && y < p.h) | ||||
| { | { | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| afpvec4 v = bottom_blob_data[v_offset]; | |||||
| afpvec4 v = afpvec4(bottom_blob_data[v_offset]); | |||||
| res = v * v; | res = v * v; | ||||
| } | } | ||||
| else | else | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -55,35 +58,25 @@ void main() | |||||
| { | { | ||||
| ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); | ivec4 x4 = ivec4(gx * 4) + ivec4(0, 1, 2, 3); | ||||
| // prevent out of range access | |||||
| // x4 = min(x4, p.w - 1); | |||||
| v_offset = x4; | v_offset = x4; | ||||
| } | } | ||||
| else if (p.dims == 2) | else if (p.dims == 2) | ||||
| { | { | ||||
| ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); | ivec4 y4 = ivec4(gy * 4) + ivec4(0, 1, 2, 3); | ||||
| // prevent out of range access | |||||
| // y4 = min(y4, p.h - 1); | |||||
| v_offset = y4 * p.w + gx; | v_offset = y4 * p.w + gx; | ||||
| } | } | ||||
| else // if (p.dims == 3) | else // if (p.dims == 3) | ||||
| { | { | ||||
| ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); | ivec4 z4 = ivec4(gz * 4) + ivec4(0, 1, 2, 3); | ||||
| // prevent out of range access | |||||
| z4 = min(z4, p.c - 1); | |||||
| v_offset = z4 * p.cstep + ivec4(gy * p.w + gx); | v_offset = z4 * p.cstep + ivec4(gy * p.w + gx); | ||||
| } | } | ||||
| sfpvec4 v; | |||||
| v.r = bottom_blob_data[v_offset.r]; | |||||
| v.g = bottom_blob_data[v_offset.g]; | |||||
| v.b = bottom_blob_data[v_offset.b]; | |||||
| v.a = bottom_blob_data[v_offset.a]; | |||||
| int gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| top_blob_data[gi].r = bottom_blob_data[v_offset.r]; | |||||
| top_blob_data[gi].g = bottom_blob_data[v_offset.g]; | |||||
| top_blob_data[gi].b = bottom_blob_data[v_offset.b]; | |||||
| top_blob_data[gi].a = bottom_blob_data[v_offset.a]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -70,10 +73,10 @@ void main() | |||||
| v_offset = z4 * p.outcstep + ivec4(gy * p.outw + gx); | v_offset = z4 * p.outcstep + ivec4(gy * p.outw + gx); | ||||
| } | } | ||||
| sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int gi = gz * p.cstep + gy * p.w + gx; | |||||
| top_blob_data[v_offset.r] = v.r; | |||||
| top_blob_data[v_offset.g] = v.g; | |||||
| top_blob_data[v_offset.b] = v.b; | |||||
| top_blob_data[v_offset.a] = v.a; | |||||
| top_blob_data[v_offset.r] = bottom_blob_data[gi].r; | |||||
| top_blob_data[v_offset.g] = bottom_blob_data[gi].g; | |||||
| top_blob_data[v_offset.b] = bottom_blob_data[gi].b; | |||||
| top_blob_data[v_offset.a] = bottom_blob_data[gi].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -56,8 +59,6 @@ void main() | |||||
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | ||||
| return; | return; | ||||
| sfp res; | |||||
| int x = gx - left; | int x = gx - left; | ||||
| int y = gy - top; | int y = gy - top; | ||||
| @@ -66,11 +67,11 @@ void main() | |||||
| if (x >= 0 && x < p.w && y >= 0 && y < p.h) | if (x >= 0 && x < p.w && y >= 0 && y < p.h) | ||||
| { | { | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| res = sfp(value); | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfp(value); | |||||
| } | } | ||||
| } | } | ||||
| else if (type == 1) | else if (type == 1) | ||||
| @@ -79,8 +80,6 @@ void main() | |||||
| y = clamp(y, 0, p.h - 1); | y = clamp(y, 0, p.h - 1); | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -56,8 +59,6 @@ void main() | |||||
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | ||||
| return; | return; | ||||
| sfpvec4 res; | |||||
| int x = gx - left; | int x = gx - left; | ||||
| int y = gy - top; | int y = gy - top; | ||||
| @@ -66,11 +67,11 @@ void main() | |||||
| if (x >= 0 && x < p.w && y >= 0 && y < p.h) | if (x >= 0 && x < p.w && y >= 0 && y < p.h) | ||||
| { | { | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| else | else | ||||
| { | { | ||||
| res = sfpvec4(value); | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = sfpvec4(value); | |||||
| } | } | ||||
| } | } | ||||
| else if (type == 1) | else if (type == 1) | ||||
| @@ -79,8 +80,6 @@ void main() | |||||
| y = clamp(y, 0, p.h - 1); | y = clamp(y, 0, p.h - 1); | ||||
| int v_offset = gz * p.cstep + y * p.w + x; | int v_offset = gz * p.cstep + y * p.w + x; | ||||
| res = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = res; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -104,10 +107,10 @@ void main() | |||||
| } | } | ||||
| } | } | ||||
| sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int gi = gz * p.cstep + gy * p.w + gx; | |||||
| top_blob_data[v_offset.r] = v.r; | |||||
| top_blob_data[v_offset.g] = v.g; | |||||
| top_blob_data[v_offset.b] = v.b; | |||||
| top_blob_data[v_offset.a] = v.a; | |||||
| top_blob_data[v_offset.r] = bottom_blob_data[gi].r; | |||||
| top_blob_data[v_offset.g] = bottom_blob_data[gi].g; | |||||
| top_blob_data[v_offset.b] = bottom_blob_data[gi].b; | |||||
| top_blob_data[v_offset.a] = bottom_blob_data[gi].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -47,9 +50,9 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afp v = bottom_top_blob_data[gx]; | |||||
| afp v = afp(bottom_top_blob_data[gx]); | |||||
| afp slope = num_slope > 1 ? slope_blob_data[gx] : slope_blob_data[0]; | |||||
| afp slope = num_slope > 1 ? afp(slope_blob_data[gx]) : afp(slope_blob_data[0]); | |||||
| v = v < afp(0.f) ? v * slope : v; | v = v < afp(0.f) ? v * slope : v; | ||||
| @@ -62,9 +65,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| afp slope = num_slope > 1 ? slope_blob_data[gy] : slope_blob_data[0]; | |||||
| afp slope = num_slope > 1 ? afp(slope_blob_data[gy]) : afp(slope_blob_data[0]); | |||||
| v = v < afp(0.f) ? v * slope : v; | v = v < afp(0.f) ? v * slope : v; | ||||
| @@ -77,9 +80,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| afp slope = num_slope > 1 ? slope_blob_data[gz] : slope_blob_data[0]; | |||||
| afp slope = num_slope > 1 ? afp(slope_blob_data[gz]) : afp(slope_blob_data[0]); | |||||
| v = v < afp(0.f) ? v * slope : v; | v = v < afp(0.f) ? v * slope : v; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -47,9 +50,9 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afpvec4 v = bottom_top_blob_data[gx]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gx]); | |||||
| afpvec4 slope = num_slope > 1 ? slope_blob_data[gx] : afpvec4(slope_blob_data[0]); | |||||
| afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gx]) : afpvec4(slope_blob_data[0]); | |||||
| v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | ||||
| @@ -62,9 +65,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| afpvec4 slope = num_slope > 1 ? slope_blob_data[gy] : afpvec4(slope_blob_data[0]); | |||||
| afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gy]) : afpvec4(slope_blob_data[0]); | |||||
| v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | ||||
| @@ -77,9 +80,9 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| afpvec4 slope = num_slope > 1 ? slope_blob_data[gz] : afpvec4(slope_blob_data[0]); | |||||
| afpvec4 slope = num_slope > 1 ? afpvec4(slope_blob_data[gz]) : afpvec4(slope_blob_data[0]); | |||||
| v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | v = mix(v, v * slope, lessThan(v, afpvec4(0.f))); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -69,14 +72,12 @@ void main() | |||||
| afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); | afpvec4 image_norm = afp(1.f) / afpvec4(p.image_w, p.image_h, p.image_w, p.image_h); | ||||
| sfpvec4 variance = sfpvec4(variances_0, variances_1, variances_2, variances_3); | |||||
| afpvec4 box; | afpvec4 box; | ||||
| afp box_w; | afp box_w; | ||||
| afp box_h; | afp box_h; | ||||
| afp min_size = min_sizes_data[gx]; | |||||
| afp min_size = afp(min_sizes_data[gx]); | |||||
| // min size box | // min size box | ||||
| box_w = box_h = min_size; | box_w = box_h = min_size; | ||||
| @@ -84,14 +85,17 @@ void main() | |||||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | ||||
| top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | ||||
| top_blob_data[var_offset] = variance; | |||||
| top_blob_data[var_offset].r = sfp(variances_0); | |||||
| top_blob_data[var_offset].g = sfp(variances_1); | |||||
| top_blob_data[var_offset].b = sfp(variances_2); | |||||
| top_blob_data[var_offset].a = sfp(variances_3); | |||||
| v_offset += 1; | v_offset += 1; | ||||
| var_offset += 1; | var_offset += 1; | ||||
| if (num_max_size > 0) | if (num_max_size > 0) | ||||
| { | { | ||||
| afp max_size = max_sizes_data[gx]; | |||||
| afp max_size = afp(max_sizes_data[gx]); | |||||
| // max size box | // max size box | ||||
| box_w = box_h = sqrt(min_size * max_size); | box_w = box_h = sqrt(min_size * max_size); | ||||
| @@ -99,7 +103,10 @@ void main() | |||||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | ||||
| top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | ||||
| top_blob_data[var_offset] = variance; | |||||
| top_blob_data[var_offset].r = sfp(variances_0); | |||||
| top_blob_data[var_offset].g = sfp(variances_1); | |||||
| top_blob_data[var_offset].b = sfp(variances_2); | |||||
| top_blob_data[var_offset].a = sfp(variances_3); | |||||
| v_offset += 1; | v_offset += 1; | ||||
| var_offset += 1; | var_offset += 1; | ||||
| @@ -108,7 +115,7 @@ void main() | |||||
| // all aspect_ratios | // all aspect_ratios | ||||
| for (int pi = 0; pi < num_aspect_ratio; pi++) | for (int pi = 0; pi < num_aspect_ratio; pi++) | ||||
| { | { | ||||
| afp ar = aspect_ratios_data[pi]; | |||||
| afp ar = afp(aspect_ratios_data[pi]); | |||||
| box_w = min_size * sqrt(ar); | box_w = min_size * sqrt(ar); | ||||
| box_h = min_size / sqrt(ar); | box_h = min_size / sqrt(ar); | ||||
| @@ -116,7 +123,10 @@ void main() | |||||
| box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | box = (center + afpvec4(-box_w, -box_h, box_w, box_h) * afp(0.5f)) * image_norm; | ||||
| top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | ||||
| top_blob_data[var_offset] = variance; | |||||
| top_blob_data[var_offset].r = sfp(variances_0); | |||||
| top_blob_data[var_offset].g = sfp(variances_1); | |||||
| top_blob_data[var_offset].b = sfp(variances_2); | |||||
| top_blob_data[var_offset].a = sfp(variances_3); | |||||
| v_offset += 1; | v_offset += 1; | ||||
| var_offset += 1; | var_offset += 1; | ||||
| @@ -126,7 +136,10 @@ void main() | |||||
| box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; | box = (center + afpvec4(-box_h, -box_w, box_h, box_w) * afp(0.5f)) * image_norm; | ||||
| top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | top_blob_data[v_offset] = clip == 1 ? sfpvec4(clamp(box, afp(0.f), afp(1.f))) : sfpvec4(box); | ||||
| top_blob_data[var_offset] = variance; | |||||
| top_blob_data[var_offset].r = sfp(variances_0); | |||||
| top_blob_data[var_offset].g = sfp(variances_1); | |||||
| top_blob_data[var_offset].b = sfp(variances_2); | |||||
| top_blob_data[var_offset].a = sfp(variances_3); | |||||
| v_offset += 1; | v_offset += 1; | ||||
| var_offset += 1; | var_offset += 1; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -58,7 +61,7 @@ void main() | |||||
| afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); | afpvec4 center = afpvec4(center_x, center_y, center_x, center_y); | ||||
| // ratio = 1, various sizes | // ratio = 1, various sizes | ||||
| afp size = min_sizes_data[gx]; | |||||
| afp size = afp(min_sizes_data[gx]); | |||||
| afp cw = size * afp(p.h) / afp(p.w) / afp(2); | afp cw = size * afp(p.h) / afp(p.w) / afp(2); | ||||
| afp ch = size / afp(2); | afp ch = size / afp(2); | ||||
| @@ -69,7 +72,7 @@ void main() | |||||
| if (gx == num_sizes - 1) | if (gx == num_sizes - 1) | ||||
| { | { | ||||
| // various ratios, size = min_size = size[0] | // various ratios, size = min_size = size[0] | ||||
| afp size = min_sizes_data[0]; | |||||
| afp size = afp(min_sizes_data[0]); | |||||
| for (int pi = 1; pi < num_ratios; pi++) | for (int pi = 1; pi < num_ratios; pi++) | ||||
| { | { | ||||
| afp ratio = sqrt(afp(aspect_ratios_data[pi])); | afp ratio = sqrt(afp(aspect_ratios_data[pi])); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| if (slope == 0) | if (slope == 0) | ||||
| v = max(v, afp(0.f)); | v = max(v, afp(0.f)); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| if (slope == 0) | if (slope == 0) | ||||
| v = max(v, afp(0.f)); | v = max(v, afp(0.f)); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -58,7 +61,5 @@ void main() | |||||
| int v_offset = z * p.cstep + y * p.w + x; | int v_offset = z * p.cstep + y * p.w + x; | ||||
| sfp v = bottom_blob_data[v_offset]; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -60,12 +63,10 @@ void main() | |||||
| ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; | ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; | ||||
| sfpvec4 v; | |||||
| v.r = bottom_blob_data[v_offset.r]; | |||||
| v.g = bottom_blob_data[v_offset.g]; | |||||
| v.b = bottom_blob_data[v_offset.b]; | |||||
| v.a = bottom_blob_data[v_offset.a]; | |||||
| int gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| top_blob_data[gi].r = bottom_blob_data[v_offset.r]; | |||||
| top_blob_data[gi].g = bottom_blob_data[v_offset.g]; | |||||
| top_blob_data[gi].b = bottom_blob_data[v_offset.b]; | |||||
| top_blob_data[gi].a = bottom_blob_data[v_offset.a]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -63,33 +66,25 @@ void main() | |||||
| ivec4 lane4 = z4 % 4; | ivec4 lane4 = z4 % 4; | ||||
| // v = v4[lane] | // v = v4[lane] | ||||
| sfpvec4 v; | |||||
| sfpvec4 v4; | |||||
| v4 = bottom_blob_data[v_offset.r]; | |||||
| if (lane4.r == 0) v.r = v4.r; | |||||
| else if (lane4.r == 1) v.r = v4.g; | |||||
| else if (lane4.r == 2) v.r = v4.b; | |||||
| else /* if (lane4.r == 3) */ v.r = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.g]; | |||||
| if (lane4.g == 0) v.g = v4.r; | |||||
| else if (lane4.g == 1) v.g = v4.g; | |||||
| else if (lane4.g == 2) v.g = v4.b; | |||||
| else /* if (lane4.g == 3) */ v.g = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.b]; | |||||
| if (lane4.b == 0) v.b = v4.r; | |||||
| else if (lane4.b == 1) v.b = v4.g; | |||||
| else if (lane4.b == 2) v.b = v4.b; | |||||
| else /* if (lane4.b == 3) */ v.b = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.a]; | |||||
| if (lane4.a == 0) v.a = v4.r; | |||||
| else if (lane4.a == 1) v.a = v4.g; | |||||
| else if (lane4.a == 2) v.a = v4.b; | |||||
| else /* if (lane4.a == 3) */ v.a = v4.a; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| int gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; | |||||
| else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; | |||||
| else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; | |||||
| else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; | |||||
| if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r; | |||||
| else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; | |||||
| else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; | |||||
| else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; | |||||
| if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; | |||||
| else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; | |||||
| else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b; | |||||
| else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; | |||||
| if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; | |||||
| else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; | |||||
| else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; | |||||
| else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -65,9 +68,19 @@ void main() | |||||
| int v_offset = z * p.cstep + y * p.w + x; | int v_offset = z * p.cstep + y * p.w + x; | ||||
| sfp v = bottom_blob_data[v_offset]; | |||||
| if (ndim == 1) top_blob_data[gx] = v; | |||||
| if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; | |||||
| if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| int gi; | |||||
| if (ndim == 1) | |||||
| { | |||||
| gi = gx; | |||||
| } | |||||
| if (ndim == 2) | |||||
| { | |||||
| gi = gy * p.outw + gx; | |||||
| } | |||||
| if (ndim == 3) | |||||
| { | |||||
| gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| } | |||||
| top_blob_data[gi] = bottom_blob_data[v_offset]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -65,13 +68,22 @@ void main() | |||||
| ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; | ivec4 v_offset = z4 * p.cstep + y4 * p.w + x4; | ||||
| sfpvec4 v; | |||||
| v.r = bottom_blob_data[v_offset.r]; | |||||
| v.g = bottom_blob_data[v_offset.g]; | |||||
| v.b = bottom_blob_data[v_offset.b]; | |||||
| v.a = bottom_blob_data[v_offset.a]; | |||||
| if (ndim == 1) top_blob_data[gx] = v; | |||||
| if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; | |||||
| if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| int gi; | |||||
| if (ndim == 1) | |||||
| { | |||||
| gi = gx; | |||||
| } | |||||
| if (ndim == 2) | |||||
| { | |||||
| gi = gy * p.outw + gx; | |||||
| } | |||||
| if (ndim == 3) | |||||
| { | |||||
| gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| } | |||||
| top_blob_data[gi].r = bottom_blob_data[v_offset.r]; | |||||
| top_blob_data[gi].g = bottom_blob_data[v_offset.g]; | |||||
| top_blob_data[gi].b = bottom_blob_data[v_offset.b]; | |||||
| top_blob_data[gi].a = bottom_blob_data[v_offset.a]; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -85,35 +88,37 @@ void main() | |||||
| lane4 = z4 % 4; | lane4 = z4 % 4; | ||||
| } | } | ||||
| sfpvec4 v; | |||||
| sfpvec4 v4; | |||||
| v4 = bottom_blob_data[v_offset.r]; | |||||
| if (lane4.r == 0) v.r = v4.r; | |||||
| else if (lane4.r == 1) v.r = v4.g; | |||||
| else if (lane4.r == 2) v.r = v4.b; | |||||
| else /* if (lane4.r == 3) */ v.r = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.g]; | |||||
| if (lane4.g == 0) v.g = v4.r; | |||||
| else if (lane4.g == 1) v.g = v4.g; | |||||
| else if (lane4.g == 2) v.g = v4.b; | |||||
| else /* if (lane4.g == 3) */ v.g = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.b]; | |||||
| if (lane4.b == 0) v.b = v4.r; | |||||
| else if (lane4.b == 1) v.b = v4.g; | |||||
| else if (lane4.b == 2) v.b = v4.b; | |||||
| else /* if (lane4.b == 3) */ v.b = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.a]; | |||||
| if (lane4.a == 0) v.a = v4.r; | |||||
| else if (lane4.a == 1) v.a = v4.g; | |||||
| else if (lane4.a == 2) v.a = v4.b; | |||||
| else /* if (lane4.a == 3) */ v.a = v4.a; | |||||
| if (ndim == 1) top_blob_data[gx] = v; | |||||
| if (ndim == 2) top_blob_data[gy * p.outw + gx] = v; | |||||
| if (ndim == 3) top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| int gi; | |||||
| if (ndim == 1) | |||||
| { | |||||
| gi = gx; | |||||
| } | |||||
| if (ndim == 2) | |||||
| { | |||||
| gi = gy * p.outw + gx; | |||||
| } | |||||
| if (ndim == 3) | |||||
| { | |||||
| gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| } | |||||
| if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; | |||||
| else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; | |||||
| else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; | |||||
| else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; | |||||
| if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r; | |||||
| else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; | |||||
| else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; | |||||
| else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; | |||||
| if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; | |||||
| else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; | |||||
| else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b; | |||||
| else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; | |||||
| // output component .a: pick the source component selected by lane4.a from the texel at v_offset.a | |||||
| if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; | |||||
| else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; | |||||
| else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; | |||||
| else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -81,10 +84,10 @@ void main() | |||||
| v_offset = z4 * p.outcstep + y4 * p.outw + x4; | v_offset = z4 * p.outcstep + y4 * p.outw + x4; | ||||
| } | } | ||||
| sfpvec4 v = bottom_blob_data[gz * p.cstep + gy * p.w + gx]; | |||||
| int gi = gz * p.cstep + gy * p.w + gx; | |||||
| top_blob_data[v_offset.r] = v.r; | |||||
| top_blob_data[v_offset.g] = v.g; | |||||
| top_blob_data[v_offset.b] = v.b; | |||||
| top_blob_data[v_offset.a] = v.a; | |||||
| top_blob_data[v_offset.r] = bottom_blob_data[gi].r; | |||||
| top_blob_data[v_offset.g] = bottom_blob_data[gi].g; | |||||
| top_blob_data[v_offset.b] = bottom_blob_data[gi].b; | |||||
| top_blob_data[v_offset.a] = bottom_blob_data[gi].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -48,7 +51,7 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afp v = bottom_top_blob_data[gx]; | |||||
| afp v = afp(bottom_top_blob_data[gx]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afp(scale_blob_data[gx]) * v + afp(bias_blob_data[gx]); | v = afp(scale_blob_data[gx]) * v + afp(bias_blob_data[gx]); | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afp(scale_blob_data[gy]) * v + afp(bias_blob_data[gy]); | v = afp(scale_blob_data[gy]) * v + afp(bias_blob_data[gy]); | ||||
| @@ -80,7 +83,7 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afp(scale_blob_data[gz]) * v + afp(bias_blob_data[gz]); | v = afp(scale_blob_data[gz]) * v + afp(bias_blob_data[gz]); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -48,7 +51,7 @@ void main() | |||||
| if (p.dims == 1) | if (p.dims == 1) | ||||
| { | { | ||||
| afpvec4 v = bottom_top_blob_data[gx]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gx]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afpvec4(scale_blob_data[gx]) * v + afpvec4(bias_blob_data[gx]); | v = afpvec4(scale_blob_data[gx]) * v + afpvec4(bias_blob_data[gx]); | ||||
| @@ -64,7 +67,7 @@ void main() | |||||
| { | { | ||||
| const int gi = gy * p.w + gx; | const int gi = gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afpvec4(scale_blob_data[gy]) * v + afpvec4(bias_blob_data[gy]); | v = afpvec4(scale_blob_data[gy]) * v + afpvec4(bias_blob_data[gy]); | ||||
| @@ -80,7 +83,7 @@ void main() | |||||
| { | { | ||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| if (bias_term == 1) | if (bias_term == 1) | ||||
| v = afpvec4(scale_blob_data[gz]) * v + afpvec4(bias_blob_data[gz]); | v = afpvec4(scale_blob_data[gz]) * v + afpvec4(bias_blob_data[gz]); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -62,33 +65,25 @@ void main() | |||||
| ivec4 lane4 = z4 % 4; | ivec4 lane4 = z4 % 4; | ||||
| // v = v4[lane] | // v = v4[lane] | ||||
| sfpvec4 v; | |||||
| sfpvec4 v4; | |||||
| v4 = bottom_blob_data[v_offset.r]; | |||||
| if (lane4.r == 0) v.r = v4.r; | |||||
| else if (lane4.r == 1) v.r = v4.g; | |||||
| else if (lane4.r == 2) v.r = v4.b; | |||||
| else /* if (lane4.r == 3) */ v.r = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.g]; | |||||
| if (lane4.g == 0) v.g = v4.r; | |||||
| else if (lane4.g == 1) v.g = v4.g; | |||||
| else if (lane4.g == 2) v.g = v4.b; | |||||
| else /* if (lane4.g == 3) */ v.g = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.b]; | |||||
| if (lane4.b == 0) v.b = v4.r; | |||||
| else if (lane4.b == 1) v.b = v4.g; | |||||
| else if (lane4.b == 2) v.b = v4.b; | |||||
| else /* if (lane4.b == 3) */ v.b = v4.a; | |||||
| v4 = bottom_blob_data[v_offset.a]; | |||||
| if (lane4.a == 0) v.a = v4.r; | |||||
| else if (lane4.a == 1) v.a = v4.g; | |||||
| else if (lane4.a == 2) v.a = v4.b; | |||||
| else /* if (lane4.a == 3) */ v.a = v4.a; | |||||
| top_blob_data[gz * p.outcstep + gy * p.outw + gx] = v; | |||||
| int gi = gz * p.outcstep + gy * p.outw + gx; | |||||
| if (lane4.r == 0) top_blob_data[gi].r = bottom_blob_data[v_offset.r].r; | |||||
| else if (lane4.r == 1) top_blob_data[gi].r = bottom_blob_data[v_offset.r].g; | |||||
| else if (lane4.r == 2) top_blob_data[gi].r = bottom_blob_data[v_offset.r].b; | |||||
| else /* if (lane4.r == 3) */ top_blob_data[gi].r = bottom_blob_data[v_offset.r].a; | |||||
| if (lane4.g == 0) top_blob_data[gi].g = bottom_blob_data[v_offset.g].r; | |||||
| else if (lane4.g == 1) top_blob_data[gi].g = bottom_blob_data[v_offset.g].g; | |||||
| else if (lane4.g == 2) top_blob_data[gi].g = bottom_blob_data[v_offset.g].b; | |||||
| else /* if (lane4.g == 3) */ top_blob_data[gi].g = bottom_blob_data[v_offset.g].a; | |||||
| if (lane4.b == 0) top_blob_data[gi].b = bottom_blob_data[v_offset.b].r; | |||||
| else if (lane4.b == 1) top_blob_data[gi].b = bottom_blob_data[v_offset.b].g; | |||||
| else if (lane4.b == 2) top_blob_data[gi].b = bottom_blob_data[v_offset.b].b; | |||||
| else /* if (lane4.b == 3) */ top_blob_data[gi].b = bottom_blob_data[v_offset.b].a; | |||||
| if (lane4.a == 0) top_blob_data[gi].a = bottom_blob_data[v_offset.a].r; | |||||
| else if (lane4.a == 1) top_blob_data[gi].a = bottom_blob_data[v_offset.a].g; | |||||
| else if (lane4.a == 2) top_blob_data[gi].a = bottom_blob_data[v_offset.a].b; | |||||
| else /* if (lane4.a == 3) */ top_blob_data[gi].a = bottom_blob_data[v_offset.a].a; | |||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| v = afp(1.f) / (afp(1.f) + exp(-v)); | v = afp(1.f) / (afp(1.f) + exp(-v)); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| v = afp(1.f) / (afp(1.f) + exp(-v)); | v = afp(1.f) / (afp(1.f) + exp(-v)); | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -53,8 +56,8 @@ void main() | |||||
| if (p.dims == 1) // axis == 0 | if (p.dims == 1) // axis == 0 | ||||
| { | { | ||||
| afp sum = sum_workspace_data[0]; | |||||
| afp v = bottom_top_blob_data[gx]; | |||||
| afp sum = afp(sum_workspace_data[0]); | |||||
| afp v = afp(bottom_top_blob_data[gx]); | |||||
| bottom_top_blob_data[gx] = sfp(v / sum); | bottom_top_blob_data[gx] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -62,8 +65,8 @@ void main() | |||||
| if (p.dims == 2 && axis == 0) | if (p.dims == 2 && axis == 0) | ||||
| { | { | ||||
| int gi = gy * p.w + gx; | int gi = gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gx]; | |||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gx]); | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(v / sum); | bottom_top_blob_data[gi] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -71,8 +74,8 @@ void main() | |||||
| if (p.dims == 2 && axis == 1) | if (p.dims == 2 && axis == 1) | ||||
| { | { | ||||
| int gi = gy * p.w + gx; | int gi = gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gy]; | |||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gy]); | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(v / sum); | bottom_top_blob_data[gi] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -80,8 +83,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 0) | if (p.dims == 3 && axis == 0) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gy * p.w + gx]; | |||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gy * p.w + gx]); | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(v / sum); | bottom_top_blob_data[gi] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -89,8 +92,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 1) | if (p.dims == 3 && axis == 1) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gz * p.w + gx]; | |||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gz * p.w + gx]); | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(v / sum); | bottom_top_blob_data[gi] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -98,8 +101,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 2) | if (p.dims == 3 && axis == 2) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gz * p.h + gy]; | |||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gz * p.h + gy]); | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(v / sum); | bottom_top_blob_data[gi] = sfp(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -53,8 +56,8 @@ void main() | |||||
| if (p.dims == 1) // axis == 0 | if (p.dims == 1) // axis == 0 | ||||
| { | { | ||||
| afp sum = sum_workspace_data[0]; | |||||
| afpvec4 v = bottom_top_blob_data[gx]; | |||||
| afp sum = afp(sum_workspace_data[0]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gx]); | |||||
| bottom_top_blob_data[gx] = sfpvec4(v / sum); | bottom_top_blob_data[gx] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -62,8 +65,8 @@ void main() | |||||
| if (p.dims == 2 && axis == 0) | if (p.dims == 2 && axis == 0) | ||||
| { | { | ||||
| int gi = gy * p.w + gx; | int gi = gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gx]; | |||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gx]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | bottom_top_blob_data[gi] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -71,8 +74,8 @@ void main() | |||||
| if (p.dims == 2 && axis == 1) | if (p.dims == 2 && axis == 1) | ||||
| { | { | ||||
| int gi = gy * p.w + gx; | int gi = gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gy]; | |||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gy]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | bottom_top_blob_data[gi] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -80,8 +83,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 0) | if (p.dims == 3 && axis == 0) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gy * p.w + gx]; | |||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gy * p.w + gx]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | bottom_top_blob_data[gi] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -89,8 +92,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 1) | if (p.dims == 3 && axis == 1) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gz * p.w + gx]; | |||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gz * p.w + gx]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | bottom_top_blob_data[gi] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -98,8 +101,8 @@ void main() | |||||
| if (p.dims == 3 && axis == 2) | if (p.dims == 3 && axis == 2) | ||||
| { | { | ||||
| int gi = gz * p.cstep + gy * p.w + gx; | int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp sum = sum_workspace_data[gz * p.h + gy]; | |||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afp sum = afp(sum_workspace_data[gz * p.h + gy]); | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(v / sum); | bottom_top_blob_data[gi] = sfpvec4(v / sum); | ||||
| return; | return; | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfp(tanh(v)); | bottom_top_blob_data[gi] = sfp(tanh(v)); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -44,7 +47,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| bottom_top_blob_data[gi] = sfpvec4(tanh(v)); | bottom_top_blob_data[gi] = sfpvec4(tanh(v)); | ||||
| } | } | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afp v = bottom_top_blob_data[gi]; | |||||
| afp v = afp(bottom_top_blob_data[gi]); | |||||
| afp res; | afp res; | ||||
| @@ -14,7 +14,10 @@ | |||||
| #version 450 | #version 450 | ||||
| #if NCNN_fp16_storage || NCNN_fp16_arithmetic | |||||
| #if NCNN_fp16_storage | |||||
| #extension GL_EXT_shader_16bit_storage: require | |||||
| #endif | |||||
| #if NCNN_fp16_arithmetic | |||||
| #extension GL_AMD_gpu_shader_half_float: require | #extension GL_AMD_gpu_shader_half_float: require | ||||
| #endif | #endif | ||||
| @@ -46,7 +49,7 @@ void main() | |||||
| const int gi = gz * p.cstep + gy * p.w + gx; | const int gi = gz * p.cstep + gy * p.w + gx; | ||||
| afpvec4 v = bottom_top_blob_data[gi]; | |||||
| afpvec4 v = afpvec4(bottom_top_blob_data[gi]); | |||||
| afpvec4 res; | afpvec4 res; | ||||