Fix adaptive average pooling accumulation overflow in Vulkan when using fp16 arithmetic by accumulating in fp32 (#2698)

5 years ago · 41fba71fa0
--- a/src/layer/vulkan/shader/pooling_adaptive.comp
+++ b/src/layer/vulkan/shader/pooling_adaptive.comp
@@ -111,7 +111,7 @@ void main()
    }
    if (pooling_type == 1)
    {
        res = afp(0.f);
        float res_fp32 = 0.f;  // force accumulation in fp32
        int area = 0;

 #if NCNN_image_shader
@@ -119,7 +119,7 @@ void main()
        {
            for (int x = 0; x < kernel_w; x++)
            {
                res += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz));
                res_fp32 += image3d_ld1(bottom_blob, ivec3(sx + x, sy + y, gz));
                area += 1;
            }
        }
@@ -130,7 +130,7 @@ void main()
        {
            for (int x = 0; x < kernel_w; x++)
            {
                res += buffer_ld1(bottom_blob_data, v_offset + x);
                res_fp32 += buffer_ld1(bottom_blob_data, v_offset + x);
                area += 1;
            }

@@ -138,7 +138,8 @@ void main()
        }
 #endif

        res /= afp(area);
        res_fp32 /= float(area);
        res = afp(res_fp32);  // cast to fp16 if possible
    }

 #if NCNN_image_shader
--- a/src/layer/vulkan/shader/pooling_adaptive_pack4.comp
+++ b/src/layer/vulkan/shader/pooling_adaptive_pack4.comp
@@ -111,7 +111,7 @@ void main()
    }
    else if (pooling_type == 1)
    {
        res = afpvec4(0.f);
        vec4 res_fp32 = vec4(0.f);  // force accumulation in fp32
        int area = 0;

 #if NCNN_image_shader
@@ -119,7 +119,7 @@ void main()
        {
            for (int x = 0; x < kernel_w; x++)
            {
                res += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz));
                res_fp32 += image3d_ld4(bottom_blob, ivec3(sx + x, sy + y, gz));
                area += 1;
            }
        }
@@ -130,7 +130,7 @@ void main()
        {
            for (int x = 0; x < kernel_w; x++)
            {
                res += buffer_ld4(bottom_blob_data, v_offset + x);
                res_fp32 += buffer_ld4(bottom_blob_data, v_offset + x);
                area += 1;
            }

@@ -138,7 +138,8 @@ void main()
        }
 #endif

        res /= afp(area);
        res_fp32 /= float(area);
        res = afpvec4(res_fp32);  // cast to fp16 if possible
    }

 #if NCNN_image_shader
--- a/src/layer/vulkan/shader/pooling_adaptive_pack8.comp
+++ b/src/layer/vulkan/shader/pooling_adaptive_pack8.comp
@@ -114,7 +114,7 @@ void main()
    }
    else if (pooling_type == 1)
    {
        res = afpvec8(afpvec4(0.f), afpvec4(0.f));
        mat2x4 res_fp32 = mat2x4(vec4(0.f), vec4(0.f));  // force accumulation in fp32
        int area = 0;
        
 #if NCNN_image_shader
@@ -123,8 +123,8 @@ void main()
            for (int x = 0; x < kernel_w; x++)
            {
                afpvec8 v = image3d_ld8(bottom_blob, ivec3(sx + x, sy + y, gz));
                res[0] += v[0];
                res[1] += v[1];
                res_fp32[0] += v[0];
                res_fp32[1] += v[1];
                area += 1;
            }
        }
@@ -136,8 +136,8 @@ void main()
            for (int x = 0; x < kernel_w; x++)
            {
                afpvec8 v = buffer_ld8(bottom_blob_data, v_offset + x);
                res[0] += v[0];
                res[1] += v[1];
                res_fp32[0] += v[0];
                res_fp32[1] += v[1];
                area += 1;
            }

@@ -145,8 +145,9 @@ void main()
        }
 #endif

        res[0] /= afp(area);
        res[1] /= afp(area);
        res_fp32[0] /= float(area);
        res_fp32[1] /= float(area);
        res = afpvec8(res_fp32);  // cast to fp16 if possible
    }

 #if NCNN_image_shader