fix: code style

10 months ago · 7f218e0cd1
--- a/src/layer/vulkan/layernorm_vulkan.cpp
+++ b/src/layer/vulkan/layernorm_vulkan.cpp
@@ -16,6 +16,7 @@ namespace ncnn {
 // =================================================================================================
 static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt)
 {
    // NOTE(review): this unconditional return exits before any other statement
    // runs, so everything below it in this function is unreachable.
    return;
    if (m.empty())
    {
        printf("--- %s ---\n", name);
@@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
    return 0;
 }

 int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
 int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const
 {
    int elemsize_bak = _bottom_top_blob.elemsize;
    VkMat bottom_top_blob;
    vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt);

    int w = bottom_top_blob.w;
    int h = bottom_top_blob.h;
    int channels = bottom_top_blob.c;
@@ -429,10 +434,9 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
    coeff_bindings[1] = mean_workspace;
    coeff_bindings[2] = var_workspace;

    std::vector<vk_constant_type> coeff_constants(3);
    coeff_constants[0].i = 1;
    coeff_constants[1].i = num_groups_per_channel;
    coeff_constants[2].i = channels;
    std::vector<vk_constant_type> coeff_constants(2);
    coeff_constants[0].i = num_groups_per_channel;
    coeff_constants[1].i = channels;

    VkMat dispatcher_coeffs;
    dispatcher_coeffs.w = 1;
@@ -462,6 +466,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
    print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt);
    // ===============================================

    // Convert the working blob back into the caller's blob.
    // NOTE(review): the third argument is elemsize_bak, which was captured from
    // _bottom_top_blob.elemsize at the top of this function, while the matching
    // convert_packing call there passes the literal 1 in the same position.
    // Confirm that an element size is really what this parameter expects; if it
    // is a packing count, the original elempack should be saved and used instead.
    vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt);

    return 0;
 }

--- a/src/layer/vulkan/shader/layernorm_coeffs.comp
+++ b/src/layer/vulkan/shader/layernorm_coeffs.comp
@@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
 layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
 layout (binding = 2) readonly buffer var_blob { float var_data[]; };

 layout (push_constant) uniform parameter {
    int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x
 layout (push_constant) uniform parameter
 {
    int num_groups_per_channel;
    int num_channels;
    int c;
 } p;

 void main() {
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);
    if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return;
    if (gx >= 1 ||gy >= p.num_groups_per_channel || gz >= p.c)
        return;

    int group_id = gz * p.num_groups_per_channel + gy;

@@ -32,4 +35,4 @@ void main() {

    buffer_st1(coeffs_blob_data, group_id * 2, afp(a));
    buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b));
 }
 }
--- a/src/layer/vulkan/shader/layernorm_norm.comp
+++ b/src/layer/vulkan/shader/layernorm_norm.comp
@@ -3,12 +3,10 @@

 #version 450

 // This specialization constant is now used to control the affine transformation
 layout (constant_id = 0) const int affine = 0;

 layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
 // Separate bindings for gamma and beta if affine is enabled
 layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; };
 layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; };

@@ -27,47 +25,38 @@ void main()
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Boundary check against the original tensor dimensions
    if (gx >= p.w || gy >= p.h || gz >= p.c)
    return;
        return;

    int group_id;
    int inner_id;

    // Determine the group ID and the element's ID within that group.
    // This logic correctly maps each invocation to its normalization group.
    if (p.affine_size == p.w)
    {
        // Normalization is performed per row
        group_id = gz * p.h + gy;
        inner_id = gx;
    }
    else // if (p.affine_size == p.w * p.h)
    {
        // Normalization is performed per channel
        group_id = gz;
        inner_id = gy * p.w + gx;
    }

    // Fetch the pre-calculated normalization coefficients a and b.
    // There is one (a, b) pair per group.
    afp a = buffer_ld1(coeffs_blob_data, group_id * 2);
    afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1);

    // Calculate the correct linear index for the element, respecting cstep.
    int linear_index = gz * p.cstep + gy * p.w + gx;
    afp v = buffer_ld1(bottom_top_blob_data, linear_index);

    // Apply the base normalization: (x - mean) / sqrt(var + eps)
    // (x - mean) / sqrt(var + eps)
    v = v * a + b;

    // Apply the learned affine transformation if enabled
    if (affine == 1) {
    if (affine == 1)
    {
        afp gamma = buffer_ld1(gamma_data, inner_id);
        afp beta  = buffer_ld1(beta_data, inner_id);
        v = v * gamma + beta;
    }

    // Write the final result back to the same location
    buffer_st1(bottom_top_blob_data, linear_index, v);
 }
--- a/src/layer/vulkan/shader/layernorm_reduce_mean.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_mean.comp
@@ -6,25 +6,30 @@
 // Input: float partial sums (read-only). Output: one float mean per group (write-only).
 layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; };
 layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; };

 // NOTE(review): this listing interleaves the pre-change and post-change lines
 // of the hunk, so the push-constant block and main() each appear twice.
 layout (push_constant) uniform parameter {
    int w; int h; int c; int cstep;
 layout (push_constant) uniform parameter
 {
    int w;
    int h;
    int c;
    int cstep;
    // Divisor applied to the accumulated sum; passed as a float.
    float group_size;
 } p;

 void main() {
    // Each invocation calculates the mean for one group, identified by (gy, gz)
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);
    if (gy >= p.h || gz >= p.c) return;
    // Only the gx == 0 invocation of an in-range (gy, gz) pair continues.
    if (gx >= 1 || gy >= p.h || gz >= p.c)
    return;

    float sum = 0.f;
    // Start of this group's row in sum_blob: channel stride is cstep, row stride is w.
    int v_offset = gz * p.cstep + gy * p.w;
    // Accumulate the p.w consecutive values of that row.
    for (int i = 0; i < p.w; i++) {
    for (int i = 0; i < p.w; i++)
    {
        sum += sum_blob_data[v_offset + i];
    }

    // Output is a linear buffer indexed by the flattened group_id
    int group_id = gz * p.h + gy;
    mean_data[group_id] = sum / p.group_size;
 }
 }
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
@@ -3,50 +3,69 @@

 #version 450

 // This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
 // Each invocation adds up to four consecutive sfp input values of one row,
 // converting each to float, and stores a single float partial sum.
 // NOTE(review): this listing interleaves the pre-change lines (input_data,
 // output_data, r_offset) with the post-change lines (bottom_top_blob_data,
 // sum_blob_data, v_offset) of the hunk.
 layout (binding = 0) readonly buffer input_blob { sfp input_data[]; };
 layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
 layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };

 layout (push_constant) uniform parameter {
    int w; int h; int c; int cstep;
    int outw; int outh; int outc; int outcstep;
 layout (push_constant) uniform parameter
 {
    // Input dimensions and channel stride.
    int w;
    int h;
    int c;
    int cstep;

    // Output dimensions and channel stride.
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 void main() {
    // Global invocation IDs map to the output buffer dimensions
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent do nothing.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    float sum;

    // sx is the starting element index for reduction within a group
    int sx = gx * 4;

    // Correctly calculate the base offset for the group in the input tensor.
    // gz * p.cstep -> Jumps to the start of the correct channel plane.
    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
    // The stride between rows is p.w (the width of a row).
    int base_offset = gz * p.cstep + gy * p.w;
    int v_offset = gz * p.cstep + gy * p.w + sx;

    float sum;
    int r_offset = base_offset + sx;

    // Pre-change tail handling: zero when sx is past the row, otherwise sum the
    // 1, 2, 3 or 4 values that remain in the row.
    if (sx >= p.w - 3) {
        if (sx >= p.w) {
            sum = 0.0f;
        } else if (sx == p.w - 1) {
            sum = float(buffer_ld1(input_data, r_offset));
        } else if (sx == p.w - 2) {
            sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1));
        } else { // sx == p.w - 3
                 sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2));
        }
    } else {
        sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3));
    // Post-change tail handling starts here.
    // NOTE(review): unlike the chain above, this chain has no sx >= p.w case, so
    // such an invocation would fall into the final else and load four values.
    // Presumably the gx < p.outw guard rules that out (e.g. outw == ceil(w / 4))
    // -- confirm against the host-side dispatch.
    if (sx == p.w - 1)
    {
        // One value left in the row.
        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));

        sum = v0;
    }
    else if (sx == p.w - 2)
    {
        // Two values left in the row.
        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));

        sum = v0 + v1;
    }
    else if (sx == p.w - 3)
    {
        // Three values left in the row.
        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));

        sum = v0 + v1 + v2;
    }
    else
    {
        // Every other sx: load and add four consecutive values.
        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
        float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3));

        sum = v0 + v1 + v2 + v3;
    }


    // Output index is a packed 3D index
    int out_index = gz * p.outcstep + gy * p.outw + gx;
    output_data[out_index] = sum;
 }
    // Output element index: channel stride outcstep, row stride outw.
    int gi = gz * p.outcstep + gy * p.outw + gx;
    sum_blob_data[gi] = sum;
 }
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
@@ -3,50 +3,68 @@

 #version 450

 // This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
 // Each invocation adds up to four consecutive float input values of one row
 // and stores a single float partial sum.
 // NOTE(review): this listing interleaves the pre-change lines (input_data,
 // output_data, r_offset) with the post-change lines (bottom_top_blob_data,
 // sum_blob_data, v_offset) of the hunk.
 layout (binding = 0) readonly buffer input_blob { float input_data[]; };
 layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
 layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; };
 layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };

 layout (push_constant) uniform parameter {
    int w; int h; int c; int cstep;
    int outw; int outh; int outc; int outcstep;
 layout (push_constant) uniform parameter
 {
    // Input dimensions and channel stride.
    int w;
    int h;
    int c;
    int cstep;

    // Output dimensions and channel stride.
    int outw;
    int outh;
    int outc;
    int outcstep;
 } p;

 void main() {
    // Global invocation IDs map to the output buffer dimensions
 void main()
 {
    int gx = int(gl_GlobalInvocationID.x);
    int gy = int(gl_GlobalInvocationID.y);
    int gz = int(gl_GlobalInvocationID.z);

    // Invocations outside the output extent do nothing.
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
        return;

    float sum;

    // sx is the starting element index for reduction within a group
    int sx = gx * 4;

    // Correctly calculate the base offset for the group in the input tensor.
    // gz * p.cstep -> Jumps to the start of the correct channel plane.
    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
    // The stride between rows is p.w (the width of a row).
    int base_offset = gz * p.cstep + gy * p.w;
    int v_offset = gz * p.cstep + gy * p.w + sx;

    float sum;
    int r_offset = base_offset + sx;

    // Pre-change tail handling: zero when sx is past the row, otherwise sum the
    // 1, 2, 3 or 4 values that remain in the row.
    if (sx >= p.w - 3) {
        if (sx >= p.w) {
            sum = 0.0f;
        } else if (sx == p.w - 1) {
            sum = input_data[r_offset];
        } else if (sx == p.w - 2) {
            sum = input_data[r_offset] + input_data[r_offset + 1];
        } else { // sx == p.w - 3
                 sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2];
        }
    } else {
        sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3];
    // Post-change tail handling starts here; it keeps the sx >= p.w zero case.
    if (sx >= p.w)
    {
        // Start index is past the end of the row: nothing to add.
        sum = 0.0f;
    }
    else if (sx == p.w - 1)
    {
        // One value left in the row.
        float v0 = bottom_top_blob_data[v_offset];
        sum = v0;
    }
    else if (sx == p.w - 2)
    {
        // Two values left in the row.
        float v0 = bottom_top_blob_data[v_offset];
        float v1 = bottom_top_blob_data[v_offset + 1];
        sum = v0 + v1;
    }
    else if (sx == p.w - 3)
    {
        // Three values left in the row.
        float v0 = bottom_top_blob_data[v_offset];
        float v1 = bottom_top_blob_data[v_offset + 1];
        float v2 = bottom_top_blob_data[v_offset + 2];
        sum = v0 + v1 + v2;
    }
    else
    {
        // At least four values remain: add four consecutive values.
        float v0 = bottom_top_blob_data[v_offset];
        float v1 = bottom_top_blob_data[v_offset + 1];
        float v2 = bottom_top_blob_data[v_offset + 2];
        float v3 = bottom_top_blob_data[v_offset + 3];
        sum = v0 + v1 + v2 + v3;
    }

    // Output index is a packed 3D index
    int out_index = gz * p.outcstep + gy * p.outw + gx;
    output_data[out_index] = sum;
 }
    // Output element index: channel stride outcstep, row stride outw.
    int gi = gz * p.outcstep + gy * p.outw + gx;
    sum_blob_data[gi] = sum;
 }
--- a/src/layer/vulkan/shader/layernorm_sub_mean_square.comp
+++ b/src/layer/vulkan/shader/layernorm_sub_mean_square.comp
@@ -5,7 +5,7 @@

 layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
 // The output buffer's type must match the input type if the framework reuses the memory type

 layout (binding = 2) writeonly buffer square_blob { sfp square_blob_data[]; };

 layout (push_constant) uniform parameter {
@@ -40,10 +40,9 @@ void main() {

    int linear_index = gz * p.cstep + gy * p.w + gx;

    // Perform calculation in fp32 for precision
    float v = float(buffer_ld1(bottom_top_blob_data, linear_index));

    v = v - mean;
    // Write back in the native storage format (sfp)

    buffer_st1(square_blob_data, linear_index, afp(v * v));
 }