From 7f218e0cd11d1cbc6ff52ef078ea4a7d68bbc877 Mon Sep 17 00:00:00 2001
From: ice <1391525377@qq.com>
Date: Wed, 6 Aug 2025 22:47:54 +0800
Subject: [PATCH] fix: layernorm vulkan code style, convert input to elempack 1

---
 src/layer/vulkan/layernorm_vulkan.cpp         | 16 ++--
 src/layer/vulkan/shader/layernorm_coeffs.comp | 15 ++--
 src/layer/vulkan/shader/layernorm_norm.comp   | 19 +---
 .../vulkan/shader/layernorm_reduce_mean.comp  | 23 +++--
 .../layernorm_reduce_sum4_fp16_to_fp32.comp   | 87 +++++++++++--------
 .../shader/layernorm_reduce_sum4_fp32.comp    | 86 ++++++++++--------
 .../shader/layernorm_sub_mean_square.comp     |  5 +-
 7 files changed, 145 insertions(+), 106 deletions(-)

diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp
index a226c4bbd..24a877f37 100644
--- a/src/layer/vulkan/layernorm_vulkan.cpp
+++ b/src/layer/vulkan/layernorm_vulkan.cpp
@@ -16,6 +16,7 @@ namespace ncnn {
 // =================================================================================================
 static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt)
 {
+    return; // NOTE(review): unconditional early return makes everything below unreachable; looks like a temporary switch-off of this debug print helper - confirm before merging
     if (m.empty())
     {
         printf("--- %s ---\n", name);
@@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     return 0;
 }
 
-int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const
 {
+    int elempack_bak = _bottom_top_blob.elempack; // caller's packing, restored by the convert_packing call before return
+    VkMat bottom_top_blob;
+    vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1, cmd, opt); // work on an elempack 1 copy; the layernorm shaders address single elements (buffer_ld1/buffer_st1)
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
@@ -429,10 +434,9 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
     coeff_bindings[1] = mean_workspace;
     coeff_bindings[2] = var_workspace;
 
-    std::vector<vk_constant_type> coeff_constants(3);
-    coeff_constants[0].i = 1;
-    coeff_constants[1].i = num_groups_per_channel;
-    coeff_constants[2].i = channels;
+    std::vector<vk_constant_type> coeff_constants(2);
+    coeff_constants[0].i = num_groups_per_channel;
+    coeff_constants[1].i = channels;
 
     VkMat dispatcher_coeffs;
     dispatcher_coeffs.w = 1;
@@ -462,6 +466,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
     print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt);
     // ===============================================
 
+    vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elempack_bak, cmd, opt); // third argument is the destination elempack, not the element size in bytes
+
     return 0;
 }
 
diff --git a/src/layer/vulkan/shader/layernorm_coeffs.comp b/src/layer/vulkan/shader/layernorm_coeffs.comp
index 7f189f389..1131a27e3 100644
--- a/src/layer/vulkan/shader/layernorm_coeffs.comp
+++ b/src/layer/vulkan/shader/layernorm_coeffs.comp
@@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
 layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
 layout (binding = 2) readonly buffer var_blob { float var_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x
+layout (push_constant) uniform parameter
+{
     int num_groups_per_channel;
-    int num_channels;
+    int c;
 } p;
 
-void main() {
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
-    if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return;
+    if (gx >= 1 || gy >= p.num_groups_per_channel || gz >= p.c) // one invocation per (gy, gz) group; any extra x invocation exits
+        return;
 
     int group_id = gz * p.num_groups_per_channel + gy;
 
@@ -32,4 +35,4 @@ void main() {
 
     buffer_st1(coeffs_blob_data, group_id * 2, afp(a));
     buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b));
-}
\ No newline at end of file
+}
diff --git a/src/layer/vulkan/shader/layernorm_norm.comp b/src/layer/vulkan/shader/layernorm_norm.comp
index 46b016069..d966c44fa 100644
--- a/src/layer/vulkan/shader/layernorm_norm.comp
+++ b/src/layer/vulkan/shader/layernorm_norm.comp
@@ -3,12 +3,10 @@
 
 #version 450
 
-// This specialization constant is now used to control the affine transformation
 layout (constant_id = 0) const int affine = 0;
 
 layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
-// Separate bindings for gamma and beta if affine is enabled
 layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; };
 layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; };
 
@@ -27,47 +25,38 @@ void main()
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    // Boundary check against the original tensor dimensions
     if (gx >= p.w || gy >= p.h || gz >= p.c)
-    return;
+        return;
 
     int group_id;
     int inner_id;
 
-    // Determine the group ID and the element's ID within that group.
-    // This logic correctly maps each invocation to its normalization group.
     if (p.affine_size == p.w)
     {
-        // Normalization is performed per row
         group_id = gz * p.h + gy;
         inner_id = gx;
     }
     else // if (p.affine_size == p.w * p.h)
     {
-        // Normalization is performed per channel
         group_id = gz;
         inner_id = gy * p.w + gx;
     }
 
-    // Fetch the pre-calculated normalization coefficients a and b.
-    // There is one (a, b) pair per group.
     afp a = buffer_ld1(coeffs_blob_data, group_id * 2);
     afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1);
 
-    // Calculate the correct linear index for the element, respecting cstep.
     int linear_index = gz * p.cstep + gy * p.w + gx;
     afp v = buffer_ld1(bottom_top_blob_data, linear_index);
 
-    // Apply the base normalization: (x - mean) / sqrt(var + eps)
+    // (x - mean) / sqrt(var + eps)
     v = v * a + b;
 
-    // Apply the learned affine transformation if enabled
-    if (affine == 1) {
+    if (affine == 1)
+    {
         afp gamma = buffer_ld1(gamma_data, inner_id);
         afp beta  = buffer_ld1(beta_data, inner_id);
         v = v * gamma + beta;
     }
 
-    // Write the final result back to the same location
     buffer_st1(bottom_top_blob_data, linear_index, v);
 }
diff --git a/src/layer/vulkan/shader/layernorm_reduce_mean.comp b/src/layer/vulkan/shader/layernorm_reduce_mean.comp
index 985061500..389e46e65 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_mean.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_mean.comp
@@ -6,25 +6,30 @@
 layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; };
 layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
     float group_size;
 } p;
 
-void main() {
-    // Each invocation calculates the mean for one group, identified by (gy, gz)
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
-    if (gy >= p.h || gz >= p.c) return;
+    if (gx >= 1 || gy >= p.h || gz >= p.c) // one invocation per (gy, gz) group; any extra x invocation exits
+        return;
 
     float sum = 0.f;
-    // Base offset for the current group's row of data in the 3D sum_blob
     int v_offset = gz * p.cstep + gy * p.w;
-    for (int i = 0; i < p.w; i++) {
+    for (int i = 0; i < p.w; i++)
+    {
         sum += sum_blob_data[v_offset + i];
     }
 
-    // Output is a linear buffer indexed by the flattened group_id
     int group_id = gz * p.h + gy;
     mean_data[group_id] = sum / p.group_size;
-}
\ No newline at end of file
+}
diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
index 81c1afa98..ca362ea39 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
@@ -3,50 +3,69 @@
 
 #version 450
 
-// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
-layout (binding = 0) readonly buffer input_blob { sfp input_data[]; };
-layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
+layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
-    int outw; int outh; int outc; int outcstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
 } p;
 
-void main() {
-    // Global invocation IDs map to the output buffer dimensions
+void main() // each invocation writes the fp32 sum of up to 4 consecutive elements of one input row
+{
     int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    float sum;
 
-    // sx is the starting element index for reduction within a group
     int sx = gx * 4;
 
-    // Correctly calculate the base offset for the group in the input tensor.
-    // gz * p.cstep -> Jumps to the start of the correct channel plane.
-    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
-    // The stride between rows is p.w (the width of a row).
-    int base_offset = gz * p.cstep + gy * p.w;
+    int v_offset = gz * p.cstep + gy * p.w + sx; // index of the first element summed by this invocation
 
-    float sum;
-    int r_offset = base_offset + sx;
-
-    if (sx >= p.w - 3) {
-        if (sx >= p.w) {
-            sum = 0.0f;
-        } else if (sx == p.w - 1) {
-            sum = float(buffer_ld1(input_data, r_offset));
-        } else if (sx == p.w - 2) {
-            sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1));
-        } else { // sx == p.w - 3
-                 sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2));
-        }
-    } else {
-        sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3));
+    if (sx >= p.w)
+    {
+        // start index is already past the end of the row: nothing to load
+        sum = 0.0f;
+    }
+    else if (sx == p.w - 1)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        sum = v0;
+    }
+    else if (sx == p.w - 2)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        sum = v0 + v1;
+    }
+    else if (sx == p.w - 3)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
+        sum = v0 + v1 + v2;
     }
+    else // at least 4 elements remain in the row
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
+        float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3));
+        sum = v0 + v1 + v2 + v3;
+    }
 
-    // Output index is a packed 3D index
-    int out_index = gz * p.outcstep + gy * p.outw + gx;
-    output_data[out_index] = sum;
-}
\ No newline at end of file
+    int gi = gz * p.outcstep + gy * p.outw + gx;
+    sum_blob_data[gi] = sum;
+}
diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
index ce5c26782..6b9518415 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
@@ -3,50 +3,68 @@
 
 #version 450
 
-// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
-layout (binding = 0) readonly buffer input_blob { float input_data[]; };
-layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
+layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; };
+layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
-    int outw; int outh; int outc; int outcstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
 } p;
 
-void main() {
-    // Global invocation IDs map to the output buffer dimensions
+void main() // each invocation writes the sum of up to 4 consecutive elements of one input row
+{
     int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    float sum;
 
-    // sx is the starting element index for reduction within a group
     int sx = gx * 4;
 
-    // Correctly calculate the base offset for the group in the input tensor.
-    // gz * p.cstep -> Jumps to the start of the correct channel plane.
-    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
-    // The stride between rows is p.w (the width of a row).
-    int base_offset = gz * p.cstep + gy * p.w;
+    int v_offset = gz * p.cstep + gy * p.w + sx; // index of the first element summed by this invocation
 
-    float sum;
-    int r_offset = base_offset + sx;
-
-    if (sx >= p.w - 3) {
-        if (sx >= p.w) {
-            sum = 0.0f;
-        } else if (sx == p.w - 1) {
-            sum = input_data[r_offset];
-        } else if (sx == p.w - 2) {
-            sum = input_data[r_offset] + input_data[r_offset + 1];
-        } else { // sx == p.w - 3
-                 sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2];
-        }
-    } else {
-        sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3];
+    if (sx >= p.w) // start index is already past the end of the row: nothing to load
+    {
+        sum = 0.0f;
+    }
+    else if (sx == p.w - 1)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        sum = v0;
+    }
+    else if (sx == p.w - 2)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        sum = v0 + v1;
+    }
+    else if (sx == p.w - 3)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        float v2 = bottom_top_blob_data[v_offset + 2];
+        sum = v0 + v1 + v2;
+    }
+    else // at least 4 elements remain in the row
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        float v2 = bottom_top_blob_data[v_offset + 2];
+        float v3 = bottom_top_blob_data[v_offset + 3];
+        sum = v0 + v1 + v2 + v3;
     }
 
-    // Output index is a packed 3D index
-    int out_index = gz * p.outcstep + gy * p.outw + gx;
-    output_data[out_index] = sum;
-}
\ No newline at end of file
+    int gi = gz * p.outcstep + gy * p.outw + gx; // one partial sum per (gx, gy, gz)
+    sum_blob_data[gi] = sum;
+}
diff --git a/src/layer/vulkan/shader/layernorm_sub_mean_square.comp b/src/layer/vulkan/shader/layernorm_sub_mean_square.comp
index 6fd1d8151..cec9bd32d 100644
--- a/src/layer/vulkan/shader/layernorm_sub_mean_square.comp
+++ b/src/layer/vulkan/shader/layernorm_sub_mean_square.comp
@@ -5,7 +5,7 @@
 
 layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
-// The output buffer's type must match the input type if the framework reuses the memory type
+
 layout (binding = 2) writeonly buffer square_blob { sfp square_blob_data[]; };
 
 layout (push_constant) uniform parameter {
@@ -40,10 +40,9 @@ void main() {
 
     int linear_index = gz * p.cstep + gy * p.w + gx;
 
-    // Perform calculation in fp32 for precision
     float v = float(buffer_ld1(bottom_top_blob_data, linear_index));
 
     v = v - mean;
-    // Write back in the native storage format (sfp)
+
     buffer_st1(square_blob_data, linear_index, afp(v * v));
 }
\ No newline at end of file