From 7a7e5c106baefa6e19ab96c61137e3b269d9995d Mon Sep 17 00:00:00 2001
From: ice <1391525377@qq.com>
Date: Wed, 6 Aug 2025 22:47:54 +0800
Subject: [PATCH 1/2] fix: code style; unpack LayerNorm_vulkan input to elempack 1

---
 src/layer/vulkan/layernorm_vulkan.cpp         |  9 +-
 src/layer/vulkan/shader/layernorm_coeffs.comp | 11 ++-
 src/layer/vulkan/shader/layernorm_norm.comp   | 19 +---
 .../vulkan/shader/layernorm_reduce_mean.comp  | 23 +++--
 .../layernorm_reduce_sum4_fp16_to_fp32.comp   | 87 +++++++++++--------
 .../shader/layernorm_reduce_sum4_fp32.comp    | 86 ++++++++++--------
 6 files changed, 138 insertions(+), 97 deletions(-)

diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp
index a226c4bbd..838572152 100644
--- a/src/layer/vulkan/layernorm_vulkan.cpp
+++ b/src/layer/vulkan/layernorm_vulkan.cpp
@@ -16,6 +16,7 @@ namespace ncnn {
 // =================================================================================================
 static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt)
 {
+    return; // NOTE: disables this debug dump; everything below is unreachable until this line is removed
     if (m.empty())
     {
         printf("--- %s ---\n", name);
@@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
     return 0;
 }
 
-int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
+int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const
 {
+    int elemsize_bak = _bottom_top_blob.elemsize;
+    VkMat bottom_top_blob;
+    vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt);
+
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
     int channels = bottom_top_blob.c;
@@ -462,6 +467,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
     print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt);
     // ===============================================
 
+    vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, _bottom_top_blob.elempack, cmd, opt); // 3rd arg is the destination elempack (not elemsize): repack to the caller's original packing
+
     return 0;
 }
 
diff --git a/src/layer/vulkan/shader/layernorm_coeffs.comp b/src/layer/vulkan/shader/layernorm_coeffs.comp
index 7f189f389..95557df2a 100644
--- a/src/layer/vulkan/shader/layernorm_coeffs.comp
+++ b/src/layer/vulkan/shader/layernorm_coeffs.comp
@@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
 layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
 layout (binding = 2) readonly buffer var_blob { float var_data[]; };
 
-layout (push_constant) uniform parameter {
+layout (push_constant) uniform parameter
+{
     int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x
     int num_groups_per_channel;
     int num_channels;
 } p;
 
-void main() {
+void main()
+{
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
-    if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return;
+    if (gy >= p.num_groups_per_channel || gz >= p.num_channels)
+        return;
 
     int group_id = gz * p.num_groups_per_channel + gy;
 
@@ -32,4 +35,4 @@ void main() {
 
     buffer_st1(coeffs_blob_data, group_id * 2, afp(a));
     buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b));
-}
\ No newline at end of file
+}
diff --git a/src/layer/vulkan/shader/layernorm_norm.comp b/src/layer/vulkan/shader/layernorm_norm.comp
index 46b016069..d966c44fa 100644
--- a/src/layer/vulkan/shader/layernorm_norm.comp
+++ b/src/layer/vulkan/shader/layernorm_norm.comp
@@ -3,12 +3,10 @@
 
 #version 450
 
-// This specialization constant is now used to control the affine transformation
 layout (constant_id = 0) const int affine = 0;
 
 layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
-// Separate bindings for gamma and beta if affine is enabled
 layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; };
 layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; };
 
@@ -27,47 +25,38 @@ void main()
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    // Boundary check against the original tensor dimensions
     if (gx >= p.w || gy >= p.h || gz >= p.c)
-    return;
+        return;
 
     int group_id;
     int inner_id;
 
-    // Determine the group ID and the element's ID within that group.
-    // This logic correctly maps each invocation to its normalization group.
     if (p.affine_size == p.w)
     {
-        // Normalization is performed per row
         group_id = gz * p.h + gy;
         inner_id = gx;
     }
     else // if (p.affine_size == p.w * p.h)
     {
-        // Normalization is performed per channel
         group_id = gz;
         inner_id = gy * p.w + gx;
     }
 
-    // Fetch the pre-calculated normalization coefficients a and b.
-    // There is one (a, b) pair per group.
     afp a = buffer_ld1(coeffs_blob_data, group_id * 2);
     afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1);
 
-    // Calculate the correct linear index for the element, respecting cstep.
     int linear_index = gz * p.cstep + gy * p.w + gx;
     afp v = buffer_ld1(bottom_top_blob_data, linear_index);
 
-    // Apply the base normalization: (x - mean) / sqrt(var + eps)
+    // (x - mean) / sqrt(var + eps)
     v = v * a + b;
 
-    // Apply the learned affine transformation if enabled
-    if (affine == 1) {
+    if (affine == 1)
+    {
         afp gamma = buffer_ld1(gamma_data, inner_id);
         afp beta  = buffer_ld1(beta_data, inner_id);
         v = v * gamma + beta;
     }
 
-    // Write the final result back to the same location
     buffer_st1(bottom_top_blob_data, linear_index, v);
 }
diff --git a/src/layer/vulkan/shader/layernorm_reduce_mean.comp b/src/layer/vulkan/shader/layernorm_reduce_mean.comp
index 985061500..389e46e65 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_mean.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_mean.comp
@@ -6,25 +6,30 @@
 layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; };
 layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
     float group_size;
 } p;
 
-void main() {
-    // Each invocation calculates the mean for one group, identified by (gy, gz)
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
-    if (gy >= p.h || gz >= p.c) return;
+    if (gx >= 1 || gy >= p.h || gz >= p.c) // only the gx == 0 invocation reduces group (gy, gz)
+        return;
 
     float sum = 0.f;
-    // Base offset for the current group's row of data in the 3D sum_blob
     int v_offset = gz * p.cstep + gy * p.w;
-    for (int i = 0; i < p.w; i++) {
+    for (int i = 0; i < p.w; i++)
+    {
         sum += sum_blob_data[v_offset + i];
     }
 
-    // Output is a linear buffer indexed by the flattened group_id
     int group_id = gz * p.h + gy;
     mean_data[group_id] = sum / p.group_size;
-}
\ No newline at end of file
+}
diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
index 81c1afa98..ca362ea39 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
@@ -3,50 +3,69 @@
 
 #version 450
 
-// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
-layout (binding = 0) readonly buffer input_blob { sfp input_data[]; };
-layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
+layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
-    int outw; int outh; int outc; int outcstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
 } p;
 
-void main() {
-    // Global invocation IDs map to the output buffer dimensions
+void main()
+{
     int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    float sum;
 
-    // sx is the starting element index for reduction within a group
     int sx = gx * 4;
 
-    // Correctly calculate the base offset for the group in the input tensor.
-    // gz * p.cstep -> Jumps to the start of the correct channel plane.
-    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
-    // The stride between rows is p.w (the width of a row).
-    int base_offset = gz * p.cstep + gy * p.w;
+    int v_offset = gz * p.cstep + gy * p.w + sx;
 
-    float sum;
-    int r_offset = base_offset + sx;
-
-    if (sx >= p.w - 3) {
-        if (sx >= p.w) {
-            sum = 0.0f;
-        } else if (sx == p.w - 1) {
-            sum = float(buffer_ld1(input_data, r_offset));
-        } else if (sx == p.w - 2) {
-            sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1));
-        } else { // sx == p.w - 3
-                 sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2));
-        }
-    } else {
-        sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3));
+    // sum up to four consecutive elements, clipping the chunk at the end of the row
+    if (sx >= p.w)
+    {
+        sum = 0.0f; // chunk starts past the end of the row: nothing to read
+    }
+    else if (sx == p.w - 1)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        sum = v0;
+    }
+    else if (sx == p.w - 2)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        sum = v0 + v1;
+    }
+    else if (sx == p.w - 3)
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
+        sum = v0 + v1 + v2;
     }
+    else
+    {
+        float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
+        float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
+        float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
+        float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3));
+        sum = v0 + v1 + v2 + v3;
+    }
 
-    // Output index is a packed 3D index
-    int out_index = gz * p.outcstep + gy * p.outw + gx;
-    output_data[out_index] = sum;
-}
\ No newline at end of file
+    int gi = gz * p.outcstep + gy * p.outw + gx;
+    sum_blob_data[gi] = sum;
+}
diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
index ce5c26782..6b9518415 100644
--- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
+++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
@@ -3,50 +3,68 @@
 
 #version 450
 
-// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
-layout (binding = 0) readonly buffer input_blob { float input_data[]; };
-layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
+layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; };
+layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };
 
-layout (push_constant) uniform parameter {
-    int w; int h; int c; int cstep;
-    int outw; int outh; int outc; int outcstep;
+layout (push_constant) uniform parameter
+{
+    int w;
+    int h;
+    int c;
+    int cstep;
+
+    int outw;
+    int outh;
+    int outc;
+    int outcstep;
 } p;
 
-void main() {
-    // Global invocation IDs map to the output buffer dimensions
+void main()
+{
     int gx = int(gl_GlobalInvocationID.x);
     int gy = int(gl_GlobalInvocationID.y);
     int gz = int(gl_GlobalInvocationID.z);
 
-    if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
+    if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
+        return;
+
+    float sum;
 
-    // sx is the starting element index for reduction within a group
     int sx = gx * 4;
 
-    // Correctly calculate the base offset for the group in the input tensor.
-    // gz * p.cstep -> Jumps to the start of the correct channel plane.
-    // gy * p.w      -> Jumps to the start of the correct row (group) within that plane.
-    // The stride between rows is p.w (the width of a row).
-    int base_offset = gz * p.cstep + gy * p.w;
+    int v_offset = gz * p.cstep + gy * p.w + sx;
 
-    float sum;
-    int r_offset = base_offset + sx;
-
-    if (sx >= p.w - 3) {
-        if (sx >= p.w) {
-            sum = 0.0f;
-        } else if (sx == p.w - 1) {
-            sum = input_data[r_offset];
-        } else if (sx == p.w - 2) {
-            sum = input_data[r_offset] + input_data[r_offset + 1];
-        } else { // sx == p.w - 3
-                 sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2];
-        }
-    } else {
-        sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3];
+    if (sx >= p.w)
+    {
+        sum = 0.0f;
+    }
+    else if (sx == p.w - 1)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        sum = v0;
+    }
+    else if (sx == p.w - 2)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        sum = v0 + v1;
+    }
+    else if (sx == p.w - 3)
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        float v2 = bottom_top_blob_data[v_offset + 2];
+        sum = v0 + v1 + v2;
+    }
+    else
+    {
+        float v0 = bottom_top_blob_data[v_offset];
+        float v1 = bottom_top_blob_data[v_offset + 1];
+        float v2 = bottom_top_blob_data[v_offset + 2];
+        float v3 = bottom_top_blob_data[v_offset + 3];
+        sum = v0 + v1 + v2 + v3;
     }
 
-    // Output index is a packed 3D index
-    int out_index = gz * p.outcstep + gy * p.outw + gx;
-    output_data[out_index] = sum;
-}
\ No newline at end of file
+    int gi = gz * p.outcstep + gy * p.outw + gx;
+    sum_blob_data[gi] = sum;
+}

From 82260d150b8fd21514445944b7a586fcd8c4ca2b Mon Sep 17 00:00:00 2001
From: futz12 <56149058+futz12@users.noreply.github.com>
Date: Wed, 6 Aug 2025 14:50:27 +0000
Subject: [PATCH 2/2] apply code-format changes

---
 src/layer/vulkan/layernorm_vulkan.cpp | 55 +++++++++++++++++----------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp
index 838572152..602bb1327 100644
--- a/src/layer/vulkan/layernorm_vulkan.cpp
+++ b/src/layer/vulkan/layernorm_vulkan.cpp
@@ -43,7 +43,7 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const
     cmd.reset();
 
     Mat cpu_mat;
-    convert_packing(staging_mat,cpu_mat,1);
+    convert_packing(staging_mat, cpu_mat, 1);
 
     printf("--- %s ---\n", name);
     printf("Dims: %d, w: %d, h: %d, d: %d, c: %d, cstep: %zu, elemsize: %zu, elempack: %d\n",
@@ -58,9 +58,9 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const
         {
             printf("cpu_mat[%d]: \n", i);
             // 打印矩阵
-            for (int j = 0; j< cpu_mat.h; j++)
+            for (int j = 0; j < cpu_mat.h; j++)
             {
-                for (int k = 0; k< cpu_mat.w;k++)
+                for (int k = 0; k < cpu_mat.w; k++)
                 {
                     printf("%f ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]);
                 }
@@ -75,16 +75,15 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const
         {
             printf("cpu_mat[%d]: \n", i);
             // 打印矩阵
-            for (int j = 0; j< cpu_mat.h; j++)
+            for (int j = 0; j < cpu_mat.h; j++)
             {
-                for (int k = 0; k< cpu_mat.w;k++)
+                for (int k = 0; k < cpu_mat.w; k++)
                 {
                     printf("%f ", ncnn::float16_to_float32(ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]));
                 }
                 printf("\n");
             }
         }
-
     }
     else if (cpu_mat.elemsize == 1u) // int8
     {
@@ -93,9 +92,9 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const
         {
             printf("cpu_mat[%d]: \n", i);
             // 打印矩阵
-            for (int j = 0; j< cpu_mat.h; j++)
+            for (int j = 0; j < cpu_mat.h; j++)
             {
-                for (int k = 0; k< cpu_mat.w;k++)
+                for (int k = 0; k < cpu_mat.w; k++)
                 {
                     printf("%d ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]);
                 }
@@ -190,7 +189,7 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
 {
     int elemsize_bak = _bottom_top_blob.elemsize;
     VkMat bottom_top_blob;
-    vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt);
+    vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1, cmd, opt);
 
     int w = bottom_top_blob.w;
     int h = bottom_top_blob.h;
@@ -208,19 +207,27 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
 
     int group_size;
     int num_groups_per_channel;
-    if (dims == 1) {
+    if (dims == 1)
+    {
         group_size = w;
         num_groups_per_channel = 1;
         channels = 1;
-    } else if (dims == 2) {
+    }
+    else if (dims == 2)
+    {
         group_size = w;
         num_groups_per_channel = h;
         channels = 1;
-    } else { // dims == 3
-        if (affine_size == w) {
+    }
+    else
+    {   // dims == 3
+        if (affine_size == w)
+        {
             group_size = w;
             num_groups_per_channel = h;
-        } else { // affine_size == w * h
+        }
+        else
+        {   // affine_size == w * h
             group_size = w * h;
             num_groups_per_channel = 1;
         }
@@ -258,9 +265,12 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
         dispatcher.c = channels;
 
         int pb = 0;
-        if (elemsize == 4u) {
+        if (elemsize == 4u)
+        {
             cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher);
-        } else {
+        }
+        else
+        {
             cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher);
         }
         pb++;
@@ -269,7 +279,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
         print_vkmat(sum_workspace, "1. MEAN: After Initial Reduce", cmd, opt);
         // ===============================================
 
-        while (sum_workspace.w > 1) {
+        while (sum_workspace.w > 1)
+        {
             int current_w = sum_workspace.w;
             reduced_w = (current_w + 3) / 4;
             VkMat sum_workspace_reduced;
@@ -365,9 +376,12 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
         dispatcher.c = channels;
 
         int pb = 0;
-        if (elemsize == 4u) {
+        if (elemsize == 4u)
+        {
             cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher);
-        } else {
+        }
+        else
+        {
             cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher);
         }
         pb++;
@@ -376,7 +390,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c
         print_vkmat(sqsum_workspace, "2. VAR: After Initial Reduce", cmd, opt);
         // ===============================================
 
-        while (sqsum_workspace.w > 1) {
+        while (sqsum_workspace.w > 1)
+        {
             int current_w = sqsum_workspace.w;
             reduced_w = (current_w + 3) / 4;
             VkMat sum_workspace_reduced;