From 7a7e5c106baefa6e19ab96c61137e3b269d9995d Mon Sep 17 00:00:00 2001 From: ice <1391525377@qq.com> Date: Wed, 6 Aug 2025 22:47:54 +0800 Subject: [PATCH 1/2] fix: code style --- src/layer/vulkan/layernorm_vulkan.cpp | 9 +- src/layer/vulkan/shader/layernorm_coeffs.comp | 11 ++- src/layer/vulkan/shader/layernorm_norm.comp | 19 +--- .../vulkan/shader/layernorm_reduce_mean.comp | 23 +++-- .../layernorm_reduce_sum4_fp16_to_fp32.comp | 87 +++++++++++-------- .../shader/layernorm_reduce_sum4_fp32.comp | 86 ++++++++++-------- 6 files changed, 138 insertions(+), 97 deletions(-) diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp index a226c4bbd..838572152 100644 --- a/src/layer/vulkan/layernorm_vulkan.cpp +++ b/src/layer/vulkan/layernorm_vulkan.cpp @@ -16,6 +16,7 @@ namespace ncnn { // ================================================================================================= static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt) { + return; if (m.empty()) { printf("--- %s ---\n", name); @@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt) return 0; } -int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const +int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const { + int elemsize_bak = _bottom_top_blob.elemsize; + VkMat bottom_top_blob; + vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt); + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; @@ -462,6 +467,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt); // =============================================== + vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt); + return 0; } diff --git a/src/layer/vulkan/shader/layernorm_coeffs.comp b/src/layer/vulkan/shader/layernorm_coeffs.comp index 7f189f389..95557df2a 100644 --- a/src/layer/vulkan/shader/layernorm_coeffs.comp +++ b/src/layer/vulkan/shader/layernorm_coeffs.comp @@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; layout (binding = 1) readonly buffer mean_blob { float mean_data[]; }; layout (binding = 2) readonly buffer var_blob { float var_data[]; }; -layout (push_constant) uniform parameter { +layout (push_constant) uniform parameter +{ int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x int num_groups_per_channel; int num_channels; } p; -void main() { +void main() +{ int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return; + if (gy >= p.num_groups_per_channel || gz >= p.num_channels) + return; int group_id = gz * p.num_groups_per_channel + gy; @@ -32,4 +35,4 @@ void main() { buffer_st1(coeffs_blob_data, group_id * 2, afp(a)); buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b)); -} \ No newline at end of file +} diff --git a/src/layer/vulkan/shader/layernorm_norm.comp b/src/layer/vulkan/shader/layernorm_norm.comp index 46b016069..d966c44fa 100644 --- a/src/layer/vulkan/shader/layernorm_norm.comp +++ b/src/layer/vulkan/shader/layernorm_norm.comp @@ -3,12 +3,10 @@ #version 450 -// This specialization constant is now used to control the affine transformation layout (constant_id = 0) const int affine = 0; layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; -// Separate bindings for gamma and beta if affine is enabled layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; }; layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; }; @@ -27,47 +25,38 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - // Boundary check against the original tensor dimensions if (gx >= p.w || gy >= p.h || gz >= p.c) - return; + return; int group_id; int inner_id; - // Determine the group ID and the element's ID within that group. - // This logic correctly maps each invocation to its normalization group. if (p.affine_size == p.w) { - // Normalization is performed per row group_id = gz * p.h + gy; inner_id = gx; } else // if (p.affine_size == p.w * p.h) { - // Normalization is performed per channel group_id = gz; inner_id = gy * p.w + gx; } - // Fetch the pre-calculated normalization coefficients a and b. - // There is one (a, b) pair per group. afp a = buffer_ld1(coeffs_blob_data, group_id * 2); afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1); - // Calculate the correct linear index for the element, respecting cstep. int linear_index = gz * p.cstep + gy * p.w + gx; afp v = buffer_ld1(bottom_top_blob_data, linear_index); - // Apply the base normalization: (x - mean) / sqrt(var + eps) + // (x - mean) / sqrt(var + eps) v = v * a + b; - // Apply the learned affine transformation if enabled - if (affine == 1) { + if (affine == 1) + { afp gamma = buffer_ld1(gamma_data, inner_id); afp beta = buffer_ld1(beta_data, inner_id); v = v * gamma + beta; } - // Write the final result back to the same location buffer_st1(bottom_top_blob_data, linear_index, v); } diff --git a/src/layer/vulkan/shader/layernorm_reduce_mean.comp b/src/layer/vulkan/shader/layernorm_reduce_mean.comp index 985061500..389e46e65 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_mean.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_mean.comp @@ -6,25 +6,30 @@ layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; }; layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; float group_size; } p; -void main() { - // Each invocation calculates the mean for one group, identified by (gy, gz) +void main() +{ + int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gy >= p.h || gz >= p.c) return; + if (gx >= 1 || gy >= p.h || gz >= p.c) // gx >= 1 added as per request + return; float sum = 0.f; - // Base offset for the current group's row of data in the 3D sum_blob int v_offset = gz * p.cstep + gy * p.w; - for (int i = 0; i < p.w; i++) { + for (int i = 0; i < p.w; i++) + { sum += sum_blob_data[v_offset + i]; } - // Output is a linear buffer indexed by the flattened group_id int group_id = gz * p.h + gy; mean_data[group_id] = sum / p.group_size; -} \ No newline at end of file +} diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp index 81c1afa98..ca362ea39 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp @@ -3,50 +3,69 @@ #version 450 -// This shader correctly reduces a 3D dispatched problem over non-contiguous memory. -layout (binding = 0) readonly buffer input_blob { sfp input_data[]; }; -layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; - int outw; int outh; int outc; int outcstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; } p; -void main() { - // Global invocation IDs map to the output buffer dimensions +void main() +{ int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; + if (gx >= p.outw || gy >= p.outh || gz >= p.outc) + return; + + float sum; - // sx is the starting element index for reduction within a group int sx = gx * 4; - // Correctly calculate the base offset for the group in the input tensor. - // gz * p.cstep -> Jumps to the start of the correct channel plane. - // gy * p.w -> Jumps to the start of the correct row (group) within that plane. - // The stride between rows is p.w (the width of a row). - int base_offset = gz * p.cstep + gy * p.w; + int v_offset = gz * p.cstep + gy * p.w + sx; - float sum; - int r_offset = base_offset + sx; - - if (sx >= p.w - 3) { - if (sx >= p.w) { - sum = 0.0f; - } else if (sx == p.w - 1) { - sum = float(buffer_ld1(input_data, r_offset)); - } else if (sx == p.w - 2) { - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)); - } else { // sx == p.w - 3 - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)); - } - } else { - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3)); + if (sx == p.w - 1) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + + sum = v0; + } + else if (sx == p.w - 2) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + + sum = v0 + v1; + } + else if (sx == p.w - 3) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); + + sum = v0 + v1 + v2; } + else + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); + float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3)); + + sum = v0 + v1 + v2 + v3; + } + - // Output index is a packed 3D index - int out_index = gz * p.outcstep + gy * p.outw + gx; - output_data[out_index] = sum; -} \ No newline at end of file + int gi = gz * p.outcstep + gy * p.outw + gx; + sum_blob_data[gi] = sum; +} diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp index ce5c26782..6b9518415 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp @@ -3,50 +3,68 @@ #version 450 -// This shader correctly reduces a 3D dispatched problem over non-contiguous memory. -layout (binding = 0) readonly buffer input_blob { float input_data[]; }; -layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; +layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; - int outw; int outh; int outc; int outcstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; } p; -void main() { - // Global invocation IDs map to the output buffer dimensions +void main() +{ int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; + if (gx >= p.outw || gy >= p.outh || gz >= p.outc) + return; + + float sum; - // sx is the starting element index for reduction within a group int sx = gx * 4; - // Correctly calculate the base offset for the group in the input tensor. - // gz * p.cstep -> Jumps to the start of the correct channel plane. - // gy * p.w -> Jumps to the start of the correct row (group) within that plane. - // The stride between rows is p.w (the width of a row). - int base_offset = gz * p.cstep + gy * p.w; + int v_offset = gz * p.cstep + gy * p.w + sx; - float sum; - int r_offset = base_offset + sx; - - if (sx >= p.w - 3) { - if (sx >= p.w) { - sum = 0.0f; - } else if (sx == p.w - 1) { - sum = input_data[r_offset]; - } else if (sx == p.w - 2) { - sum = input_data[r_offset] + input_data[r_offset + 1]; - } else { // sx == p.w - 3 - sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2]; - } - } else { - sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3]; + if (sx >= p.w) + { + sum = 0.0f; + } + else if (sx == p.w - 1) + { + float v0 = bottom_top_blob_data[v_offset]; + sum = v0; + } + else if (sx == p.w - 2) + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + sum = v0 + v1; + } + else if (sx == p.w - 3) + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + float v2 = bottom_top_blob_data[v_offset + 2]; + sum = v0 + v1 + v2; + } + else + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + float v2 = bottom_top_blob_data[v_offset + 2]; + float v3 = bottom_top_blob_data[v_offset + 3]; + sum = v0 + v1 + v2 + v3; } - // Output index is a packed 3D index - int out_index = gz * p.outcstep + gy * p.outw + gx; - output_data[out_index] = sum; -} \ No newline at end of file + int gi = gz * p.outcstep + gy * p.outw + gx; + sum_blob_data[gi] = sum; +} From 82260d150b8fd21514445944b7a586fcd8c4ca2b Mon Sep 17 00:00:00 2001 From: futz12 <56149058+futz12@users.noreply.github.com> Date: Wed, 6 Aug 2025 14:50:27 +0000 Subject: [PATCH 2/2] apply code-format changes --- src/layer/vulkan/layernorm_vulkan.cpp | 55 +++++++++++++++++---------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp index 838572152..602bb1327 100644 --- a/src/layer/vulkan/layernorm_vulkan.cpp +++ b/src/layer/vulkan/layernorm_vulkan.cpp @@ -43,7 +43,7 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const cmd.reset(); Mat cpu_mat; - convert_packing(staging_mat,cpu_mat,1); + convert_packing(staging_mat, cpu_mat, 1); printf("--- %s ---\n", name); printf("Dims: %d, w: %d, h: %d, d: %d, c: %d, cstep: %zu, elemsize: %zu, elempack: %d\n", @@ -58,9 +58,9 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const { printf("cpu_mat[%d]: \n", i); // 打印矩阵 - for (int j = 0; j< cpu_mat.h; j++) + for (int j = 0; j < cpu_mat.h; j++) { - for (int k = 0; k< cpu_mat.w;k++) + for (int k = 0; k < cpu_mat.w; k++) { printf("%f ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]); } @@ -75,16 +75,15 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const { printf("cpu_mat[%d]: \n", i); // 打印矩阵 - for (int j = 0; j< cpu_mat.h; j++) + for (int j = 0; j < cpu_mat.h; j++) { - for (int k = 0; k< cpu_mat.w;k++) + for (int k = 0; k < cpu_mat.w; k++) { printf("%f ", ncnn::float16_to_float32(ptr[i * cpu_mat.cstep + j * cpu_mat.w + k])); } printf("\n"); } } - } else if (cpu_mat.elemsize == 1u) // int8 { @@ -93,9 +92,9 @@ static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const { printf("cpu_mat[%d]: \n", i); // 打印矩阵 - for (int j = 0; j< cpu_mat.h; j++) + for (int j = 0; j < cpu_mat.h; j++) { - for (int k = 0; k< cpu_mat.w;k++) + for (int k = 0; k < cpu_mat.w; k++) { printf("%d ", ptr[i * cpu_mat.cstep + j * cpu_mat.w + k]); } @@ -190,7 +189,7 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c { int elemsize_bak = _bottom_top_blob.elemsize; VkMat bottom_top_blob; - vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt); + vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1, cmd, opt); int w = bottom_top_blob.w; int h = bottom_top_blob.h; @@ -208,19 +207,27 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c int group_size; int num_groups_per_channel; - if (dims == 1) { + if (dims == 1) + { group_size = w; num_groups_per_channel = 1; channels = 1; - } else if (dims == 2) { + } + else if (dims == 2) + { group_size = w; num_groups_per_channel = h; channels = 1; - } else { // dims == 3 - if (affine_size == w) { + } + else + { // dims == 3 + if (affine_size == w) + { group_size = w; num_groups_per_channel = h; - } else { // affine_size == w * h + } + else + { // affine_size == w * h group_size = w * h; num_groups_per_channel = 1; } @@ -258,9 +265,12 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c dispatcher.c = channels; int pb = 0; - if (elemsize == 4u) { + if (elemsize == 4u) + { cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher); - } else { + } + else + { cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher); } pb++; @@ -269,7 +279,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c print_vkmat(sum_workspace, "1. MEAN: After Initial Reduce", cmd, opt); // =============================================== - while (sum_workspace.w > 1) { + while (sum_workspace.w > 1) + { int current_w = sum_workspace.w; reduced_w = (current_w + 3) / 4; VkMat sum_workspace_reduced; @@ -365,9 +376,12 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c dispatcher.c = channels; int pb = 0; - if (elemsize == 4u) { + if (elemsize == 4u) + { cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp32[pb % 2], bindings, constants, dispatcher); - } else { + } + else + { cmd.record_pipeline(pipeline_layernorm_reduce_sum4_fp16_to_fp32, bindings, constants, dispatcher); } pb++; @@ -376,7 +390,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, c print_vkmat(sqsum_workspace, "2. VAR: After Initial Reduce", cmd, opt); // =============================================== - while (sqsum_workspace.w > 1) { + while (sqsum_workspace.w > 1) + { int current_w = sqsum_workspace.w; reduced_w = (current_w + 3) / 4; VkMat sum_workspace_reduced;