From 7f218e0cd11d1cbc6ff52ef078ea4a7d68bbc877 Mon Sep 17 00:00:00 2001 From: ice <1391525377@qq.com> Date: Wed, 6 Aug 2025 22:47:54 +0800 Subject: [PATCH] fix: code style --- src/layer/vulkan/layernorm_vulkan.cpp | 16 ++-- src/layer/vulkan/shader/layernorm_coeffs.comp | 15 ++-- src/layer/vulkan/shader/layernorm_norm.comp | 19 +--- .../vulkan/shader/layernorm_reduce_mean.comp | 23 +++-- .../layernorm_reduce_sum4_fp16_to_fp32.comp | 87 +++++++++++-------- .../shader/layernorm_reduce_sum4_fp32.comp | 86 ++++++++++-------- .../shader/layernorm_sub_mean_square.comp | 5 +- 7 files changed, 145 insertions(+), 106 deletions(-) diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp index a226c4bbd..24a877f37 100644 --- a/src/layer/vulkan/layernorm_vulkan.cpp +++ b/src/layer/vulkan/layernorm_vulkan.cpp @@ -16,6 +16,7 @@ namespace ncnn { // ================================================================================================= static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt) { + return; if (m.empty()) { printf("--- %s ---\n", name); @@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt) return 0; } -int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const +int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const { + int elemsize_bak = _bottom_top_blob.elemsize; + VkMat bottom_top_blob; + vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt); + int w = bottom_top_blob.w; int h = bottom_top_blob.h; int channels = bottom_top_blob.c; @@ -429,10 +434,9 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co coeff_bindings[1] = mean_workspace; coeff_bindings[2] = var_workspace; - std::vector coeff_constants(3); - coeff_constants[0].i = 1; - coeff_constants[1].i = num_groups_per_channel; - coeff_constants[2].i = channels; + std::vector coeff_constants(2); + coeff_constants[0].i = num_groups_per_channel; + coeff_constants[1].i = channels; VkMat dispatcher_coeffs; dispatcher_coeffs.w = 1; @@ -462,6 +466,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt); // =============================================== + vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt); + return 0; } diff --git a/src/layer/vulkan/shader/layernorm_coeffs.comp b/src/layer/vulkan/shader/layernorm_coeffs.comp index 7f189f389..1131a27e3 100644 --- a/src/layer/vulkan/shader/layernorm_coeffs.comp +++ b/src/layer/vulkan/shader/layernorm_coeffs.comp @@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; layout (binding = 1) readonly buffer mean_blob { float mean_data[]; }; layout (binding = 2) readonly buffer var_blob { float var_data[]; }; -layout (push_constant) uniform parameter { - int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x +layout (push_constant) uniform parameter +{ int num_groups_per_channel; - int num_channels; + int c; } p; -void main() { +void main() +{ + int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return; + if (gx >= 1 ||gy >= p.num_groups_per_channel || gz >= p.c) + return; int group_id = gz * p.num_groups_per_channel + gy; @@ -32,4 +35,4 @@ void main() { buffer_st1(coeffs_blob_data, group_id * 2, afp(a)); buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b)); -} \ No newline at end of file +} diff --git a/src/layer/vulkan/shader/layernorm_norm.comp b/src/layer/vulkan/shader/layernorm_norm.comp index 46b016069..d966c44fa 100644 --- a/src/layer/vulkan/shader/layernorm_norm.comp +++ b/src/layer/vulkan/shader/layernorm_norm.comp @@ -3,12 +3,10 @@ #version 450 -// This specialization constant is now used to control the affine transformation layout (constant_id = 0) const int affine = 0; layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; -// Separate bindings for gamma and beta if affine is enabled layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; }; layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; }; @@ -27,47 +25,38 @@ void main() int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - // Boundary check against the original tensor dimensions if (gx >= p.w || gy >= p.h || gz >= p.c) - return; + return; int group_id; int inner_id; - // Determine the group ID and the element's ID within that group. - // This logic correctly maps each invocation to its normalization group. if (p.affine_size == p.w) { - // Normalization is performed per row group_id = gz * p.h + gy; inner_id = gx; } else // if (p.affine_size == p.w * p.h) { - // Normalization is performed per channel group_id = gz; inner_id = gy * p.w + gx; } - // Fetch the pre-calculated normalization coefficients a and b. - // There is one (a, b) pair per group. afp a = buffer_ld1(coeffs_blob_data, group_id * 2); afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1); - // Calculate the correct linear index for the element, respecting cstep. int linear_index = gz * p.cstep + gy * p.w + gx; afp v = buffer_ld1(bottom_top_blob_data, linear_index); - // Apply the base normalization: (x - mean) / sqrt(var + eps) + // (x - mean) / sqrt(var + eps) v = v * a + b; - // Apply the learned affine transformation if enabled - if (affine == 1) { + if (affine == 1) + { afp gamma = buffer_ld1(gamma_data, inner_id); afp beta = buffer_ld1(beta_data, inner_id); v = v * gamma + beta; } - // Write the final result back to the same location buffer_st1(bottom_top_blob_data, linear_index, v); } diff --git a/src/layer/vulkan/shader/layernorm_reduce_mean.comp b/src/layer/vulkan/shader/layernorm_reduce_mean.comp index 985061500..389e46e65 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_mean.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_mean.comp @@ -6,25 +6,30 @@ layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; }; layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; float group_size; } p; -void main() { - // Each invocation calculates the mean for one group, identified by (gy, gz) +void main() +{ + int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gy >= p.h || gz >= p.c) return; + if (gx >= 1 || gy >= p.h || gz >= p.c) // gx >= 1 added as per request + return; float sum = 0.f; - // Base offset for the current group's row of data in the 3D sum_blob int v_offset = gz * p.cstep + gy * p.w; - for (int i = 0; i < p.w; i++) { + for (int i = 0; i < p.w; i++) + { sum += sum_blob_data[v_offset + i]; } - // Output is a linear buffer indexed by the flattened group_id int group_id = gz * p.h + gy; mean_data[group_id] = sum / p.group_size; -} \ No newline at end of file +} diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp index 81c1afa98..ca362ea39 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp @@ -3,50 +3,69 @@ #version 450 -// This shader correctly reduces a 3D dispatched problem over non-contiguous memory. -layout (binding = 0) readonly buffer input_blob { sfp input_data[]; }; -layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; +layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; - int outw; int outh; int outc; int outcstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; } p; -void main() { - // Global invocation IDs map to the output buffer dimensions +void main() +{ int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; + if (gx >= p.outw || gy >= p.outh || gz >= p.outc) + return; + + float sum; - // sx is the starting element index for reduction within a group int sx = gx * 4; - // Correctly calculate the base offset for the group in the input tensor. - // gz * p.cstep -> Jumps to the start of the correct channel plane. - // gy * p.w -> Jumps to the start of the correct row (group) within that plane. - // The stride between rows is p.w (the width of a row). - int base_offset = gz * p.cstep + gy * p.w; + int v_offset = gz * p.cstep + gy * p.w + sx; - float sum; - int r_offset = base_offset + sx; - - if (sx >= p.w - 3) { - if (sx >= p.w) { - sum = 0.0f; - } else if (sx == p.w - 1) { - sum = float(buffer_ld1(input_data, r_offset)); - } else if (sx == p.w - 2) { - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)); - } else { // sx == p.w - 3 - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)); - } - } else { - sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3)); + if (sx == p.w - 1) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + + sum = v0; + } + else if (sx == p.w - 2) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + + sum = v0 + v1; + } + else if (sx == p.w - 3) + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); + + sum = v0 + v1 + v2; } + else + { + float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); + float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); + float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); + float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3)); + + sum = v0 + v1 + v2 + v3; + } + - // Output index is a packed 3D index - int out_index = gz * p.outcstep + gy * p.outw + gx; - output_data[out_index] = sum; -} \ No newline at end of file + int gi = gz * p.outcstep + gy * p.outw + gx; + sum_blob_data[gi] = sum; +} diff --git a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp index ce5c26782..6b9518415 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp @@ -3,50 +3,68 @@ #version 450 -// This shader correctly reduces a 3D dispatched problem over non-contiguous memory. -layout (binding = 0) readonly buffer input_blob { float input_data[]; }; -layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; +layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; }; +layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; -layout (push_constant) uniform parameter { - int w; int h; int c; int cstep; - int outw; int outh; int outc; int outcstep; +layout (push_constant) uniform parameter +{ + int w; + int h; + int c; + int cstep; + + int outw; + int outh; + int outc; + int outcstep; } p; -void main() { - // Global invocation IDs map to the output buffer dimensions +void main() +{ int gx = int(gl_GlobalInvocationID.x); int gy = int(gl_GlobalInvocationID.y); int gz = int(gl_GlobalInvocationID.z); - if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; + if (gx >= p.outw || gy >= p.outh || gz >= p.outc) + return; + + float sum; - // sx is the starting element index for reduction within a group int sx = gx * 4; - // Correctly calculate the base offset for the group in the input tensor. - // gz * p.cstep -> Jumps to the start of the correct channel plane. - // gy * p.w -> Jumps to the start of the correct row (group) within that plane. - // The stride between rows is p.w (the width of a row). - int base_offset = gz * p.cstep + gy * p.w; + int v_offset = gz * p.cstep + gy * p.w + sx; - float sum; - int r_offset = base_offset + sx; - - if (sx >= p.w - 3) { - if (sx >= p.w) { - sum = 0.0f; - } else if (sx == p.w - 1) { - sum = input_data[r_offset]; - } else if (sx == p.w - 2) { - sum = input_data[r_offset] + input_data[r_offset + 1]; - } else { // sx == p.w - 3 - sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2]; - } - } else { - sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3]; + if (sx >= p.w) + { + sum = 0.0f; + } + else if (sx == p.w - 1) + { + float v0 = bottom_top_blob_data[v_offset]; + sum = v0; + } + else if (sx == p.w - 2) + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + sum = v0 + v1; + } + else if (sx == p.w - 3) + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + float v2 = bottom_top_blob_data[v_offset + 2]; + sum = v0 + v1 + v2; + } + else + { + float v0 = bottom_top_blob_data[v_offset]; + float v1 = bottom_top_blob_data[v_offset + 1]; + float v2 = bottom_top_blob_data[v_offset + 2]; + float v3 = bottom_top_blob_data[v_offset + 3]; + sum = v0 + v1 + v2 + v3; } - // Output index is a packed 3D index - int out_index = gz * p.outcstep + gy * p.outw + gx; - output_data[out_index] = sum; -} \ No newline at end of file + int gi = gz * p.outcstep + gy * p.outw + gx; + sum_blob_data[gi] = sum; +} diff --git a/src/layer/vulkan/shader/layernorm_sub_mean_square.comp b/src/layer/vulkan/shader/layernorm_sub_mean_square.comp index 6fd1d8151..cec9bd32d 100644 --- a/src/layer/vulkan/shader/layernorm_sub_mean_square.comp +++ b/src/layer/vulkan/shader/layernorm_sub_mean_square.comp @@ -5,7 +5,7 @@ layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout (binding = 1) readonly buffer mean_blob { float mean_data[]; }; -// The output buffer's type must match the input type if the framework reuses the memory type + layout (binding = 2) writeonly buffer square_blob { sfp square_blob_data[]; }; layout (push_constant) uniform parameter { @@ -40,10 +40,9 @@ void main() { int linear_index = gz * p.cstep + gy * p.w + gx; - // Perform calculation in fp32 for precision float v = float(buffer_ld1(bottom_top_blob_data, linear_index)); v = v - mean; - // Write back in the native storage format (sfp) + buffer_st1(square_blob_data, linear_index, afp(v * v)); } \ No newline at end of file