Browse Source

fix: code style

pull/6240/head
ice 10 months ago
parent
commit
7f218e0cd1
7 changed files with 145 additions and 106 deletions
  1. +11
    -5
      src/layer/vulkan/layernorm_vulkan.cpp
  2. +9
    -6
      src/layer/vulkan/shader/layernorm_coeffs.comp
  3. +4
    -15
      src/layer/vulkan/shader/layernorm_norm.comp
  4. +14
    -9
      src/layer/vulkan/shader/layernorm_reduce_mean.comp
  5. +53
    -34
      src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp
  6. +52
    -34
      src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp
  7. +2
    -3
      src/layer/vulkan/shader/layernorm_sub_mean_square.comp

+ 11
- 5
src/layer/vulkan/layernorm_vulkan.cpp View File

@@ -16,6 +16,7 @@ namespace ncnn {
// =================================================================================================
// Debug helper that appears to dump a VkMat under the label `name`.
// NOTE: the unconditional `return;` at the top of the body turns this helper
// into a no-op, so every statement after it is unreachable (presumably kept so
// the dump can be re-enabled while debugging — confirm intent).
static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt)
{
return; // dump disabled: nothing below this line executes
if (m.empty())
{
printf("--- %s ---\n", name);
@@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt)
return 0;
}

int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const
int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const
{
int elemsize_bak = _bottom_top_blob.elemsize;
VkMat bottom_top_blob;
vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt);

int w = bottom_top_blob.w;
int h = bottom_top_blob.h;
int channels = bottom_top_blob.c;
@@ -429,10 +434,9 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
coeff_bindings[1] = mean_workspace;
coeff_bindings[2] = var_workspace;

std::vector<vk_constant_type> coeff_constants(3);
coeff_constants[0].i = 1;
coeff_constants[1].i = num_groups_per_channel;
coeff_constants[2].i = channels;
std::vector<vk_constant_type> coeff_constants(2);
coeff_constants[0].i = num_groups_per_channel;
coeff_constants[1].i = channels;

VkMat dispatcher_coeffs;
dispatcher_coeffs.w = 1;
@@ -462,6 +466,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt);
// ===============================================

// NOTE(review): elemsize_bak was captured from _bottom_top_blob.elemsize, but the
// third argument of convert_packing is given a packing factor (literal 1) in the
// call at the top of this function. Passing an element size here looks unintended;
// restoring the caller's layout presumably needs the original elempack instead —
// confirm against the VulkanDevice::convert_packing declaration.
vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt);

return 0;
}


+ 9
- 6
src/layer/vulkan/shader/layernorm_coeffs.comp View File

@@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
layout (binding = 2) readonly buffer var_blob { float var_data[]; };

layout (push_constant) uniform parameter {
int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x
layout (push_constant) uniform parameter
{
int num_groups_per_channel;
int num_channels;
int c;
} p;

void main() {
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return;
if (gx >= 1 ||gy >= p.num_groups_per_channel || gz >= p.c)
return;

int group_id = gz * p.num_groups_per_channel + gy;

@@ -32,4 +35,4 @@ void main() {

buffer_st1(coeffs_blob_data, group_id * 2, afp(a));
buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b));
}
}

+ 4
- 15
src/layer/vulkan/shader/layernorm_norm.comp View File

@@ -3,12 +3,10 @@

#version 450

// This specialization constant is now used to control the affine transformation
layout (constant_id = 0) const int affine = 0;

layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; };
// Separate bindings for gamma and beta if affine is enabled
layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; };
layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; };

@@ -27,47 +25,38 @@ void main()
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

// Boundary check against the original tensor dimensions
if (gx >= p.w || gy >= p.h || gz >= p.c)
return;
return;

int group_id;
int inner_id;

// Determine the group ID and the element's ID within that group.
// This logic correctly maps each invocation to its normalization group.
if (p.affine_size == p.w)
{
// Normalization is performed per row
group_id = gz * p.h + gy;
inner_id = gx;
}
else // if (p.affine_size == p.w * p.h)
{
// Normalization is performed per channel
group_id = gz;
inner_id = gy * p.w + gx;
}

// Fetch the pre-calculated normalization coefficients a and b.
// There is one (a, b) pair per group.
afp a = buffer_ld1(coeffs_blob_data, group_id * 2);
afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1);

// Calculate the correct linear index for the element, respecting cstep.
int linear_index = gz * p.cstep + gy * p.w + gx;
afp v = buffer_ld1(bottom_top_blob_data, linear_index);

// Apply the base normalization: (x - mean) / sqrt(var + eps)
// (x - mean) / sqrt(var + eps)
v = v * a + b;

// Apply the learned affine transformation if enabled
if (affine == 1) {
if (affine == 1)
{
afp gamma = buffer_ld1(gamma_data, inner_id);
afp beta = buffer_ld1(beta_data, inner_id);
v = v * gamma + beta;
}

// Write the final result back to the same location
buffer_st1(bottom_top_blob_data, linear_index, v);
}

+ 14
- 9
src/layer/vulkan/shader/layernorm_reduce_mean.comp View File

@@ -6,25 +6,30 @@
layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; };
layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; };

layout (push_constant) uniform parameter {
int w; int h; int c; int cstep;
layout (push_constant) uniform parameter
{
int w;
int h;
int c;
int cstep;
float group_size;
} p;

void main() {
// Each invocation calculates the mean for one group, identified by (gy, gz)
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);
if (gy >= p.h || gz >= p.c) return;
// Only the gx == 0 invocation of each in-range (gy, gz) pair continues; every
// other x invocation exits here before reading or writing any buffer.
if (gx >= 1 || gy >= p.h || gz >= p.c)
return;

float sum = 0.f;
// Base offset for the current group's row of data in the 3D sum_blob
int v_offset = gz * p.cstep + gy * p.w;
for (int i = 0; i < p.w; i++) {
for (int i = 0; i < p.w; i++)
{
sum += sum_blob_data[v_offset + i];
}

// Output is a linear buffer indexed by the flattened group_id
int group_id = gz * p.h + gy;
mean_data[group_id] = sum / p.group_size;
}
}

+ 53
- 34
src/layer/vulkan/shader/layernorm_reduce_sum4_fp16_to_fp32.comp View File

@@ -3,50 +3,69 @@

#version 450

// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
layout (binding = 0) readonly buffer input_blob { sfp input_data[]; };
layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };

layout (push_constant) uniform parameter {
int w; int h; int c; int cstep;
int outw; int outh; int outc; int outcstep;
layout (push_constant) uniform parameter
{
int w;
int h;
int c;
int cstep;

int outw;
int outh;
int outc;
int outcstep;
} p;

void main() {
// Global invocation IDs map to the output buffer dimensions
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return;

float sum;

// sx is the starting element index for reduction within a group
int sx = gx * 4;

// Correctly calculate the base offset for the group in the input tensor.
// gz * p.cstep -> Jumps to the start of the correct channel plane.
// gy * p.w -> Jumps to the start of the correct row (group) within that plane.
// The stride between rows is p.w (the width of a row).
int base_offset = gz * p.cstep + gy * p.w;
int v_offset = gz * p.cstep + gy * p.w + sx;

float sum;
int r_offset = base_offset + sx;

if (sx >= p.w - 3) {
if (sx >= p.w) {
sum = 0.0f;
} else if (sx == p.w - 1) {
sum = float(buffer_ld1(input_data, r_offset));
} else if (sx == p.w - 2) {
sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1));
} else { // sx == p.w - 3
sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2));
}
} else {
sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3));
// NOTE(review): this chain has no `sx >= p.w` case, whereas the fp32 variant of
// this shader assigns sum = 0.0f for it. If sx could ever reach p.w, the final
// four-element branch would load past the end of the current row; this
// presumably relies on p.outw never exceeding ceil(p.w / 4) — confirm against
// the host-side dispatch.
if (sx == p.w - 1)
{
float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));

sum = v0;
}
else if (sx == p.w - 2)
{
float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));

sum = v0 + v1;
}
else if (sx == p.w - 3)
{
float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));

sum = v0 + v1 + v2;
}
else
{
float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset));
float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1));
float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2));
float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3));

sum = v0 + v1 + v2 + v3;
}


// Output index is a packed 3D index
int out_index = gz * p.outcstep + gy * p.outw + gx;
output_data[out_index] = sum;
}
int gi = gz * p.outcstep + gy * p.outw + gx;
sum_blob_data[gi] = sum;
}

+ 52
- 34
src/layer/vulkan/shader/layernorm_reduce_sum4_fp32.comp View File

@@ -3,50 +3,68 @@

#version 450

// This shader correctly reduces a 3D dispatched problem over non-contiguous memory.
layout (binding = 0) readonly buffer input_blob { float input_data[]; };
layout (binding = 1) writeonly buffer output_blob { float output_data[]; };
layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; };
layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; };

layout (push_constant) uniform parameter {
int w; int h; int c; int cstep;
int outw; int outh; int outc; int outcstep;
layout (push_constant) uniform parameter
{
int w;
int h;
int c;
int cstep;

int outw;
int outh;
int outc;
int outcstep;
} p;

void main() {
// Global invocation IDs map to the output buffer dimensions
void main()
{
int gx = int(gl_GlobalInvocationID.x);
int gy = int(gl_GlobalInvocationID.y);
int gz = int(gl_GlobalInvocationID.z);

if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return;
if (gx >= p.outw || gy >= p.outh || gz >= p.outc)
return;

float sum;

// sx is the starting element index for reduction within a group
int sx = gx * 4;

// Correctly calculate the base offset for the group in the input tensor.
// gz * p.cstep -> Jumps to the start of the correct channel plane.
// gy * p.w -> Jumps to the start of the correct row (group) within that plane.
// The stride between rows is p.w (the width of a row).
int base_offset = gz * p.cstep + gy * p.w;
int v_offset = gz * p.cstep + gy * p.w + sx;

float sum;
int r_offset = base_offset + sx;

if (sx >= p.w - 3) {
if (sx >= p.w) {
sum = 0.0f;
} else if (sx == p.w - 1) {
sum = input_data[r_offset];
} else if (sx == p.w - 2) {
sum = input_data[r_offset] + input_data[r_offset + 1];
} else { // sx == p.w - 3
sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2];
}
} else {
sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3];
if (sx >= p.w)
{
sum = 0.0f;
}
else if (sx == p.w - 1)
{
float v0 = bottom_top_blob_data[v_offset];
sum = v0;
}
else if (sx == p.w - 2)
{
float v0 = bottom_top_blob_data[v_offset];
float v1 = bottom_top_blob_data[v_offset + 1];
sum = v0 + v1;
}
else if (sx == p.w - 3)
{
float v0 = bottom_top_blob_data[v_offset];
float v1 = bottom_top_blob_data[v_offset + 1];
float v2 = bottom_top_blob_data[v_offset + 2];
sum = v0 + v1 + v2;
}
else
{
float v0 = bottom_top_blob_data[v_offset];
float v1 = bottom_top_blob_data[v_offset + 1];
float v2 = bottom_top_blob_data[v_offset + 2];
float v3 = bottom_top_blob_data[v_offset + 3];
sum = v0 + v1 + v2 + v3;
}

// Output index is a packed 3D index
int out_index = gz * p.outcstep + gy * p.outw + gx;
output_data[out_index] = sum;
}
int gi = gz * p.outcstep + gy * p.outw + gx;
sum_blob_data[gi] = sum;
}

+ 2
- 3
src/layer/vulkan/shader/layernorm_sub_mean_square.comp View File

@@ -5,7 +5,7 @@

layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
layout (binding = 1) readonly buffer mean_blob { float mean_data[]; };
// The output buffer's type must match the input type if the framework reuses the memory type
layout (binding = 2) writeonly buffer square_blob { sfp square_blob_data[]; };

layout (push_constant) uniform parameter {
@@ -40,10 +40,9 @@ void main() {

int linear_index = gz * p.cstep + gy * p.w + gx;

// Perform calculation in fp32 for precision
float v = float(buffer_ld1(bottom_top_blob_data, linear_index));

v = v - mean;
// Write back in the native storage format (sfp)
buffer_st1(square_blob_data, linear_index, afp(v * v));
}

Loading…
Cancel
Save