| @@ -16,6 +16,7 @@ namespace ncnn { | |||
| // ================================================================================================= | |||
| static void print_vkmat(const VkMat& m, const char* name, VkCompute& cmd, const Option& opt) | |||
| { | |||
| return; | |||
| if (m.empty()) | |||
| { | |||
| printf("--- %s ---\n", name); | |||
| @@ -185,8 +186,12 @@ int LayerNorm_vulkan::upload_model(VkTransfer& cmd, const Option& opt) | |||
| return 0; | |||
| } | |||
| int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, const Option& opt) const | |||
| int LayerNorm_vulkan::forward_inplace(VkMat& _bottom_top_blob, VkCompute& cmd, const Option& opt) const | |||
| { | |||
| int elemsize_bak = _bottom_top_blob.elemsize; | |||
| VkMat bottom_top_blob; | |||
| vkdev->convert_packing(_bottom_top_blob, bottom_top_blob, 1,cmd, opt); | |||
| int w = bottom_top_blob.w; | |||
| int h = bottom_top_blob.h; | |||
| int channels = bottom_top_blob.c; | |||
| @@ -429,10 +434,9 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co | |||
| coeff_bindings[1] = mean_workspace; | |||
| coeff_bindings[2] = var_workspace; | |||
| std::vector<vk_constant_type> coeff_constants(3); | |||
| coeff_constants[0].i = 1; | |||
| coeff_constants[1].i = num_groups_per_channel; | |||
| coeff_constants[2].i = channels; | |||
| std::vector<vk_constant_type> coeff_constants(2); | |||
| coeff_constants[0].i = num_groups_per_channel; | |||
| coeff_constants[1].i = channels; | |||
| VkMat dispatcher_coeffs; | |||
| dispatcher_coeffs.w = 1; | |||
| @@ -462,6 +466,8 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co | |||
| print_vkmat(bottom_top_blob, "===> FINAL OUTPUT of LayerNorm <===", cmd, opt); | |||
| // =============================================== | |||
| vkdev->convert_packing(bottom_top_blob, _bottom_top_blob, elemsize_bak, cmd, opt); | |||
| return 0; | |||
| } | |||
| @@ -9,16 +9,19 @@ layout (binding = 0) writeonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer mean_blob { float mean_data[]; }; | |||
| layout (binding = 2) readonly buffer var_blob { float var_data[]; }; | |||
| layout (push_constant) uniform parameter { | |||
| int w_dummy; // w is not used, corresponds to gl_GlobalInvocationID.x | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int num_groups_per_channel; | |||
| int num_channels; | |||
| int c; | |||
| } p; | |||
| void main() { | |||
| void main() | |||
| { | |||
| int gx = int(gl_GlobalInvocationID.x); | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| if (gy >= p.num_groups_per_channel || gz >= p.num_channels) return; | |||
| if (gx >= 1 ||gy >= p.num_groups_per_channel || gz >= p.c) | |||
| return; | |||
| int group_id = gz * p.num_groups_per_channel + gy; | |||
| @@ -32,4 +35,4 @@ void main() { | |||
| buffer_st1(coeffs_blob_data, group_id * 2, afp(a)); | |||
| buffer_st1(coeffs_blob_data, group_id * 2 + 1, afp(b)); | |||
| } | |||
| } | |||
| @@ -3,12 +3,10 @@ | |||
| #version 450 | |||
| // Specialization constant: when set to 1 the shader also applies the gamma/beta affine transform after normalization | | |
| layout (constant_id = 0) const int affine = 0; | |||
| layout (binding = 0) buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer coeffs_blob { sfp coeffs_blob_data[]; }; | |||
| // Separate bindings for gamma and beta if affine is enabled | |||
| layout (binding = 2) readonly buffer gamma_blob { sfp gamma_data[]; }; | |||
| layout (binding = 3) readonly buffer beta_blob { sfp beta_data[]; }; | |||
| @@ -27,47 +25,38 @@ void main() | |||
| int gy = int(gl_GlobalInvocationID.y); | |||
| int gz = int(gl_GlobalInvocationID.z); | |||
| // Boundary check against the original tensor dimensions | |||
| if (gx >= p.w || gy >= p.h || gz >= p.c) | |||
| return; | |||
| return; | |||
| int group_id; | |||
| int inner_id; | |||
| // Determine the group ID and the element's ID within that group. | |||
| // This logic correctly maps each invocation to its normalization group. | |||
| if (p.affine_size == p.w) | |||
| { | |||
| // Normalization is performed per row | |||
| group_id = gz * p.h + gy; | |||
| inner_id = gx; | |||
| } | |||
| else // if (p.affine_size == p.w * p.h) | |||
| { | |||
| // Normalization is performed per channel | |||
| group_id = gz; | |||
| inner_id = gy * p.w + gx; | |||
| } | |||
| // Fetch the pre-calculated normalization coefficients a and b. | |||
| // There is one (a, b) pair per group. | |||
| afp a = buffer_ld1(coeffs_blob_data, group_id * 2); | |||
| afp b = buffer_ld1(coeffs_blob_data, group_id * 2 + 1); | |||
| // Calculate the correct linear index for the element, respecting cstep. | |||
| int linear_index = gz * p.cstep + gy * p.w + gx; | |||
| afp v = buffer_ld1(bottom_top_blob_data, linear_index); | |||
| // Apply the base normalization: (x - mean) / sqrt(var + eps) | |||
| // (x - mean) / sqrt(var + eps) | |||
| v = v * a + b; | |||
| // Apply the learned affine transformation if enabled | |||
| if (affine == 1) { | |||
| if (affine == 1) | |||
| { | |||
| afp gamma = buffer_ld1(gamma_data, inner_id); | |||
| afp beta = buffer_ld1(beta_data, inner_id); | |||
| v = v * gamma + beta; | |||
| } | |||
| // Write the final result back to the same location | |||
| buffer_st1(bottom_top_blob_data, linear_index, v); | |||
| } | |||
| @@ -6,25 +6,30 @@ | |||
| layout (binding = 0) readonly buffer sum_blob { float sum_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer mean_blob { float mean_data[]; }; | |||
| layout (push_constant) uniform parameter { | |||
| int w; int h; int c; int cstep; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| float group_size; | |||
| } p; | |||
| void main() { | | |
| // Computes the mean of one group (row gy of channel gz): the p.w entries of that row in sum_blob are added up and divided by p.group_size | | |
| void main() | | |
| { | | |
| int gx = int(gl_GlobalInvocationID.x); | | |
| int gy = int(gl_GlobalInvocationID.y); | | |
| int gz = int(gl_GlobalInvocationID.z); | | |
| if (gy >= p.h || gz >= p.c) return; | | |
| if (gx >= 1 || gy >= p.h || gz >= p.c) // invocations with gx >= 1 exit here, leaving a single worker per (gy, gz) | | |
| return; | | |
| float sum = 0.f; | | |
| // Start of row gy inside channel gz of sum_blob: channels are p.cstep elements apart, rows are p.w elements apart | | |
| int v_offset = gz * p.cstep + gy * p.w; | | |
| for (int i = 0; i < p.w; i++) { | | |
| for (int i = 0; i < p.w; i++) | | |
| { | | |
| sum += sum_blob_data[v_offset + i]; | | |
| } | | |
| // mean_data holds one float per group, indexed by the flattened id gz * p.h + gy | | |
| int group_id = gz * p.h + gy; | | |
| mean_data[group_id] = sum / p.group_size; | | |
| } | | |
| } | | |
| @@ -3,50 +3,69 @@ | |||
| #version 450 | |||
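| // Partial-sum reduction pass: each invocation adds up to four consecutive elements of one input row (rows are p.w elements apart, channels p.cstep elements apart) and writes a single float. | | |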
| layout (binding = 0) readonly buffer input_blob { sfp input_data[]; }; | |||
| layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; | |||
| layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; | |||
| layout (push_constant) uniform parameter { | |||
| int w; int h; int c; int cstep; | |||
| int outw; int outh; int outc; int outcstep; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() { | | |
| // (gx, gy, gz) addresses one element of the output; invocations outside outw x outh x outc exit early | | |
| void main() | | |
| { | | |
| int gx = int(gl_GlobalInvocationID.x); | | |
| int gy = int(gl_GlobalInvocationID.y); | | |
| int gz = int(gl_GlobalInvocationID.z); | | |
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; | | |
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | | |
| return; | | |
| float sum; | | |
| // sx: position within the row of the first of the (up to four) consecutive elements summed by this invocation | | |
| int sx = gx * 4; | | |
| // Base offset of the row this invocation reads from in the input tensor. | | |
| // gz * p.cstep -> start of channel gz (channels are p.cstep elements apart). | | |
| // gy * p.w -> start of row gy within that channel. | | |
| // Consecutive rows are p.w elements apart. | | |
| int base_offset = gz * p.cstep + gy * p.w; | | |
| int v_offset = gz * p.cstep + gy * p.w + sx; | | |
| float sum; | | |
| int r_offset = base_offset + sx; | | |
| if (sx >= p.w - 3) { | | |
| if (sx >= p.w) { | | |
| sum = 0.0f; | | |
| } else if (sx == p.w - 1) { | | |
| sum = float(buffer_ld1(input_data, r_offset)); | | |
| } else if (sx == p.w - 2) { | | |
| sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)); | | |
| } else { // sx == p.w - 3 | | |
| sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)); | | |
| } | | |
| } else { | | |
| sum = float(buffer_ld1(input_data, r_offset)) + float(buffer_ld1(input_data, r_offset + 1)) + float(buffer_ld1(input_data, r_offset + 2)) + float(buffer_ld1(input_data, r_offset + 3)); | | |
| if (sx == p.w - 1) | | |
| { | | |
| float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); | | |
| sum = v0; | | |
| } | | |
| else if (sx == p.w - 2) | | |
| { | | |
| float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); | | |
| float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); | | |
| sum = v0 + v1; | | |
| } | | |
| else if (sx == p.w - 3) | | |
| { | | |
| float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); | | |
| float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); | | |
| float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); | | |
| sum = v0 + v1 + v2; | | |
| } | | |
| else | | |
| { | | |
| float v0 = float(buffer_ld1(bottom_top_blob_data, v_offset)); | | |
| float v1 = float(buffer_ld1(bottom_top_blob_data, v_offset + 1)); | | |
| float v2 = float(buffer_ld1(bottom_top_blob_data, v_offset + 2)); | | |
| float v3 = float(buffer_ld1(bottom_top_blob_data, v_offset + 3)); | | |
| sum = v0 + v1 + v2 + v3; | | |
| } | | |
| // One partial sum per invocation, stored at gz * p.outcstep + gy * p.outw + gx | | |
| int out_index = gz * p.outcstep + gy * p.outw + gx; | | |
| output_data[out_index] = sum; | | |
| } | | |
| int gi = gz * p.outcstep + gy * p.outw + gx; | | |
| sum_blob_data[gi] = sum; | | |
| } | | |
| @@ -3,50 +3,68 @@ | |||
| #version 450 | |||
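| // Partial-sum reduction pass over fp32 data: each invocation adds up to four consecutive elements of one input row (rows are p.w elements apart, channels p.cstep elements apart) and writes a single float. | | |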
| layout (binding = 0) readonly buffer input_blob { float input_data[]; }; | |||
| layout (binding = 1) writeonly buffer output_blob { float output_data[]; }; | |||
| layout (binding = 0) readonly buffer bottom_top_blob { float bottom_top_blob_data[]; }; | |||
| layout (binding = 1) writeonly buffer sum_blob { float sum_blob_data[]; }; | |||
| layout (push_constant) uniform parameter { | |||
| int w; int h; int c; int cstep; | |||
| int outw; int outh; int outc; int outcstep; | |||
| layout (push_constant) uniform parameter | |||
| { | |||
| int w; | |||
| int h; | |||
| int c; | |||
| int cstep; | |||
| int outw; | |||
| int outh; | |||
| int outc; | |||
| int outcstep; | |||
| } p; | |||
| void main() { | | |
| // (gx, gy, gz) addresses one element of the output; invocations outside outw x outh x outc exit early | | |
| void main() | | |
| { | | |
| int gx = int(gl_GlobalInvocationID.x); | | |
| int gy = int(gl_GlobalInvocationID.y); | | |
| int gz = int(gl_GlobalInvocationID.z); | | |
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) return; | | |
| if (gx >= p.outw || gy >= p.outh || gz >= p.outc) | | |
| return; | | |
| float sum; | | |
| // sx: position within the row of the first of the (up to four) consecutive elements summed by this invocation | | |
| int sx = gx * 4; | | |
| // Base offset of the row this invocation reads from in the input tensor. | | |
| // gz * p.cstep -> start of channel gz (channels are p.cstep elements apart). | | |
| // gy * p.w -> start of row gy within that channel. | | |
| // Consecutive rows are p.w elements apart. | | |
| int base_offset = gz * p.cstep + gy * p.w; | | |
| int v_offset = gz * p.cstep + gy * p.w + sx; | | |
| float sum; | | |
| int r_offset = base_offset + sx; | | |
| if (sx >= p.w - 3) { | | |
| if (sx >= p.w) { | | |
| sum = 0.0f; | | |
| } else if (sx == p.w - 1) { | | |
| sum = input_data[r_offset]; | | |
| } else if (sx == p.w - 2) { | | |
| sum = input_data[r_offset] + input_data[r_offset + 1]; | | |
| } else { // sx == p.w - 3 | | |
| sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2]; | | |
| } | | |
| } else { | | |
| sum = input_data[r_offset] + input_data[r_offset + 1] + input_data[r_offset + 2] + input_data[r_offset + 3]; | | |
| if (sx >= p.w) | | |
| { | | |
| sum = 0.0f; | | |
| } | | |
| else if (sx == p.w - 1) | | |
| { | | |
| float v0 = bottom_top_blob_data[v_offset]; | | |
| sum = v0; | | |
| } | | |
| else if (sx == p.w - 2) | | |
| { | | |
| float v0 = bottom_top_blob_data[v_offset]; | | |
| float v1 = bottom_top_blob_data[v_offset + 1]; | | |
| sum = v0 + v1; | | |
| } | | |
| else if (sx == p.w - 3) | | |
| { | | |
| float v0 = bottom_top_blob_data[v_offset]; | | |
| float v1 = bottom_top_blob_data[v_offset + 1]; | | |
| float v2 = bottom_top_blob_data[v_offset + 2]; | | |
| sum = v0 + v1 + v2; | | |
| } | | |
| else | | |
| { | | |
| float v0 = bottom_top_blob_data[v_offset]; | | |
| float v1 = bottom_top_blob_data[v_offset + 1]; | | |
| float v2 = bottom_top_blob_data[v_offset + 2]; | | |
| float v3 = bottom_top_blob_data[v_offset + 3]; | | |
| sum = v0 + v1 + v2 + v3; | | |
| } | | |
| // One partial sum per invocation, stored at gz * p.outcstep + gy * p.outw + gx | | |
| int out_index = gz * p.outcstep + gy * p.outw + gx; | | |
| output_data[out_index] = sum; | | |
| } | | |
| int gi = gz * p.outcstep + gy * p.outw + gx; | | |
| sum_blob_data[gi] = sum; | | |
| } | | |
| @@ -5,7 +5,7 @@ | |||
| layout (binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; | |||
| layout (binding = 1) readonly buffer mean_blob { float mean_data[]; }; | |||
| // The output buffer's type must match the input type if the framework reuses the memory type | |||
| layout (binding = 2) writeonly buffer square_blob { sfp square_blob_data[]; }; | |||
| layout (push_constant) uniform parameter { | |||
| @@ -40,10 +40,9 @@ void main() { | |||
| int linear_index = gz * p.cstep + gy * p.w + gx; | |||
| // Perform calculation in fp32 for precision | |||
| float v = float(buffer_ld1(bottom_top_blob_data, linear_index)); | |||
| v = v - mean; | |||
| // Write back in the native storage format (sfp) | |||
| buffer_st1(square_blob_data, linear_index, afp(v * v)); | |||
| } | |||