diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp
index 3e2f0a167900..c9a5307818c2 100644
--- a/src/layer/vulkan/layernorm_vulkan.cpp
+++ b/src/layer/vulkan/layernorm_vulkan.cpp
@@ -28,10 +28,27 @@ LayerNorm_vulkan::LayerNorm_vulkan()
     pipeline_layernorm_sub_mean_square_pack4 = 0;
     pipeline_layernorm_coeffs_pack4 = 0;
     pipeline_layernorm_norm_pack4 = 0;
+
+    // subgroup
+    pipeline_layernorm_reduce_subgroup = 0;
+    pipeline_layernorm_reduce_subgroup_pack4 = 0;
 }
 
 int LayerNorm_vulkan::create_pipeline(const Option& opt)
 {
+    const bool use_subgroup = opt.use_subgroup_ops && (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT);
+
+    if (use_subgroup)
+    {
+        pipeline_layernorm_reduce_subgroup = new Pipeline(vkdev);
+        pipeline_layernorm_reduce_subgroup->set_local_size_xyz(256, 1, 1);
+        pipeline_layernorm_reduce_subgroup->create(LayerShaderType::layernorm_reduce_subgroup, opt, std::vector<vk_specialization_type>());
+
+        pipeline_layernorm_reduce_subgroup_pack4 = new Pipeline(vkdev);
+        pipeline_layernorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1);
+        pipeline_layernorm_reduce_subgroup_pack4->create(LayerShaderType::layernorm_reduce_subgroup_pack4, opt, std::vector<vk_specialization_type>());
+    }
+    else
     {
         pipeline_layernorm_reduce_sum4_fp16_to_fp32 = new Pipeline(vkdev);
         pipeline_layernorm_reduce_sum4_fp16_to_fp32->set_optimal_local_size_xyz(16, 4, 1);
@@ -54,9 +71,7 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt)
         pipeline_layernorm_reduce_sum4_fp32_pack4[1] = new Pipeline(vkdev);
         pipeline_layernorm_reduce_sum4_fp32_pack4[1]->set_optimal_local_size_xyz(8, 8, 1);
         pipeline_layernorm_reduce_sum4_fp32_pack4[1]->create(LayerShaderType::layernorm_reduce_sum4_fp32_pack4, opt, std::vector<vk_specialization_type>());
-    }
 
-    {
         pipeline_layernorm_reduce_mean = new Pipeline(vkdev);
         pipeline_layernorm_reduce_mean->set_optimal_local_size_xyz(1, 8, 8);
         pipeline_layernorm_reduce_mean->create(LayerShaderType::layernorm_reduce_mean, opt, std::vector<vk_specialization_type>());
@@ -64,19 +79,19 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt)
         pipeline_layernorm_reduce_mean_pack4 = new Pipeline(vkdev);
         pipeline_layernorm_reduce_mean_pack4->set_optimal_local_size_xyz(1, 8, 8);
         pipeline_layernorm_reduce_mean_pack4->create(LayerShaderType::layernorm_reduce_mean_pack4, opt, std::vector<vk_specialization_type>());
-    }
 
-    {
-        std::vector<vk_specialization_type> specializations(1);
-        specializations[0].i = affine_size;
+        {
+            std::vector<vk_specialization_type> specializations(1);
+            specializations[0].i = affine_size;
 
-        pipeline_layernorm_sub_mean_square = new Pipeline(vkdev);
-        pipeline_layernorm_sub_mean_square->set_optimal_local_size_xyz(8, 8, 1);
-        pipeline_layernorm_sub_mean_square->create(LayerShaderType::layernorm_sub_mean_square, opt, specializations);
+            pipeline_layernorm_sub_mean_square = new Pipeline(vkdev);
+            pipeline_layernorm_sub_mean_square->set_optimal_local_size_xyz(8, 8, 1);
+            pipeline_layernorm_sub_mean_square->create(LayerShaderType::layernorm_sub_mean_square, opt, specializations);
 
-        pipeline_layernorm_sub_mean_square_pack4 = new Pipeline(vkdev);
-        pipeline_layernorm_sub_mean_square_pack4->set_optimal_local_size_xyz(8, 8, 1);
-        pipeline_layernorm_sub_mean_square_pack4->create(LayerShaderType::layernorm_sub_mean_square_pack4, opt, specializations);
+            pipeline_layernorm_sub_mean_square_pack4 = new Pipeline(vkdev);
+            pipeline_layernorm_sub_mean_square_pack4->set_optimal_local_size_xyz(8, 8, 1);
+            pipeline_layernorm_sub_mean_square_pack4->create(LayerShaderType::layernorm_sub_mean_square_pack4, opt, specializations);
+        }
     }
 
     {
@@ -143,6 +158,12 @@ int LayerNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
     pipeline_layernorm_coeffs_pack4 = 0;
     pipeline_layernorm_norm_pack4 = 0;
 
+    // subgroup
+    delete pipeline_layernorm_reduce_subgroup;
+    delete pipeline_layernorm_reduce_subgroup_pack4;
+    pipeline_layernorm_reduce_subgroup = 0;
+    pipeline_layernorm_reduce_subgroup_pack4 = 0;
+
     return 0;
 }
 
@@ -207,170 +228,194 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
 
     int num_groups_total = num_groups_per_channel * channels;
     VkMat mean_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator);
-    {
-        int reduced_w = (group_size + 3) / 4;
-        VkMat sum_workspace;
-        sum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
+    VkMat var_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-        std::vector<VkMat> bindings(2);
+    const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_layernorm_reduce_subgroup_pack4 : pipeline_layernorm_reduce_subgroup;
+    if (pipeline_reduce_subgroup)
+    {
+        std::vector<VkMat> bindings(3);
         bindings[0] = bottom_top_blob;
-        bindings[1] = sum_workspace;
+        bindings[1] = mean_workspace;
+        bindings[2] = var_workspace;
 
-        std::vector<vk_constant_type> constants(8);
+        std::vector<vk_constant_type> constants(3);
         constants[0].i = group_size;
         constants[1].i = num_groups_per_channel;
-        constants[2].i = channels;
-        constants[3].i = cstep;
-        constants[4].i = reduced_w;
-        constants[5].i = num_groups_per_channel;
-        constants[6].i = channels;
-        constants[7].i = sum_workspace.cstep;
+        constants[2].i = (int)cstep;
 
         VkMat dispatcher;
-        dispatcher.w = reduced_w;
-        dispatcher.h = num_groups_per_channel;
-        dispatcher.c = channels;
-
-        const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32;
-
-        cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher);
-
-        int pb = 1;
-        while (sum_workspace.w > 1)
-        {
-            int current_w = sum_workspace.w;
-            reduced_w = (current_w + 3) / 4;
-            VkMat sum_workspace_reduced;
-            sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
-
-            std::vector<VkMat> bindings_iter(2);
-            bindings_iter[0] = sum_workspace;
-            bindings_iter[1] = sum_workspace_reduced;
-
-            std::vector<vk_constant_type> constants_iter(8);
-            constants_iter[0].i = current_w;
-            constants_iter[1].i = num_groups_per_channel;
-            constants_iter[2].i = channels;
-            constants_iter[3].i = sum_workspace.cstep;
-            constants_iter[4].i = reduced_w;
-            constants_iter[5].i = num_groups_per_channel;
-            constants_iter[6].i = channels;
-            constants_iter[7].i = sum_workspace_reduced.cstep;
-
-            dispatcher.w = reduced_w;
-
-            const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2];
-            cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher);
-            pb++;
-            sum_workspace = sum_workspace_reduced;
-        }
-
-        std::vector<VkMat> mean_bindings(2);
-        mean_bindings[0] = sum_workspace;
-        mean_bindings[1] = mean_workspace;
-
-        std::vector<vk_constant_type> mean_constants(5);
-        mean_constants[0].i = sum_workspace.w;
-        mean_constants[1].i = num_groups_per_channel;
-        mean_constants[2].i = channels;
-        mean_constants[3].i = sum_workspace.cstep;
-        mean_constants[4].f = (float)group_size;
-
         dispatcher.w = 1;
-        const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean;
-        cmd.record_pipeline(pipeline_reduce_mean, mean_bindings, mean_constants, dispatcher);
-    }
+        dispatcher.h = num_groups_total;
+        dispatcher.c = 1;
 
-    VkMat var_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator);
+        cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher);
+    }
+    else
     {
-        VkMat square_workspace(w, h, channels, elemsize, elempack, opt.workspace_vkallocator);
         {
-            std::vector<VkMat> sq_bindings(3);
-            sq_bindings[0] = bottom_top_blob;
-            sq_bindings[1] = mean_workspace;
-            sq_bindings[2] = square_workspace;
-
-            std::vector<vk_constant_type> sq_constants(4);
-            sq_constants[0].i = w;
-            sq_constants[1].i = h;
-            sq_constants[2].i = channels;
-            sq_constants[3].i = cstep;
-
-            const Pipeline* pipeline_sub_mean_square = elempack == 4 ? pipeline_layernorm_sub_mean_square_pack4 : pipeline_layernorm_sub_mean_square;
-            cmd.record_pipeline(pipeline_sub_mean_square, sq_bindings, sq_constants, square_workspace);
+            int reduced_w = (group_size + 3) / 4;
+            VkMat sum_workspace;
+            sum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+            std::vector<VkMat> bindings(2);
+            bindings[0] = bottom_top_blob;
+            bindings[1] = sum_workspace;
+
+            std::vector<vk_constant_type> constants(8);
+            constants[0].i = group_size;
+            constants[1].i = num_groups_per_channel;
+            constants[2].i = channels;
+            constants[3].i = cstep;
+            constants[4].i = reduced_w;
+            constants[5].i = num_groups_per_channel;
+            constants[6].i = channels;
+            constants[7].i = sum_workspace.cstep;
+
+            VkMat dispatcher;
+            dispatcher.w = reduced_w;
+            dispatcher.h = num_groups_per_channel;
+            dispatcher.c = channels;
+
+            const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32;
+
+            cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher);
+
+            int pb = 1;
+            while (sum_workspace.w > 1)
+            {
+                int current_w = sum_workspace.w;
+                reduced_w = (current_w + 3) / 4;
+                VkMat sum_workspace_reduced;
+                sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+                std::vector<VkMat> bindings_iter(2);
+                bindings_iter[0] = sum_workspace;
+                bindings_iter[1] = sum_workspace_reduced;
+
+                std::vector<vk_constant_type> constants_iter(8);
+                constants_iter[0].i = current_w;
+                constants_iter[1].i = num_groups_per_channel;
+                constants_iter[2].i = channels;
+                constants_iter[3].i = sum_workspace.cstep;
+                constants_iter[4].i = reduced_w;
+                constants_iter[5].i = num_groups_per_channel;
+                constants_iter[6].i = channels;
+                constants_iter[7].i = sum_workspace_reduced.cstep;
+
+                dispatcher.w = reduced_w;
+
+                const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2];
+                cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher);
+                pb++;
+                sum_workspace = sum_workspace_reduced;
+            }
+
+            std::vector<VkMat> mean_bindings(2);
+            mean_bindings[0] = sum_workspace;
+            mean_bindings[1] = mean_workspace;
+
+            std::vector<vk_constant_type> mean_constants(5);
+            mean_constants[0].i = sum_workspace.w;
+            mean_constants[1].i = num_groups_per_channel;
+            mean_constants[2].i = channels;
+            mean_constants[3].i = sum_workspace.cstep;
+            mean_constants[4].f = (float)group_size;
+
+            dispatcher.w = 1;
+            const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean;
+            cmd.record_pipeline(pipeline_reduce_mean, mean_bindings, mean_constants, dispatcher);
         }
 
-        // Reduce sum of squares
-        int reduced_w = (group_size + 3) / 4;
-        VkMat sqsum_workspace;
-        sqsum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
-
-        std::vector<VkMat> bindings(2);
-        bindings[0] = square_workspace;
-        bindings[1] = sqsum_workspace;
-
-        std::vector<vk_constant_type> constants(8);
-        constants[0].i = group_size;
-        constants[1].i = num_groups_per_channel;
-        constants[2].i = channels;
-        constants[3].i = square_workspace.cstep;
-        constants[4].i = reduced_w;
-        constants[5].i = num_groups_per_channel;
-        constants[6].i = channels;
-        constants[7].i = sqsum_workspace.cstep;
-
-        VkMat dispatcher;
-        dispatcher.w = reduced_w;
-        dispatcher.h = num_groups_per_channel;
-        dispatcher.c = channels;
-
-        const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32;
-
-        cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher);
-
-        int pb = 1;
-        while (sqsum_workspace.w > 1)
         {
-            int current_w = sqsum_workspace.w;
-            reduced_w = (current_w + 3) / 4;
-            VkMat sum_workspace_reduced;
-            sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
-
-            std::vector<VkMat> bindings_iter(2);
-            bindings_iter[0] = sqsum_workspace;
-            bindings_iter[1] = sum_workspace_reduced;
-            std::vector<vk_constant_type> constants_iter(8);
-            constants_iter[0].i = current_w;
-            constants_iter[1].i = num_groups_per_channel;
-            constants_iter[2].i = channels;
-            constants_iter[3].i = sqsum_workspace.cstep;
-            constants_iter[4].i = reduced_w;
-            constants_iter[5].i = num_groups_per_channel;
-            constants_iter[6].i = channels;
-            constants_iter[7].i = sum_workspace_reduced.cstep;
-
+            VkMat square_workspace(w, h, channels, elemsize, elempack, opt.workspace_vkallocator);
+            {
+                std::vector<VkMat> sq_bindings(3);
+                sq_bindings[0] = bottom_top_blob;
+                sq_bindings[1] = mean_workspace;
+                sq_bindings[2] = square_workspace;
+
+                std::vector<vk_constant_type> sq_constants(4);
+                sq_constants[0].i = w;
+                sq_constants[1].i = h;
+                sq_constants[2].i = channels;
+                sq_constants[3].i = cstep;
+
+                const Pipeline* pipeline_sub_mean_square = elempack == 4 ? pipeline_layernorm_sub_mean_square_pack4 : pipeline_layernorm_sub_mean_square;
+                cmd.record_pipeline(pipeline_sub_mean_square, sq_bindings, sq_constants, square_workspace);
+            }
+
+            // Reduce sum of squares
+            int reduced_w = (group_size + 3) / 4;
+            VkMat sqsum_workspace;
+            sqsum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+            std::vector<VkMat> bindings(2);
+            bindings[0] = square_workspace;
+            bindings[1] = sqsum_workspace;
+
+            std::vector<vk_constant_type> constants(8);
+            constants[0].i = group_size;
+            constants[1].i = num_groups_per_channel;
+            constants[2].i = channels;
+            constants[3].i = square_workspace.cstep;
+            constants[4].i = reduced_w;
+            constants[5].i = num_groups_per_channel;
+            constants[6].i = channels;
+            constants[7].i = sqsum_workspace.cstep;
+
+            VkMat dispatcher;
             dispatcher.w = reduced_w;
-
-            const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2];
-            cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher);
-            pb++;
-            sqsum_workspace = sum_workspace_reduced;
+            dispatcher.h = num_groups_per_channel;
+            dispatcher.c = channels;
+
+            const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32;
+
+            cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher);
+
+            int pb = 1;
+            while (sqsum_workspace.w > 1)
+            {
+                int current_w = sqsum_workspace.w;
+                reduced_w = (current_w + 3) / 4;
+                VkMat sum_workspace_reduced;
+                sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+                std::vector<VkMat> bindings_iter(2);
+                bindings_iter[0] = sqsum_workspace;
+                bindings_iter[1] = sum_workspace_reduced;
+                std::vector<vk_constant_type> constants_iter(8);
+                constants_iter[0].i = current_w;
+                constants_iter[1].i = num_groups_per_channel;
+                constants_iter[2].i = channels;
+                constants_iter[3].i = sqsum_workspace.cstep;
+                constants_iter[4].i = reduced_w;
+                constants_iter[5].i = num_groups_per_channel;
+                constants_iter[6].i = channels;
+                constants_iter[7].i = sum_workspace_reduced.cstep;
+
+                dispatcher.w = reduced_w;
+
+                const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2];
+                cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher);
+                pb++;
+                sqsum_workspace = sum_workspace_reduced;
+            }
+
+            std::vector<VkMat> var_bindings(2);
+            var_bindings[0] = sqsum_workspace;
+            var_bindings[1] = var_workspace;
+            std::vector<vk_constant_type> var_constants(5);
+            var_constants[0].i = sqsum_workspace.w;
+            var_constants[1].i = num_groups_per_channel;
+            var_constants[2].i = channels;
+            var_constants[3].i = sqsum_workspace.cstep;
+            var_constants[4].f = (float)group_size;
+
+            dispatcher.w = 1;
+
+            const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean;
+            cmd.record_pipeline(pipeline_reduce_mean, var_bindings, var_constants, dispatcher);
         }
-
-        std::vector<VkMat> var_bindings(2);
-        var_bindings[0] = sqsum_workspace;
-        var_bindings[1] = var_workspace;
-        std::vector<vk_constant_type> var_constants(5);
-        var_constants[0].i = sqsum_workspace.w;
-        var_constants[1].i = num_groups_per_channel;
-        var_constants[2].i = channels;
-        var_constants[3].i = sqsum_workspace.cstep;
-        var_constants[4].f = (float)group_size;
-
-        dispatcher.w = 1;
-
-        const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean;
-        cmd.record_pipeline(pipeline_reduce_mean, var_bindings, var_constants, dispatcher);
     }
 
     // coeffs a and b
---
diff --git a/src/layer/vulkan/layernorm_vulkan.h b/src/layer/vulkan/layernorm_vulkan.h
index 7e662a72729c..a48ce8c9b0fc 100644
--- a/src/layer/vulkan/layernorm_vulkan.h
+++ b/src/layer/vulkan/layernorm_vulkan.h
@@ -42,6 +42,10 @@ class LayerNorm_vulkan : public LayerNorm
     Pipeline* pipeline_layernorm_sub_mean_square_pack4;
     Pipeline* pipeline_layernorm_coeffs_pack4;
     Pipeline* pipeline_layernorm_norm_pack4;
+
+    // subgroup pipelines
+    Pipeline* pipeline_layernorm_reduce_subgroup;
+    Pipeline* pipeline_layernorm_reduce_subgroup_pack4;
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp
new file mode 100644
index 000000000000..5d7e54eda0f4
--- /dev/null
+++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp
@@ -0,0 +1,201 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_subgroup_extended_types_float16 : require
+#endif
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_size;
+    int num_groups_per_channel;
+    int cstep;
+} p;
+
+// Scratch for the cross-subgroup reduction. Every multi-stage branch below
+// writes a stage's results to a region disjoint from the one it reads, so a
+// fast subgroup cannot overwrite a partial another subgroup has yet to load.
+shared float sdata_sum[256];
+shared float sdata_sqsum[256];
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const int gz = group_id / p.num_groups_per_channel;
+    const int gy = group_id % p.num_groups_per_channel;
+    const int offset = gz * p.cstep + gy * p.group_size;
+
+    float sum = 0.f;
+    float sqsum = 0.f;
+
+    for (int t = tid; t < p.group_size; t += 256)
+    {
+        float v = float(buffer_ld1(bottom_top_blob_data, offset + t));
+        sum += v;
+        sqsum += v * v;
+    }
+
+    float sg_sum = subgroupAdd(sum);
+    float sg_sqsum = subgroupAdd(sqsum);
+
+    if (subgroupElect())
+    {
+        sdata_sum[int(gl_SubgroupID)] = float(sg_sum);
+        sdata_sqsum[int(gl_SubgroupID)] = float(sg_sqsum);
+    }
+
+    barrier();
+
+    // The single-pass variance E[x^2] - mean^2 can round slightly below zero,
+    // so every branch below clamps it to keep the stored value non-negative.
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        float v_sum = lane < num_sg ? sdata_sum[lane] : 0.f;
+        float v_sqsum = lane < num_sg ? sdata_sqsum[lane] : 0.f;
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            float mean = r_sum / float(p.group_size);
+            float var = max(r_sqsum / float(p.group_size) - mean * mean, 0.f);
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    // stage 1: 32 partials in [0, 32) -> 4 partials in [32, 36)
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8;
+
+        float v_sum = sdata_sum[base + lane];
+        float v_sqsum = sdata_sqsum[base + lane];
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[32 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[32 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v_sum = lane < 4 ? sdata_sum[32 + lane] : 0.f;
+        float v_sqsum = lane < 4 ? sdata_sqsum[32 + lane] : 0.f;
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            float mean = r_sum / float(p.group_size);
+            float var = max(r_sqsum / float(p.group_size) - mean * mean, 0.f);
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    // stage 1: 64 partials in [0, 64) -> 16 partials in [64, 80)
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        float v_sum = sdata_sum[base + lane];
+        float v_sqsum = sdata_sqsum[base + lane];
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[64 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[64 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    // stage 2: 16 partials in [64, 80) -> 4 partials in [80, 84)
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = 64 + int(gl_SubgroupID) * 4;
+
+        float v_sum = sdata_sum[base + lane];
+        float v_sqsum = sdata_sqsum[base + lane];
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[80 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[80 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v_sum = sdata_sum[80 + lane];
+        float v_sqsum = sdata_sqsum[80 + lane];
+        float r_sum = subgroupAdd(v_sum);
+        float r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            float mean = r_sum / float(p.group_size);
+            float var = max(r_sqsum / float(p.group_size) - mean * mean, 0.f);
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize;
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride];
+                sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            float r_sum = sdata_sum[0];
+            float r_sqsum = sdata_sqsum[0];
+            float mean = r_sum / float(p.group_size);
+            float var = max(r_sqsum / float(p.group_size) - mean * mean, 0.f);
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#endif
+}
diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp
new file mode 100644
index 000000000000..f92cbcab641e
--- /dev/null
+++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp
@@ -0,0 +1,201 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_subgroup_extended_types_float16 : require
+#endif
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { vec4 mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { vec4 var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_size;
+    int num_groups_per_channel;
+    int cstep;
+} p;
+
+// Scratch for the cross-subgroup reduction. Every multi-stage branch below
+// writes a stage's results to a region disjoint from the one it reads, so a
+// fast subgroup cannot overwrite a partial another subgroup has yet to load.
+shared vec4 sdata_sum[256];
+shared vec4 sdata_sqsum[256];
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const int gz = group_id / p.num_groups_per_channel;
+    const int gy = group_id % p.num_groups_per_channel;
+    const int offset = gz * p.cstep + gy * p.group_size;
+
+    vec4 sum = vec4(0.f);
+    vec4 sqsum = vec4(0.f);
+
+    for (int t = tid; t < p.group_size; t += 256)
+    {
+        vec4 v = vec4(buffer_ld4(bottom_top_blob_data, offset + t));
+        sum += v;
+        sqsum += v * v;
+    }
+
+    vec4 sg_sum = subgroupAdd(sum);
+    vec4 sg_sqsum = subgroupAdd(sqsum);
+
+    if (subgroupElect())
+    {
+        sdata_sum[int(gl_SubgroupID)] = vec4(sg_sum);
+        sdata_sqsum[int(gl_SubgroupID)] = vec4(sg_sqsum);
+    }
+
+    barrier();
+
+    // The single-pass variance E[x^2] - mean^2 can round slightly below zero,
+    // so every branch below clamps it to keep the stored value non-negative.
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        vec4 v_sum = lane < num_sg ? sdata_sum[lane] : vec4(0.f);
+        vec4 v_sqsum = lane < num_sg ? sdata_sqsum[lane] : vec4(0.f);
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            vec4 mean = r_sum / float(p.group_size);
+            vec4 var = max(r_sqsum / float(p.group_size) - mean * mean, vec4(0.f));
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    // stage 1: 32 partials in [0, 32) -> 4 partials in [32, 36)
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8;
+
+        vec4 v_sum = sdata_sum[base + lane];
+        vec4 v_sqsum = sdata_sqsum[base + lane];
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[32 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[32 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v_sum = lane < 4 ? sdata_sum[32 + lane] : vec4(0.f);
+        vec4 v_sqsum = lane < 4 ? sdata_sqsum[32 + lane] : vec4(0.f);
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            vec4 mean = r_sum / float(p.group_size);
+            vec4 var = max(r_sqsum / float(p.group_size) - mean * mean, vec4(0.f));
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    // stage 1: 64 partials in [0, 64) -> 16 partials in [64, 80)
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        vec4 v_sum = sdata_sum[base + lane];
+        vec4 v_sqsum = sdata_sqsum[base + lane];
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[64 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[64 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    // stage 2: 16 partials in [64, 80) -> 4 partials in [80, 84)
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = 64 + int(gl_SubgroupID) * 4;
+
+        vec4 v_sum = sdata_sum[base + lane];
+        vec4 v_sqsum = sdata_sqsum[base + lane];
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            sdata_sum[80 + int(gl_SubgroupID)] = r_sum;
+            sdata_sqsum[80 + int(gl_SubgroupID)] = r_sqsum;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v_sum = sdata_sum[80 + lane];
+        vec4 v_sqsum = sdata_sqsum[80 + lane];
+        vec4 r_sum = subgroupAdd(v_sum);
+        vec4 r_sqsum = subgroupAdd(v_sqsum);
+
+        if (subgroupElect())
+        {
+            vec4 mean = r_sum / float(p.group_size);
+            vec4 var = max(r_sqsum / float(p.group_size) - mean * mean, vec4(0.f));
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize;
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride];
+                sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            vec4 r_sum = sdata_sum[0];
+            vec4 r_sqsum = sdata_sqsum[0];
+            vec4 mean = r_sum / float(p.group_size);
+            vec4 var = max(r_sqsum / float(p.group_size) - mean * mean, vec4(0.f));
+            mean_data[group_id] = mean;
+            var_data[group_id] = var;
+        }
+    }
+#endif
+}
diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt
index 10c0535d8087..bf1871f5efcc 100644
--- a/tests/perf/CMakeLists.txt
+++ b/tests/perf/CMakeLists.txt
@@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp)
 ncnn_add_layer_perf(Concat)
 ncnn_add_layer_perf(Sigmoid)
 ncnn_add_layer_perf(BatchNorm)
+ncnn_add_layer_perf(LayerNorm)
 
 # SDPA perf tests (decode and prefill phases)
 if(WITH_LAYER_sdpa)
diff --git a/tests/perf/perf_layernorm.cpp b/tests/perf/perf_layernorm.cpp
new file mode 100644
index 000000000000..dffe6debb541
--- /dev/null
+++ b/tests/perf/perf_layernorm.cpp
@@ -0,0 +1,34 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "perfutil.h"
+
+static void perf_layernorm(int w, int h, int c, int affine_size)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, affine_size);
+    pd.set(1, 1e-5f);
+    pd.set(2, 1);
+
+    std::vector<ncnn::Mat> weights(2);
+    weights[0] = PerfMat(affine_size, 1.0f);
+    weights[1] = PerfMat(affine_size, 0.0f);
+
+    perf_layer("LayerNorm", pd, weights, PerfMat(w, h, c), "affine_size=%d", affine_size);
+}
+
+int main()
+{
+    // typical LLM feature dimensions
+    perf_layernorm(4096, 1, 1, 4096);
+    perf_layernorm(4096, 1, 32, 4096);
+    perf_layernorm(16384, 1, 1, 16384);
+    perf_layernorm(5120, 1, 1, 5120);
+    perf_layernorm(4096, 512, 1, 4096);
+
+    // smaller dims for comparison
+    perf_layernorm(1024, 1, 1, 1024);
+    perf_layernorm(768, 1, 1, 768);
+
+    return 0;
+}