From ba13ae46980d28b1d9a9b6b789376d105830fda7 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Fri, 29 May 2026 12:54:15 +0800 Subject: [PATCH 1/3] layernorm vulkan subgroup reduce optimization - add layernorm_reduce_subgroup shader using subgroupAdd arithmetic - compute mean and variance in single dispatch, eliminating ~10+ ping-pong reduces - fallback to shared memory tree reduction when subgroup arithmetic unavailable - add perf_layernorm benchmark perf improvement on RTX 4060 [4096,1,1] fp32: ~27ms -> ~7.5ms (3.6x) perf improvement on RTX 4060 [4096,1,32] fp32: ~11.5ms -> ~1.3ms (8.8x) perf improvement on RTX 4060 [4096,512,1] fp32: ~11.5ms -> ~4.8ms (2.4x) perf improvement on RTX 4060 [1024,1,1] fp32: ~27ms -> ~7ms (3.8x) --- src/layer/vulkan/layernorm_vulkan.cpp | 341 ++++++++++-------- src/layer/vulkan/layernorm_vulkan.h | 4 + .../shader/layernorm_reduce_subgroup.comp | 222 ++++++++++++ .../layernorm_reduce_subgroup_pack4.comp | 222 ++++++++++++ tests/perf/CMakeLists.txt | 1 + tests/perf/perf_layernorm.cpp | 34 ++ 6 files changed, 676 insertions(+), 148 deletions(-) create mode 100644 src/layer/vulkan/shader/layernorm_reduce_subgroup.comp create mode 100644 src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp create mode 100644 tests/perf/perf_layernorm.cpp diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp index 3e2f0a167900..ec430b473667 100644 --- a/src/layer/vulkan/layernorm_vulkan.cpp +++ b/src/layer/vulkan/layernorm_vulkan.cpp @@ -28,6 +28,10 @@ LayerNorm_vulkan::LayerNorm_vulkan() pipeline_layernorm_sub_mean_square_pack4 = 0; pipeline_layernorm_coeffs_pack4 = 0; pipeline_layernorm_norm_pack4 = 0; + + // subgroup + pipeline_layernorm_reduce_subgroup = 0; + pipeline_layernorm_reduce_subgroup_pack4 = 0; } int LayerNorm_vulkan::create_pipeline(const Option& opt) @@ -106,6 +110,17 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt) pipeline_layernorm_norm_pack4->create(LayerShaderType::layernorm_norm_pack4, opt, specializations); } + if (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) + { + pipeline_layernorm_reduce_subgroup = new Pipeline(vkdev); + pipeline_layernorm_reduce_subgroup->set_local_size_xyz(256, 1, 1); + pipeline_layernorm_reduce_subgroup->create(LayerShaderType::layernorm_reduce_subgroup, opt, std::vector()); + + pipeline_layernorm_reduce_subgroup_pack4 = new Pipeline(vkdev); + pipeline_layernorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1); + pipeline_layernorm_reduce_subgroup_pack4->create(LayerShaderType::layernorm_reduce_subgroup_pack4, opt, std::vector()); + } + return 0; } @@ -143,6 +158,12 @@ int LayerNorm_vulkan::destroy_pipeline(const Option& /*opt*/) pipeline_layernorm_coeffs_pack4 = 0; pipeline_layernorm_norm_pack4 = 0; + // subgroup + delete pipeline_layernorm_reduce_subgroup; + delete pipeline_layernorm_reduce_subgroup_pack4; + pipeline_layernorm_reduce_subgroup = 0; + pipeline_layernorm_reduce_subgroup_pack4 = 0; + return 0; } @@ -207,170 +228,194 @@ int LayerNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co int num_groups_total = num_groups_per_channel * channels; VkMat mean_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator); - { - int reduced_w = (group_size + 3) / 4; - VkMat sum_workspace; - sum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); + VkMat var_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator); - std::vector bindings(2); + const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_layernorm_reduce_subgroup_pack4 : pipeline_layernorm_reduce_subgroup; + if (pipeline_reduce_subgroup) + { + std::vector bindings(3); bindings[0] = bottom_top_blob; - bindings[1] = sum_workspace; + bindings[1] = mean_workspace; + bindings[2] = var_workspace; - std::vector constants(8); + std::vector constants(3); constants[0].i = group_size; constants[1].i = num_groups_per_channel; - constants[2].i = channels; - constants[3].i = cstep; - constants[4].i = reduced_w; - constants[5].i = num_groups_per_channel; - constants[6].i = channels; - constants[7].i = sum_workspace.cstep; + constants[2].i = (int)cstep; VkMat dispatcher; - dispatcher.w = reduced_w; - dispatcher.h = num_groups_per_channel; - dispatcher.c = channels; - - const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32; - - cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher); - - int pb = 1; - while (sum_workspace.w > 1) - { - int current_w = sum_workspace.w; - reduced_w = (current_w + 3) / 4; - VkMat sum_workspace_reduced; - sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); - - std::vector bindings_iter(2); - bindings_iter[0] = sum_workspace; - bindings_iter[1] = sum_workspace_reduced; - - std::vector constants_iter(8); - constants_iter[0].i = current_w; - constants_iter[1].i = num_groups_per_channel; - constants_iter[2].i = channels; - constants_iter[3].i = sum_workspace.cstep; - constants_iter[4].i = reduced_w; - constants_iter[5].i = num_groups_per_channel; - constants_iter[6].i = channels; - constants_iter[7].i = sum_workspace_reduced.cstep; - - dispatcher.w = reduced_w; - - const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2]; - cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher); - pb++; - sum_workspace = sum_workspace_reduced; - } - - std::vector mean_bindings(2); - mean_bindings[0] = sum_workspace; - mean_bindings[1] = mean_workspace; - - std::vector mean_constants(5); - mean_constants[0].i = sum_workspace.w; - mean_constants[1].i = num_groups_per_channel; - mean_constants[2].i = channels; - mean_constants[3].i = sum_workspace.cstep; - mean_constants[4].f = (float)group_size; - dispatcher.w = 1; - const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean; - cmd.record_pipeline(pipeline_reduce_mean, mean_bindings, mean_constants, dispatcher); - } + dispatcher.h = num_groups_total; + dispatcher.c = 1; - VkMat var_workspace(num_groups_total, 4u * elempack, elempack, opt.workspace_vkallocator); + cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher); + } + else { - VkMat square_workspace(w, h, channels, elemsize, elempack, opt.workspace_vkallocator); { - std::vector sq_bindings(3); - sq_bindings[0] = bottom_top_blob; - sq_bindings[1] = mean_workspace; - sq_bindings[2] = square_workspace; - - std::vector sq_constants(4); - sq_constants[0].i = w; - sq_constants[1].i = h; - sq_constants[2].i = channels; - sq_constants[3].i = cstep; - - const Pipeline* pipeline_sub_mean_square = elempack == 4 ? pipeline_layernorm_sub_mean_square_pack4 : pipeline_layernorm_sub_mean_square; - cmd.record_pipeline(pipeline_sub_mean_square, sq_bindings, sq_constants, square_workspace); + int reduced_w = (group_size + 3) / 4; + VkMat sum_workspace; + sum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); + + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(8); + constants[0].i = group_size; + constants[1].i = num_groups_per_channel; + constants[2].i = channels; + constants[3].i = cstep; + constants[4].i = reduced_w; + constants[5].i = num_groups_per_channel; + constants[6].i = channels; + constants[7].i = sum_workspace.cstep; + + VkMat dispatcher; + dispatcher.w = reduced_w; + dispatcher.h = num_groups_per_channel; + dispatcher.c = channels; + + const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32; + + cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher); + + int pb = 1; + while (sum_workspace.w > 1) + { + int current_w = sum_workspace.w; + reduced_w = (current_w + 3) / 4; + VkMat sum_workspace_reduced; + sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); + + std::vector bindings_iter(2); + bindings_iter[0] = sum_workspace; + bindings_iter[1] = sum_workspace_reduced; + + std::vector constants_iter(8); + constants_iter[0].i = current_w; + constants_iter[1].i = num_groups_per_channel; + constants_iter[2].i = channels; + constants_iter[3].i = sum_workspace.cstep; + constants_iter[4].i = reduced_w; + constants_iter[5].i = num_groups_per_channel; + constants_iter[6].i = channels; + constants_iter[7].i = sum_workspace_reduced.cstep; + + dispatcher.w = reduced_w; + + const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2]; + cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher); + pb++; + sum_workspace = sum_workspace_reduced; + } + + std::vector mean_bindings(2); + mean_bindings[0] = sum_workspace; + mean_bindings[1] = mean_workspace; + + std::vector mean_constants(5); + mean_constants[0].i = sum_workspace.w; + mean_constants[1].i = num_groups_per_channel; + mean_constants[2].i = channels; + mean_constants[3].i = sum_workspace.cstep; + mean_constants[4].f = (float)group_size; + + dispatcher.w = 1; + const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean; + cmd.record_pipeline(pipeline_reduce_mean, mean_bindings, mean_constants, dispatcher); } - // Reduce sum of squares - int reduced_w = (group_size + 3) / 4; - VkMat sqsum_workspace; - sqsum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); - - std::vector bindings(2); - bindings[0] = square_workspace; - bindings[1] = sqsum_workspace; - - std::vector constants(8); - constants[0].i = group_size; - constants[1].i = num_groups_per_channel; - constants[2].i = channels; - constants[3].i = square_workspace.cstep; - constants[4].i = reduced_w; - constants[5].i = num_groups_per_channel; - constants[6].i = channels; - constants[7].i = sqsum_workspace.cstep; - - VkMat dispatcher; - dispatcher.w = reduced_w; - dispatcher.h = num_groups_per_channel; - dispatcher.c = channels; - - const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32; - - cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher); - - int pb = 1; - while (sqsum_workspace.w > 1) { - int current_w = sqsum_workspace.w; - reduced_w = (current_w + 3) / 4; - VkMat sum_workspace_reduced; - sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); - - std::vector bindings_iter(2); - bindings_iter[0] = sqsum_workspace; - bindings_iter[1] = sum_workspace_reduced; - std::vector constants_iter(8); - constants_iter[0].i = current_w; - constants_iter[1].i = num_groups_per_channel; - constants_iter[2].i = channels; - constants_iter[3].i = sqsum_workspace.cstep; - constants_iter[4].i = reduced_w; - constants_iter[5].i = num_groups_per_channel; - constants_iter[6].i = channels; - constants_iter[7].i = sum_workspace_reduced.cstep; - + VkMat square_workspace(w, h, channels, elemsize, elempack, opt.workspace_vkallocator); + { + std::vector sq_bindings(3); + sq_bindings[0] = bottom_top_blob; + sq_bindings[1] = mean_workspace; + sq_bindings[2] = square_workspace; + + std::vector sq_constants(4); + sq_constants[0].i = w; + sq_constants[1].i = h; + sq_constants[2].i = channels; + sq_constants[3].i = cstep; + + const Pipeline* pipeline_sub_mean_square = elempack == 4 ? pipeline_layernorm_sub_mean_square_pack4 : pipeline_layernorm_sub_mean_square; + cmd.record_pipeline(pipeline_sub_mean_square, sq_bindings, sq_constants, square_workspace); + } + + // Reduce sum of squares + int reduced_w = (group_size + 3) / 4; + VkMat sqsum_workspace; + sqsum_workspace.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); + + std::vector bindings(2); + bindings[0] = square_workspace; + bindings[1] = sqsum_workspace; + + std::vector constants(8); + constants[0].i = group_size; + constants[1].i = num_groups_per_channel; + constants[2].i = channels; + constants[3].i = square_workspace.cstep; + constants[4].i = reduced_w; + constants[5].i = num_groups_per_channel; + constants[6].i = channels; + constants[7].i = sqsum_workspace.cstep; + + VkMat dispatcher; dispatcher.w = reduced_w; - - const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2]; - cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher); - pb++; - sqsum_workspace = sum_workspace_reduced; + dispatcher.h = num_groups_per_channel; + dispatcher.c = channels; + + const Pipeline* pipeline_reduce_sum4 = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_layernorm_reduce_sum4_fp16_to_fp32; + + cmd.record_pipeline(pipeline_reduce_sum4, bindings, constants, dispatcher); + + int pb = 1; + while (sqsum_workspace.w > 1) + { + int current_w = sqsum_workspace.w; + reduced_w = (current_w + 3) / 4; + VkMat sum_workspace_reduced; + sum_workspace_reduced.create(reduced_w, num_groups_per_channel, channels, 4u * elempack, elempack, opt.workspace_vkallocator); + + std::vector bindings_iter(2); + bindings_iter[0] = sqsum_workspace; + bindings_iter[1] = sum_workspace_reduced; + std::vector constants_iter(8); + constants_iter[0].i = current_w; + constants_iter[1].i = num_groups_per_channel; + constants_iter[2].i = channels; + constants_iter[3].i = sqsum_workspace.cstep; + constants_iter[4].i = reduced_w; + constants_iter[5].i = num_groups_per_channel; + constants_iter[6].i = channels; + constants_iter[7].i = sum_workspace_reduced.cstep; + + dispatcher.w = reduced_w; + + const Pipeline* pipeline_reduce_iter = elempack == 4 ? pipeline_layernorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_layernorm_reduce_sum4_fp32[pb % 2]; + cmd.record_pipeline(pipeline_reduce_iter, bindings_iter, constants_iter, dispatcher); + pb++; + sqsum_workspace = sum_workspace_reduced; + } + + std::vector var_bindings(2); + var_bindings[0] = sqsum_workspace; + var_bindings[1] = var_workspace; + std::vector var_constants(5); + var_constants[0].i = sqsum_workspace.w; + var_constants[1].i = num_groups_per_channel; + var_constants[2].i = channels; + var_constants[3].i = sqsum_workspace.cstep; + var_constants[4].f = (float)group_size; + + dispatcher.w = 1; + + const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean; + cmd.record_pipeline(pipeline_reduce_mean, var_bindings, var_constants, dispatcher); } - - std::vector var_bindings(2); - var_bindings[0] = sqsum_workspace; - var_bindings[1] = var_workspace; - std::vector var_constants(5); - var_constants[0].i = sqsum_workspace.w; - var_constants[1].i = num_groups_per_channel; - var_constants[2].i = channels; - var_constants[3].i = sqsum_workspace.cstep; - var_constants[4].f = (float)group_size; - - dispatcher.w = 1; - - const Pipeline* pipeline_reduce_mean = elempack == 4 ? pipeline_layernorm_reduce_mean_pack4 : pipeline_layernorm_reduce_mean; - cmd.record_pipeline(pipeline_reduce_mean, var_bindings, var_constants, dispatcher); } // coeffs a and b --- diff --git a/src/layer/vulkan/layernorm_vulkan.h b/src/layer/vulkan/layernorm_vulkan.h index 7e662a72729c..a48ce8c9b0fc 100644 --- a/src/layer/vulkan/layernorm_vulkan.h +++ b/src/layer/vulkan/layernorm_vulkan.h @@ -42,6 +42,10 @@ class LayerNorm_vulkan : public LayerNorm Pipeline* pipeline_layernorm_sub_mean_square_pack4; Pipeline* pipeline_layernorm_coeffs_pack4; Pipeline* pipeline_layernorm_norm_pack4; + + // subgroup pipelines + Pipeline* pipeline_layernorm_reduce_subgroup; + Pipeline* pipeline_layernorm_reduce_subgroup_pack4; }; } // namespace ncnn diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp new file mode 100644 index 000000000000..db2876c076c8 --- /dev/null +++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp @@ -0,0 +1,222 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#if NCNN_fp16_storage +#extension GL_EXT_shader_subgroup_extended_types_float16 : require +#endif +#endif + +layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; +layout(binding = 2) writeonly buffer var_blob { float var_data[]; }; + +layout(push_constant) uniform parameter +{ + int group_size; + int num_groups_per_channel; + int cstep; +} p; + +shared float sdata_sum[256]; +shared float sdata_sqsum[256]; + +void main() +{ + const int tid = int(gl_LocalInvocationID.x); + const int group_id = int(gl_WorkGroupID.y); + + const int gz = group_id / p.num_groups_per_channel; + const int gy = group_id % p.num_groups_per_channel; + const int offset = gz * p.cstep + gy * p.group_size; + + afp sum = afp(0.f); + afp sqsum = afp(0.f); + + for (int t = tid; t < p.group_size; t += 256) + { + afp v = buffer_ld1(bottom_top_blob_data, offset + t); + sum += v; + sqsum += v * v; + } + +#if ncnn_subgroup_arithmetic + afp sg_sum = subgroupAdd(sum); + afp sg_sqsum = subgroupAdd(sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = float(sg_sum); + sdata_sqsum[int(gl_SubgroupID)] = float(sg_sqsum); + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + afp v_sum = lane < num_sg ? afp(sdata_sum[lane]) : afp(0.f); + afp v_sqsum = lane < num_sg ? afp(sdata_sqsum[lane]) : afp(0.f); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + float mean = float(r_sum) / float(p.group_size); + float var = float(r_sqsum) / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + afp v_sum = afp(sdata_sum[base + lane]); + afp v_sqsum = afp(sdata_sqsum[base + lane]); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = float(r_sum); + sdata_sqsum[int(gl_SubgroupID)] = float(r_sqsum); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v_sum = lane < 4 ? afp(sdata_sum[lane]) : afp(0.f); + afp v_sqsum = lane < 4 ? afp(sdata_sqsum[lane]) : afp(0.f); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + float mean = float(r_sum) / float(p.group_size); + float var = float(r_sqsum) / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v_sum = lfp2afp(sdata_sum[base + lane]); + afp v_sqsum = lfp2afp(sdata_sqsum[base + lane]); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = afp2lfp(r_sum); + sdata_sqsum[int(gl_SubgroupID)] = afp2lfp(r_sqsum); + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v_sum = lfp2afp(sdata_sum[base + lane]); + afp v_sqsum = lfp2afp(sdata_sqsum[base + lane]); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = afp2lfp(r_sum); + sdata_sqsum[int(gl_SubgroupID)] = afp2lfp(r_sqsum); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v_sum = afp(sdata_sum[lane]); + afp v_sqsum = afp(sdata_sqsum[lane]); + afp r_sum = subgroupAdd(v_sum); + afp r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + float mean = float(r_sum) / float(p.group_size); + float var = float(r_sqsum) / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; + sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + afp r_sum = afp(sdata_sum[0]); + afp r_sqsum = afp(sdata_sqsum[0]); + float mean = float(r_sum) / float(p.group_size); + float var = float(r_sqsum) / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#endif + +#else + sdata_sum[tid] = float(sum); + sdata_sqsum[tid] = float(sqsum); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; + sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + afp r_sum = afp(sdata_sum[0]); + afp r_sqsum = afp(sdata_sqsum[0]); + float mean = float(r_sum) / float(p.group_size); + float var = float(r_sqsum) / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } +#endif +} diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp new file mode 100644 index 000000000000..792bcdb8ae96 --- /dev/null +++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp @@ -0,0 +1,222 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#if NCNN_fp16_storage +#extension GL_EXT_shader_subgroup_extended_types_float16 : require +#endif +#endif + +layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout(binding = 1) writeonly buffer mean_blob { vec4 mean_data[]; }; +layout(binding = 2) writeonly buffer var_blob { vec4 var_data[]; }; + +layout(push_constant) uniform parameter +{ + int group_size; + int num_groups_per_channel; + int cstep; +} p; + +shared vec4 sdata_sum[256]; +shared vec4 sdata_sqsum[256]; + +void main() +{ + const int tid = int(gl_LocalInvocationID.x); + const int group_id = int(gl_WorkGroupID.y); + + const int gz = group_id / p.num_groups_per_channel; + const int gy = group_id % p.num_groups_per_channel; + const int offset = gz * p.cstep + gy * p.group_size; + + vec4 sum = vec4(0.f); + vec4 sqsum = vec4(0.f); + + for (int t = tid; t < p.group_size; t += 256) + { + vec4 v = vec4(buffer_ld4(bottom_top_blob_data, offset + t)); + sum += v; + sqsum += v * v; + } + +#if ncnn_subgroup_arithmetic + vec4 sg_sum = subgroupAdd(sum); + vec4 sg_sqsum = subgroupAdd(sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = vec4(sg_sum); + sdata_sqsum[int(gl_SubgroupID)] = vec4(sg_sqsum); + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + vec4 v_sum = lane < num_sg ? sdata_sum[lane] : vec4(0.f); + vec4 v_sqsum = lane < num_sg ? sdata_sqsum[lane] : vec4(0.f); + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + vec4 mean = r_sum / float(p.group_size); + vec4 var = r_sqsum / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + vec4 v_sum = sdata_sum[base + lane]; + vec4 v_sqsum = sdata_sqsum[base + lane]; + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = vec4(r_sum); + sdata_sqsum[int(gl_SubgroupID)] = vec4(r_sqsum); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v_sum = lane < 4 ? sdata_sum[lane] : vec4(0.f); + vec4 v_sqsum = lane < 4 ? sdata_sqsum[lane] : vec4(0.f); + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + vec4 mean = r_sum / float(p.group_size); + vec4 var = r_sqsum / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v_sum = vec4(lfp2afpvec4(sdata_sum[base + lane])); + vec4 v_sqsum = vec4(lfp2afpvec4(sdata_sqsum[base + lane])); + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = afp2lfpvec4(afpvec4(r_sum)); + sdata_sqsum[int(gl_SubgroupID)] = afp2lfpvec4(afpvec4(r_sqsum)); + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v_sum = vec4(lfp2afpvec4(sdata_sum[base + lane])); + vec4 v_sqsum = vec4(lfp2afpvec4(sdata_sqsum[base + lane])); + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + sdata_sum[int(gl_SubgroupID)] = afp2lfpvec4(afpvec4(r_sum)); + sdata_sqsum[int(gl_SubgroupID)] = afp2lfpvec4(afpvec4(r_sqsum)); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v_sum = sdata_sum[lane]; + vec4 v_sqsum = sdata_sqsum[lane]; + vec4 r_sum = subgroupAdd(v_sum); + vec4 r_sqsum = subgroupAdd(v_sqsum); + + if (subgroupElect()) + { + vec4 mean = r_sum / float(p.group_size); + vec4 var = r_sqsum / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; + sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + vec4 r_sum = sdata_sum[0]; + vec4 r_sqsum = sdata_sqsum[0]; + vec4 mean = r_sum / float(p.group_size); + vec4 var = r_sqsum / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } + } +#endif + +#else + sdata_sum[tid] = vec4(sum); + sdata_sqsum[tid] = vec4(sqsum); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; + sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + vec4 r_sum = sdata_sum[0]; + vec4 r_sqsum = sdata_sqsum[0]; + vec4 mean = r_sum / float(p.group_size); + vec4 var = r_sqsum / float(p.group_size) - mean * mean; + mean_data[group_id] = mean; + var_data[group_id] = var; + } +#endif +} diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt index 10c0535d8087..bf1871f5efcc 100644 --- a/tests/perf/CMakeLists.txt +++ b/tests/perf/CMakeLists.txt @@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp) ncnn_add_layer_perf(Concat) ncnn_add_layer_perf(Sigmoid) ncnn_add_layer_perf(BatchNorm) +ncnn_add_layer_perf(LayerNorm) # SDPA perf tests (decode and prefill phases) if(WITH_LAYER_sdpa) diff --git a/tests/perf/perf_layernorm.cpp b/tests/perf/perf_layernorm.cpp new file mode 100644 index 000000000000..dffe6debb541 --- /dev/null +++ b/tests/perf/perf_layernorm.cpp @@ -0,0 +1,34 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "perfutil.h" + +static void perf_layernorm(int w, int h, int c, int affine_size) +{ + ncnn::ParamDict pd; + pd.set(0, affine_size); + pd.set(1, 1e-5f); + pd.set(2, 1); + + std::vector weights(2); + weights[0] = PerfMat(affine_size, 1.0f); + weights[1] = PerfMat(affine_size, 0.0f); + + perf_layer("LayerNorm", pd, weights, PerfMat(w, h, c), "affine_size=%d", affine_size); +} + +int main() +{ + // typical LLM feature dimensions + perf_layernorm(4096, 1, 1, 4096); + perf_layernorm(4096, 1, 32, 4096); + perf_layernorm(16384, 1, 1, 16384); + perf_layernorm(5120, 1, 1, 5120); + perf_layernorm(4096, 512, 1, 4096); + + // smaller dims for comparison + perf_layernorm(1024, 1, 1, 1024); + perf_layernorm(768, 1, 1, 768); + + return 0; +} From 6ef2d42224e22ff23fe905f84ed0b02d89d880e3 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Fri, 29 May 2026 16:38:40 +0800 Subject: [PATCH 2/3] vulkan: remove dead #else branches in layernorm subgroup reduce shaders The layernorm_reduce_subgroup shaders are only created on devices that support subgroup arithmetic, so the outermost #else (pure shared-memory fallback without subgroup) is dead code. Note: InstanceNorm/GroupNorm/RMSNorm do not have subgroup reduce shaders yet, so there is no dead branch to clean up there. --- .../shader/layernorm_reduce_subgroup.comp | 29 ------------------- .../layernorm_reduce_subgroup_pack4.comp | 29 ------------------- 2 files changed, 58 deletions(-) diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp index db2876c076c8..c5996509dcd5 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp @@ -3,13 +3,11 @@ #version 450 -#if ncnn_subgroup_arithmetic #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_arithmetic : enable #if NCNN_fp16_storage #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif -#endif layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; @@ -44,7 +42,6 @@ void main() sqsum += v * v; } -#if ncnn_subgroup_arithmetic afp sg_sum = subgroupAdd(sum); afp sg_sqsum = subgroupAdd(sqsum); @@ -193,30 +190,4 @@ void main() } } #endif - -#else - sdata_sum[tid] = float(sum); - sdata_sqsum[tid] = float(sqsum); - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; - sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; - } - barrier(); - } - - if (tid == 0) - { - afp r_sum = afp(sdata_sum[0]); - afp r_sqsum = afp(sdata_sqsum[0]); - float mean = float(r_sum) / float(p.group_size); - float var = float(r_sqsum) / float(p.group_size) - mean * mean; - mean_data[group_id] = mean; - var_data[group_id] = var; - } -#endif } diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp index 792bcdb8ae96..f92cbcab641e 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup_pack4.comp @@ -3,13 +3,11 @@ #version 450 -#if ncnn_subgroup_arithmetic #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_arithmetic : enable #if NCNN_fp16_storage #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif -#endif layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { vec4 mean_data[]; }; @@ -44,7 +42,6 @@ void main() sqsum += v * v; } -#if ncnn_subgroup_arithmetic vec4 sg_sum = subgroupAdd(sum); vec4 sg_sqsum = subgroupAdd(sqsum); @@ -193,30 +190,4 @@ void main() } } #endif - -#else - sdata_sum[tid] = vec4(sum); - sdata_sqsum[tid] = vec4(sqsum); - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata_sum[tid] = sdata_sum[tid] + sdata_sum[tid + stride]; - sdata_sqsum[tid] = sdata_sqsum[tid] + sdata_sqsum[tid + stride]; - } - barrier(); - } - - if (tid == 0) - { - vec4 r_sum = sdata_sum[0]; - vec4 r_sqsum = sdata_sqsum[0]; - vec4 mean = r_sum / float(p.group_size); - vec4 var = r_sqsum / float(p.group_size) - mean * mean; - mean_data[group_id] = mean; - var_data[group_id] = var; - } -#endif } From f3c433aabf9d42406c44a5f48cc7cd67eebf67e0 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Tue, 2 Jun 2026 13:38:51 +0800 Subject: [PATCH 3/3] fix route logic --- src/layer/vulkan/layernorm_vulkan.cpp | 46 +++++----- .../shader/layernorm_reduce_subgroup.comp | 90 +++++++++---------- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/src/layer/vulkan/layernorm_vulkan.cpp b/src/layer/vulkan/layernorm_vulkan.cpp index ec430b473667..c9a5307818c2 100644 --- a/src/layer/vulkan/layernorm_vulkan.cpp +++ b/src/layer/vulkan/layernorm_vulkan.cpp @@ -36,6 +36,19 @@ LayerNorm_vulkan::LayerNorm_vulkan() int LayerNorm_vulkan::create_pipeline(const Option& opt) { + const bool use_subgroup = opt.use_subgroup_ops && (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT); + + if (use_subgroup) + { + pipeline_layernorm_reduce_subgroup = new Pipeline(vkdev); + pipeline_layernorm_reduce_subgroup->set_local_size_xyz(256, 1, 1); + pipeline_layernorm_reduce_subgroup->create(LayerShaderType::layernorm_reduce_subgroup, opt, std::vector()); + + pipeline_layernorm_reduce_subgroup_pack4 = new Pipeline(vkdev); + pipeline_layernorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1); + pipeline_layernorm_reduce_subgroup_pack4->create(LayerShaderType::layernorm_reduce_subgroup_pack4, opt, std::vector()); + } + else { pipeline_layernorm_reduce_sum4_fp16_to_fp32 = new Pipeline(vkdev); pipeline_layernorm_reduce_sum4_fp16_to_fp32->set_optimal_local_size_xyz(16, 4, 1); @@ -58,9 +71,7 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt) pipeline_layernorm_reduce_sum4_fp32_pack4[1] = new Pipeline(vkdev); pipeline_layernorm_reduce_sum4_fp32_pack4[1]->set_optimal_local_size_xyz(8, 8, 1); pipeline_layernorm_reduce_sum4_fp32_pack4[1]->create(LayerShaderType::layernorm_reduce_sum4_fp32_pack4, opt, std::vector()); - } - { pipeline_layernorm_reduce_mean = new Pipeline(vkdev); pipeline_layernorm_reduce_mean->set_optimal_local_size_xyz(1, 8, 8); pipeline_layernorm_reduce_mean->create(LayerShaderType::layernorm_reduce_mean, opt, std::vector()); @@ -68,19 +79,19 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt) pipeline_layernorm_reduce_mean_pack4 = new Pipeline(vkdev); pipeline_layernorm_reduce_mean_pack4->set_optimal_local_size_xyz(1, 8, 8); pipeline_layernorm_reduce_mean_pack4->create(LayerShaderType::layernorm_reduce_mean_pack4, opt, std::vector()); - } - { - std::vector specializations(1); - specializations[0].i = affine_size; + { + std::vector specializations(1); + specializations[0].i = affine_size; - pipeline_layernorm_sub_mean_square = new Pipeline(vkdev); - pipeline_layernorm_sub_mean_square->set_optimal_local_size_xyz(8, 8, 1); - pipeline_layernorm_sub_mean_square->create(LayerShaderType::layernorm_sub_mean_square, opt, specializations); + pipeline_layernorm_sub_mean_square = new Pipeline(vkdev); + pipeline_layernorm_sub_mean_square->set_optimal_local_size_xyz(8, 8, 1); + pipeline_layernorm_sub_mean_square->create(LayerShaderType::layernorm_sub_mean_square, opt, specializations); - pipeline_layernorm_sub_mean_square_pack4 = new Pipeline(vkdev); - pipeline_layernorm_sub_mean_square_pack4->set_optimal_local_size_xyz(8, 8, 1); - pipeline_layernorm_sub_mean_square_pack4->create(LayerShaderType::layernorm_sub_mean_square_pack4, opt, specializations); + pipeline_layernorm_sub_mean_square_pack4 = new Pipeline(vkdev); + pipeline_layernorm_sub_mean_square_pack4->set_optimal_local_size_xyz(8, 8, 1); + pipeline_layernorm_sub_mean_square_pack4->create(LayerShaderType::layernorm_sub_mean_square_pack4, opt, specializations); + } } { @@ -110,17 +121,6 @@ int LayerNorm_vulkan::create_pipeline(const Option& opt) pipeline_layernorm_norm_pack4->create(LayerShaderType::layernorm_norm_pack4, opt, specializations); } - if (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) - { - pipeline_layernorm_reduce_subgroup = new Pipeline(vkdev); - pipeline_layernorm_reduce_subgroup->set_local_size_xyz(256, 1, 1); - pipeline_layernorm_reduce_subgroup->create(LayerShaderType::layernorm_reduce_subgroup, opt, std::vector()); - - pipeline_layernorm_reduce_subgroup_pack4 = new Pipeline(vkdev); - pipeline_layernorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1); - pipeline_layernorm_reduce_subgroup_pack4->create(LayerShaderType::layernorm_reduce_subgroup_pack4, opt, std::vector()); - } - return 0; } diff --git a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp index c5996509dcd5..5d7e54eda0f4 100644 --- a/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp +++ b/src/layer/vulkan/shader/layernorm_reduce_subgroup.comp @@ -32,18 +32,18 @@ void main() const int gy = group_id % p.num_groups_per_channel; const int offset = gz * p.cstep + gy * p.group_size; - afp sum = afp(0.f); - afp sqsum = afp(0.f); + float sum = 0.f; + float sqsum = 0.f; for (int t = tid; t < p.group_size; t += 256) { - afp v = buffer_ld1(bottom_top_blob_data, offset + t); + float v = float(buffer_ld1(bottom_top_blob_data, offset + t)); sum += v; sqsum += v * v; } - afp sg_sum = subgroupAdd(sum); - afp sg_sqsum = subgroupAdd(sqsum); + float sg_sum = subgroupAdd(sum); + float sg_sqsum = subgroupAdd(sqsum); if (subgroupElect()) { @@ -59,15 +59,15 @@ void main() const int lane = int(gl_SubgroupInvocationID); const int num_sg = int(gl_NumSubgroups); - afp v_sum = lane < num_sg ? afp(sdata_sum[lane]) : afp(0.f); - afp v_sqsum = lane < num_sg ? afp(sdata_sqsum[lane]) : afp(0.f); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = lane < num_sg ? sdata_sum[lane] : 0.f; + float v_sqsum = lane < num_sg ? sdata_sqsum[lane] : 0.f; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - float mean = float(r_sum) / float(p.group_size); - float var = float(r_sqsum) / float(p.group_size) - mean * mean; + float mean = r_sum / float(p.group_size); + float var = r_sqsum / float(p.group_size) - mean * mean; mean_data[group_id] = mean; var_data[group_id] = var; } @@ -78,15 +78,15 @@ void main() const int lane = int(gl_SubgroupInvocationID); const int base = int(gl_SubgroupID) * 8; - afp v_sum = afp(sdata_sum[base + lane]); - afp v_sqsum = afp(sdata_sqsum[base + lane]); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = sdata_sum[base + lane]; + float v_sqsum = sdata_sqsum[base + lane]; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - sdata_sum[int(gl_SubgroupID)] = float(r_sum); - sdata_sqsum[int(gl_SubgroupID)] = float(r_sqsum); + sdata_sum[int(gl_SubgroupID)] = r_sum; + sdata_sqsum[int(gl_SubgroupID)] = r_sqsum; } } @@ -96,15 +96,15 @@ void main() { const int lane = int(gl_SubgroupInvocationID); - afp v_sum = lane < 4 ? afp(sdata_sum[lane]) : afp(0.f); - afp v_sqsum = lane < 4 ? afp(sdata_sqsum[lane]) : afp(0.f); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = lane < 4 ? sdata_sum[lane] : 0.f; + float v_sqsum = lane < 4 ? sdata_sqsum[lane] : 0.f; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - float mean = float(r_sum) / float(p.group_size); - float var = float(r_sqsum) / float(p.group_size) - mean * mean; + float mean = r_sum / float(p.group_size); + float var = r_sqsum / float(p.group_size) - mean * mean; mean_data[group_id] = mean; var_data[group_id] = var; } @@ -115,15 +115,15 @@ void main() const int lane = int(gl_SubgroupInvocationID); const int base = int(gl_SubgroupID) * 4; - afp v_sum = lfp2afp(sdata_sum[base + lane]); - afp v_sqsum = lfp2afp(sdata_sqsum[base + lane]); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = sdata_sum[base + lane]; + float v_sqsum = sdata_sqsum[base + lane]; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - sdata_sum[int(gl_SubgroupID)] = afp2lfp(r_sum); - sdata_sqsum[int(gl_SubgroupID)] = afp2lfp(r_sqsum); + sdata_sum[int(gl_SubgroupID)] = r_sum; + sdata_sqsum[int(gl_SubgroupID)] = r_sqsum; } } @@ -134,15 +134,15 @@ void main() const int lane = int(gl_SubgroupInvocationID); const int base = int(gl_SubgroupID) * 4; - afp v_sum = lfp2afp(sdata_sum[base + lane]); - afp v_sqsum = lfp2afp(sdata_sqsum[base + lane]); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = sdata_sum[base + lane]; + float v_sqsum = sdata_sqsum[base + lane]; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - sdata_sum[int(gl_SubgroupID)] = afp2lfp(r_sum); - sdata_sqsum[int(gl_SubgroupID)] = afp2lfp(r_sqsum); + sdata_sum[int(gl_SubgroupID)] = r_sum; + sdata_sqsum[int(gl_SubgroupID)] = r_sqsum; } } @@ -152,15 +152,15 @@ void main() { const int lane = int(gl_SubgroupInvocationID); - afp v_sum = afp(sdata_sum[lane]); - afp v_sqsum = afp(sdata_sqsum[lane]); - afp r_sum = subgroupAdd(v_sum); - afp r_sqsum = subgroupAdd(v_sqsum); + float v_sum = sdata_sum[lane]; + float v_sqsum = sdata_sqsum[lane]; + float r_sum = subgroupAdd(v_sum); + float r_sqsum = subgroupAdd(v_sqsum); if (subgroupElect()) { - float mean = float(r_sum) / float(p.group_size); - float var = float(r_sqsum) / float(p.group_size) - mean * mean; + float mean = r_sum / float(p.group_size); + float var = r_sqsum / float(p.group_size) - mean * mean; mean_data[group_id] = mean; var_data[group_id] = var; } @@ -181,10 +181,10 @@ void main() if (tid == 0) { - afp r_sum = afp(sdata_sum[0]); - afp r_sqsum = afp(sdata_sqsum[0]); - float mean = float(r_sum) / float(p.group_size); - float var = float(r_sqsum) / float(p.group_size) - mean * mean; + float r_sum = sdata_sum[0]; + float r_sqsum = sdata_sqsum[0]; + float mean = r_sum / float(p.group_size); + float var = r_sqsum / float(p.group_size) - mean * mean; mean_data[group_id] = mean; var_data[group_id] = var; }