diff --git a/src/layer/vulkan/groupnorm_vulkan.cpp b/src/layer/vulkan/groupnorm_vulkan.cpp index 41b390bfe2e1..7e46056e1480 100644 --- a/src/layer/vulkan/groupnorm_vulkan.cpp +++ b/src/layer/vulkan/groupnorm_vulkan.cpp @@ -27,6 +27,9 @@ GroupNorm_vulkan::GroupNorm_vulkan() pipeline_groupnorm_sub_mean_square_pack4 = 0; pipeline_groupnorm_coeffs_pack4 = 0; pipeline_groupnorm_norm_pack4 = 0; + + pipeline_groupnorm_reduce_subgroup = 0; + pipeline_groupnorm_reduce_subgroup_pack4 = 0; } int GroupNorm_vulkan::create_pipeline(const Option& opt) @@ -276,6 +279,23 @@ int GroupNorm_vulkan::create_pipeline(const Option& opt) } } + if (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) + { + if (elempack == 1 || _channels == 0) + { + pipeline_groupnorm_reduce_subgroup = new Pipeline(vkdev); + pipeline_groupnorm_reduce_subgroup->set_local_size_xyz(256, 1, 1); + pipeline_groupnorm_reduce_subgroup->create(LayerShaderType::groupnorm_reduce_subgroup, opt, std::vector()); + } + + if (elempack == 4 || _channels == 0) + { + pipeline_groupnorm_reduce_subgroup_pack4 = new Pipeline(vkdev); + pipeline_groupnorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1); + pipeline_groupnorm_reduce_subgroup_pack4->create(LayerShaderType::groupnorm_reduce_subgroup_pack4, opt, std::vector()); + } + } + return 0; } @@ -321,6 +341,11 @@ int GroupNorm_vulkan::destroy_pipeline(const Option& /*opt*/) delete pipeline_groupnorm_norm_pack4; pipeline_groupnorm_norm_pack4 = 0; + delete pipeline_groupnorm_reduce_subgroup; + pipeline_groupnorm_reduce_subgroup = 0; + delete pipeline_groupnorm_reduce_subgroup_pack4; + pipeline_groupnorm_reduce_subgroup_pack4 = 0; + return 0; } @@ -369,197 +394,223 @@ int GroupNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co int channels_g = channels / group; - // mean - one float per group + // mean and var - one float per group VkMat mean_workspace(group, 4u, 1, opt.workspace_vkallocator); + VkMat var_workspace(group, 4u, 1, 
opt.workspace_vkallocator); + + const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_groupnorm_reduce_subgroup_pack4 : pipeline_groupnorm_reduce_subgroup; + if (pipeline_reduce_subgroup) { - // reduce sum per channel first - VkMat sum_workspace; + std::vector bindings(3); + bindings[0] = bottom_top_blob; + bindings[1] = mean_workspace; + bindings[2] = var_workspace; + + std::vector constants(4); + constants[0].i = size_virtual; + constants[1].i = channels_g; + constants[2].i = cstep_virtual; + constants[3].i = group; + + VkMat dispatcher; + dispatcher.w = 1; + dispatcher.h = group; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher); + } + else + { + // mean { - int reduced_w = (size_virtual + 3) / 4; - int reduced_c = c_virtual; + // reduce sum per channel first + VkMat sum_workspace; + { + int reduced_w = (size_virtual + 3) / 4; + int reduced_c = c_virtual; + + sum_workspace.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + { + std::vector bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector constants(8); + constants[0].i = size_virtual; + constants[1].i = 1; + constants[2].i = c_virtual; + constants[3].i = cstep_virtual; + constants[4].i = sum_workspace.w; + constants[5].i = 1; + constants[6].i = sum_workspace.c; + constants[7].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 4 ? 
pipeline_groupnorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_groupnorm_reduce_sum4_fp16_to_fp32; + + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); + } + } - sum_workspace.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + int pb = 0; + while (sum_workspace.w > 4) { - std::vector bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = sum_workspace; + int reduced_w = (sum_workspace.w + 3) / 4; + int reduced_c = sum_workspace.c; - std::vector constants(8); - constants[0].i = size_virtual; - constants[1].i = 1; - constants[2].i = c_virtual; - constants[3].i = cstep_virtual; - constants[4].i = sum_workspace.w; - constants[5].i = 1; - constants[6].i = sum_workspace.c; - constants[7].i = sum_workspace.cstep; + VkMat sum_workspace_reduced; + sum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); - const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_groupnorm_reduce_sum4_fp16_to_fp32; + { + std::vector bindings(2); + bindings[0] = sum_workspace; + bindings[1] = sum_workspace_reduced; - cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); - } - } + std::vector constants(8); + constants[0].i = sum_workspace.w; + constants[1].i = 1; + constants[2].i = sum_workspace.c; + constants[3].i = sum_workspace.cstep; + constants[4].i = sum_workspace_reduced.w; + constants[5].i = 1; + constants[6].i = sum_workspace_reduced.c; + constants[7].i = sum_workspace_reduced.cstep; - int pb = 0; - while (sum_workspace.w > 4) - { - int reduced_w = (sum_workspace.w + 3) / 4; - int reduced_c = sum_workspace.c; + const Pipeline* pipeline = elempack == 4 ? 
pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2]; - VkMat sum_workspace_reduced; - sum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced); + pb++; + } + + sum_workspace = sum_workspace_reduced; + } + + // reduce mean across channels within each group { std::vector bindings(2); bindings[0] = sum_workspace; - bindings[1] = sum_workspace_reduced; + bindings[1] = mean_workspace; - std::vector constants(8); + std::vector constants(7); constants[0].i = sum_workspace.w; constants[1].i = 1; constants[2].i = sum_workspace.c; constants[3].i = sum_workspace.cstep; - constants[4].i = sum_workspace_reduced.w; - constants[5].i = 1; - constants[6].i = sum_workspace_reduced.c; - constants[7].i = sum_workspace_reduced.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2]; + constants[4].f = (float)(channels_g * size_virtual); + constants[5].i = group; + constants[6].i = channels_g; - cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced); + const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean; - pb++; + cmd.record_pipeline(pipeline, bindings, constants, mean_workspace); } - - sum_workspace = sum_workspace_reduced; - } - - // reduce mean across channels within each group - { - std::vector bindings(2); - bindings[0] = sum_workspace; - bindings[1] = mean_workspace; - - std::vector constants(7); - constants[0].i = sum_workspace.w; - constants[1].i = 1; - constants[2].i = sum_workspace.c; - constants[3].i = sum_workspace.cstep; - constants[4].f = (float)(channels_g * size_virtual); - constants[5].i = group; - constants[6].i = channels_g; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean; - - cmd.record_pipeline(pipeline, bindings, constants, mean_workspace); } - } - // var - one float per group - VkMat var_workspace(group, 4u, 1, opt.workspace_vkallocator); - { - // sub mean and square - VkMat square_workspace; - if (dims == 1 || dims == 2) - { - square_workspace.create(size_virtual, 1, c_virtual, 4u * elempack, elempack, opt.workspace_vkallocator); - } - else - { - square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator); - } + // var { - std::vector bindings(3); - bindings[0] = bottom_top_blob; - bindings[1] = mean_workspace; - bindings[2] = square_workspace; - - std::vector constants(11); + // sub mean and square + VkMat square_workspace; if (dims == 1 || dims == 2) { - constants[0].i = 3; - constants[1].i = size_virtual; - constants[2].i = 1; - constants[3].i = c_virtual; - constants[4].i = cstep_virtual; + square_workspace.create(size_virtual, 1, c_virtual, 4u * elempack, elempack, opt.workspace_vkallocator); } else { - constants[0].i = std::min(3, dims); - constants[1].i = w; - constants[2].i = h; - constants[3].i = c; - constants[4].i = cstep; + square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator); + } + { + std::vector bindings(3); + bindings[0] = bottom_top_blob; + bindings[1] = mean_workspace; + bindings[2] = square_workspace; + + std::vector constants(11); + if (dims == 1 || dims == 2) + { + constants[0].i = 3; + constants[1].i = size_virtual; + constants[2].i = 1; + constants[3].i = c_virtual; + constants[4].i = cstep_virtual; + } + else + { + constants[0].i = std::min(3, dims); + constants[1].i = w; + constants[2].i = h; + constants[3].i = c; + constants[4].i = cstep; + } + constants[5].i = square_workspace.dims; + constants[6].i = square_workspace.w; + constants[7].i = square_workspace.h; + constants[8].i = square_workspace.c; + constants[9].i = square_workspace.cstep; + constants[10].i = channels_g; + + 
const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_sub_mean_square_pack4 : pipeline_groupnorm_sub_mean_square; + + cmd.record_pipeline(pipeline, bindings, constants, square_workspace); } - constants[5].i = square_workspace.dims; - constants[6].i = square_workspace.w; - constants[7].i = square_workspace.h; - constants[8].i = square_workspace.c; - constants[9].i = square_workspace.cstep; - constants[10].i = channels_g; - const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_sub_mean_square_pack4 : pipeline_groupnorm_sub_mean_square; + // reduce square + VkMat sqsum_workspace = square_workspace; + sqsum_workspace.w = sqsum_workspace.w * sqsum_workspace.h; + sqsum_workspace.h = 1; - cmd.record_pipeline(pipeline, bindings, constants, square_workspace); - } + int pb = 0; + while (sqsum_workspace.w > 4) + { + int reduced_w = (sqsum_workspace.w + 3) / 4; + int reduced_c = sqsum_workspace.c; - // reduce square - VkMat sqsum_workspace = square_workspace; - sqsum_workspace.w = sqsum_workspace.w * sqsum_workspace.h; - sqsum_workspace.h = 1; + VkMat sqsum_workspace_reduced; + sqsum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); - int pb = 0; - while (sqsum_workspace.w > 4) - { - int reduced_w = (sqsum_workspace.w + 3) / 4; - int reduced_c = sqsum_workspace.c; + { + std::vector bindings(2); + bindings[0] = sqsum_workspace; + bindings[1] = sqsum_workspace_reduced; + + std::vector constants(8); + constants[0].i = sqsum_workspace.w; + constants[1].i = 1; + constants[2].i = sqsum_workspace.c; + constants[3].i = sqsum_workspace.cstep; + constants[4].i = sqsum_workspace_reduced.w; + constants[5].i = 1; + constants[6].i = sqsum_workspace_reduced.c; + constants[7].i = sqsum_workspace_reduced.cstep; + + const Pipeline* pipeline = elempack == 4 ? 
pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2]; - VkMat sqsum_workspace_reduced; - sqsum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced); + pb++; + } + + sqsum_workspace = sqsum_workspace_reduced; + } + + // reduce var across channels within each group { std::vector bindings(2); bindings[0] = sqsum_workspace; - bindings[1] = sqsum_workspace_reduced; + bindings[1] = var_workspace; - std::vector constants(8); + std::vector constants(7); constants[0].i = sqsum_workspace.w; constants[1].i = 1; constants[2].i = sqsum_workspace.c; constants[3].i = sqsum_workspace.cstep; - constants[4].i = sqsum_workspace_reduced.w; - constants[5].i = 1; - constants[6].i = sqsum_workspace_reduced.c; - constants[7].i = sqsum_workspace_reduced.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2]; + constants[4].f = (float)(channels_g * size_virtual); + constants[5].i = group; + constants[6].i = channels_g; - cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced); + const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean; - pb++; + cmd.record_pipeline(pipeline, bindings, constants, var_workspace); } - - sqsum_workspace = sqsum_workspace_reduced; - } - - // reduce var across channels within each group - { - std::vector bindings(2); - bindings[0] = sqsum_workspace; - bindings[1] = var_workspace; - - std::vector constants(7); - constants[0].i = sqsum_workspace.w; - constants[1].i = 1; - constants[2].i = sqsum_workspace.c; - constants[3].i = sqsum_workspace.cstep; - constants[4].f = (float)(channels_g * size_virtual); - constants[5].i = group; - constants[6].i = channels_g; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean; - - cmd.record_pipeline(pipeline, bindings, constants, var_workspace); } } diff --git a/src/layer/vulkan/groupnorm_vulkan.h b/src/layer/vulkan/groupnorm_vulkan.h index d850b38b637e..4d8656104b36 100644 --- a/src/layer/vulkan/groupnorm_vulkan.h +++ b/src/layer/vulkan/groupnorm_vulkan.h @@ -38,6 +38,9 @@ class GroupNorm_vulkan : public GroupNorm Pipeline* pipeline_groupnorm_sub_mean_square_pack4; Pipeline* pipeline_groupnorm_coeffs_pack4; Pipeline* pipeline_groupnorm_norm_pack4; + + Pipeline* pipeline_groupnorm_reduce_subgroup; + Pipeline* pipeline_groupnorm_reduce_subgroup_pack4; }; } // namespace ncnn
diff --git a/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp b/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp
new file mode 100644
index 000000000000..75558e9d8f47
--- /dev/null
+++ b/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp
@@ -0,0 +1,89 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_area;
+    int channels_g;
+    int cstep;
+    int group;
+} p;
+
+// one fp32 partial sum per subgroup; a 256-wide workgroup never has more than 256 subgroups
+shared float sdata[256];
+
+// Adds up value over all invocations of the workgroup and returns the total to each of them.
+// Contains barriers, so every invocation of the workgroup has to call it.
+float workgroup_sum(float value)
+{
+    const float sg_total = subgroupAdd(value);
+
+    // slower subgroups may still be reading the partial sums of the previous call
+    barrier();
+
+    if (subgroupElect())
+    {
+        sdata[gl_SubgroupID] = sg_total;
+    }
+
+    barrier();
+
+    // all invocations add the partial sums in the same order, which gives one identical
+    // result per workgroup and does not depend on the subgroup size chosen at runtime
+    const int num_sg = int(gl_NumSubgroups);
+    float total = 0.f;
+    for (int i = 0; i < num_sg; i++)
+    {
+        total += sdata[i];
+    }
+
+    return total;
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const int group_size = p.channels_g * p.group_area;
+    const int channel_start = group_id * p.channels_g;
+    const float area = float(group_size);
+
+    // pass 1: mean, accumulated in fp32 even when afp is fp16
+    float sum = 0.f;
+    for (int t = tid; t < group_size; t += 256)
+    {
+        const int q = t / p.group_area;
+        const int s = t % p.group_area;
+        sum += float(buffer_ld1(bottom_top_blob_data, (channel_start + q) * p.cstep + s));
+    }
+
+    const float mean_val = workgroup_sum(sum) / area;
+
+    // pass 2: variance as the mean of the squared deviations from mean_val
+    float sqsum = 0.f;
+    for (int t = tid; t < group_size; t += 256)
+    {
+        const int q = t / p.group_area;
+        const int s = t % p.group_area;
+        const float d = float(buffer_ld1(bottom_top_blob_data, (channel_start + q) * p.cstep + s)) - mean_val;
+        sqsum += d * d;
+    }
+
+    const float var_val = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[group_id] = mean_val;
+        var_data[group_id] = var_val;
+    }
+}
diff --git a/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp
b/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp
new file mode 100644
index 000000000000..dc8d0767329d
--- /dev/null
+++ b/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp
@@ -0,0 +1,108 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_area;
+    int channels_g;
+    int cstep;
+    int group;
+} p;
+
+// one fp32 partial sum per subgroup; a 256-wide workgroup never has more than 256 subgroups
+shared float sdata[256];
+
+// Adds up value over all invocations of the workgroup and returns the total to each of them.
+// Contains barriers, so every invocation of the workgroup has to call it.
+float workgroup_sum(float value)
+{
+    const float sg_total = subgroupAdd(value);
+
+    // slower subgroups may still be reading the partial sums of the previous call
+    barrier();
+
+    if (subgroupElect())
+    {
+        sdata[gl_SubgroupID] = sg_total;
+    }
+
+    barrier();
+
+    // all invocations add the partial sums in the same order, which gives one identical
+    // result per workgroup and does not depend on the subgroup size chosen at runtime
+    const int num_sg = int(gl_NumSubgroups);
+    float total = 0.f;
+    for (int i = 0; i < num_sg; i++)
+    {
+        total += sdata[i];
+    }
+
+    return total;
+}
+
+// Loads packed channel pc at spatial offset s as fp32. A group that is not aligned to 4 channels
+// shares a packed channel with its neighbours, lanes outside [first, last) are replaced by fill.
+vec4 load_group_lanes(int pc, int s, int first, int last, float fill)
+{
+    const vec4 v = vec4(buffer_ld4(bottom_top_blob_data, pc * p.cstep + s));
+    const ivec4 ch = ivec4(pc * 4) + ivec4(0, 1, 2, 3);
+
+    return vec4(
+        (ch.x >= first && ch.x < last) ? v.x : fill,
+        (ch.y >= first && ch.y < last) ? v.y : fill,
+        (ch.z >= first && ch.z < last) ? v.z : fill,
+        (ch.w >= first && ch.w < last) ? v.w : fill);
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const float area = float(p.channels_g * p.group_area);
+    const int channel_start = group_id * p.channels_g;
+    const int channel_end = channel_start + p.channels_g;
+    const int packed_start = channel_start / 4;
+    const int packed_end = (channel_end - 1) / 4;
+
+    // pass 1: mean, lanes of other groups are loaded as 0 so they do not contribute
+    float sum = 0.f;
+    for (int s = tid; s < p.group_area; s += 256)
+    {
+        for (int pc = packed_start; pc <= packed_end; pc++)
+        {
+            const vec4 v = load_group_lanes(pc, s, channel_start, channel_end, 0.f);
+            sum += v.x + v.y + v.z + v.w;
+        }
+    }
+
+    const float mean_val = workgroup_sum(sum) / area;
+
+    // pass 2: variance, lanes of other groups are loaded as mean_val so their deviation is 0
+    float sqsum = 0.f;
+    for (int s = tid; s < p.group_area; s += 256)
+    {
+        for (int pc = packed_start; pc <= packed_end; pc++)
+        {
+            const vec4 d = load_group_lanes(pc, s, channel_start, channel_end, mean_val) - vec4(mean_val);
+            sqsum += dot(d, d);
+        }
+    }
+
+    const float var_val = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[group_id] = mean_val;
+        var_data[group_id] = var_val;
+    }
+}
diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt index 10c0535d8087..1a9dcc724936 100644 --- a/tests/perf/CMakeLists.txt +++ b/tests/perf/CMakeLists.txt @@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp) ncnn_add_layer_perf(Concat) ncnn_add_layer_perf(Sigmoid) ncnn_add_layer_perf(BatchNorm) +ncnn_add_layer_perf(GroupNorm) # SDPA perf tests (decode and prefill phases) if(WITH_LAYER_sdpa) diff --git a/tests/perf/perf_groupnorm.cpp b/tests/perf/perf_groupnorm.cpp new file mode 100644 index 000000000000..e41f304a9b50 --- /dev/null +++ b/tests/perf/perf_groupnorm.cpp @@ -0,0 +1,38 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "perfutil.h" + +static void perf_groupnorm(int w, int h, 
int c, int group) +{ + ncnn::ParamDict pd; + pd.set(0, group); + pd.set(1, c); + pd.set(2, 1e-5f); + pd.set(3, 1); + + std::vector weights(2); + weights[0] = PerfMat(c, 1.0f); + weights[1] = PerfMat(c, 0.0f); + + perf_layer("GroupNorm", pd, weights, PerfMat(w, h, c), NULL); +} + +int main() +{ + // Stable Diffusion-like shapes, arguments are (w, h, c, group) + perf_groupnorm(64, 64, 128, 32); + perf_groupnorm(32, 32, 256, 32); + perf_groupnorm(16, 16, 512, 32); + perf_groupnorm(8, 8, 512, 32); + + // Image-like shapes, including c == group + perf_groupnorm(224, 224, 3, 3); + perf_groupnorm(224, 224, 64, 32); + + // 1D / LLM-like: h = 1, c = 1, group = 1 + perf_groupnorm(4096, 1, 1, 1); + perf_groupnorm(512, 1, 1, 1); + + return 0; +}