diff --git a/src/layer/vulkan/groupnorm_vulkan.cpp b/src/layer/vulkan/groupnorm_vulkan.cpp
index 41b390bfe2e1..7e46056e1480 100644
--- a/src/layer/vulkan/groupnorm_vulkan.cpp
+++ b/src/layer/vulkan/groupnorm_vulkan.cpp
@@ -27,6 +27,9 @@ GroupNorm_vulkan::GroupNorm_vulkan()
     pipeline_groupnorm_sub_mean_square_pack4 = 0;
     pipeline_groupnorm_coeffs_pack4 = 0;
     pipeline_groupnorm_norm_pack4 = 0;
+
+    pipeline_groupnorm_reduce_subgroup = 0;
+    pipeline_groupnorm_reduce_subgroup_pack4 = 0;
 }
 
 int GroupNorm_vulkan::create_pipeline(const Option& opt)
@@ -276,6 +279,23 @@ int GroupNorm_vulkan::create_pipeline(const Option& opt)
         }
     }
 
+    if (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT)
+    {
+        if (elempack == 1 || _channels == 0)
+        {
+            pipeline_groupnorm_reduce_subgroup = new Pipeline(vkdev);
+            pipeline_groupnorm_reduce_subgroup->set_local_size_xyz(256, 1, 1);
+            pipeline_groupnorm_reduce_subgroup->create(LayerShaderType::groupnorm_reduce_subgroup, opt, std::vector<vk_specialization_type>());
+        }
+
+        if (elempack == 4 || _channels == 0)
+        {
+            pipeline_groupnorm_reduce_subgroup_pack4 = new Pipeline(vkdev);
+            pipeline_groupnorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1);
+            pipeline_groupnorm_reduce_subgroup_pack4->create(LayerShaderType::groupnorm_reduce_subgroup_pack4, opt, std::vector<vk_specialization_type>());
+        }
+    }
+
     return 0;
 }
 
@@ -321,6 +341,11 @@ int GroupNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_groupnorm_norm_pack4;
     pipeline_groupnorm_norm_pack4 = 0;
 
+    delete pipeline_groupnorm_reduce_subgroup;
+    pipeline_groupnorm_reduce_subgroup = 0;
+    delete pipeline_groupnorm_reduce_subgroup_pack4;
+    pipeline_groupnorm_reduce_subgroup_pack4 = 0;
+
     return 0;
 }
 
@@ -369,197 +394,223 @@ int GroupNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, co
 
     int channels_g = channels / group;
 
-    // mean - one float per group
+    // mean and var - one float per group
     VkMat mean_workspace(group, 4u, 1, opt.workspace_vkallocator);
+    VkMat var_workspace(group, 4u, 1, opt.workspace_vkallocator);
+
+    const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_groupnorm_reduce_subgroup_pack4 : pipeline_groupnorm_reduce_subgroup;
+    if (pipeline_reduce_subgroup)
     {
-        // reduce sum per channel first
-        VkMat sum_workspace;
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = mean_workspace;
+        bindings[2] = var_workspace;
+
+        std::vector<vk_constant_type> constants(4);
+        constants[0].i = size_virtual;
+        constants[1].i = channels_g;
+        constants[2].i = cstep_virtual;
+        constants[3].i = group;
+
+        VkMat dispatcher;
+        dispatcher.w = 1;
+        dispatcher.h = group;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher);
+    }
+    else
+    {
+        // mean
         {
-            int reduced_w = (size_virtual + 3) / 4;
-            int reduced_c = c_virtual;
+            // reduce sum per channel first
+            VkMat sum_workspace;
+            {
+                int reduced_w = (size_virtual + 3) / 4;
+                int reduced_c = c_virtual;
+
+                sum_workspace.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = bottom_top_blob;
+                    bindings[1] = sum_workspace;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = size_virtual;
+                    constants[1].i = 1;
+                    constants[2].i = c_virtual;
+                    constants[3].i = cstep_virtual;
+                    constants[4].i = sum_workspace.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace.c;
+                    constants[7].i = sum_workspace.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_groupnorm_reduce_sum4_fp16_to_fp32;
+
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
+                }
+            }
 
-            sum_workspace.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            int pb = 0;
+            while (sum_workspace.w > 4)
             {
-                std::vector<VkMat> bindings(2);
-                bindings[0] = bottom_top_blob;
-                bindings[1] = sum_workspace;
+                int reduced_w = (sum_workspace.w + 3) / 4;
+                int reduced_c = sum_workspace.c;
 
-                std::vector<vk_constant_type> constants(8);
-                constants[0].i = size_virtual;
-                constants[1].i = 1;
-                constants[2].i = c_virtual;
-                constants[3].i = cstep_virtual;
-                constants[4].i = sum_workspace.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace.c;
-                constants[7].i = sum_workspace.cstep;
+                VkMat sum_workspace_reduced;
+                sum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_groupnorm_reduce_sum4_fp16_to_fp32;
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sum_workspace;
+                    bindings[1] = sum_workspace_reduced;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
-            }
-        }
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sum_workspace.c;
+                    constants[3].i = sum_workspace.cstep;
+                    constants[4].i = sum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace_reduced.c;
+                    constants[7].i = sum_workspace_reduced.cstep;
 
-        int pb = 0;
-        while (sum_workspace.w > 4)
-        {
-            int reduced_w = (sum_workspace.w + 3) / 4;
-            int reduced_c = sum_workspace.c;
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2];
 
-            VkMat sum_workspace_reduced;
-            sum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
 
+                    pb++;
+                }
+
+                sum_workspace = sum_workspace_reduced;
+            }
+
+            // reduce mean across channels within each group
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sum_workspace;
-                bindings[1] = sum_workspace_reduced;
+                bindings[1] = mean_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(7);
                 constants[0].i = sum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sum_workspace.c;
                 constants[3].i = sum_workspace.cstep;
-                constants[4].i = sum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace_reduced.c;
-                constants[7].i = sum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = (float)(channels_g * size_virtual);
+                constants[5].i = group;
+                constants[6].i = channels_g;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
             }
-
-            sum_workspace = sum_workspace_reduced;
-        }
-
-        // reduce mean across channels within each group
-        {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sum_workspace;
-            bindings[1] = mean_workspace;
-
-            std::vector<vk_constant_type> constants(7);
-            constants[0].i = sum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sum_workspace.c;
-            constants[3].i = sum_workspace.cstep;
-            constants[4].f = (float)(channels_g * size_virtual);
-            constants[5].i = group;
-            constants[6].i = channels_g;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean;
-
-            cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
         }
-    }
 
-    // var - one float per group
-    VkMat var_workspace(group, 4u, 1, opt.workspace_vkallocator);
-    {
-        // sub mean and square
-        VkMat square_workspace;
-        if (dims == 1 || dims == 2)
-        {
-            square_workspace.create(size_virtual, 1, c_virtual, 4u * elempack, elempack, opt.workspace_vkallocator);
-        }
-        else
-        {
-            square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
-        }
+        // var
         {
-            std::vector<VkMat> bindings(3);
-            bindings[0] = bottom_top_blob;
-            bindings[1] = mean_workspace;
-            bindings[2] = square_workspace;
-
-            std::vector<vk_constant_type> constants(11);
+            // sub mean and square
+            VkMat square_workspace;
             if (dims == 1 || dims == 2)
             {
-                constants[0].i = 3;
-                constants[1].i = size_virtual;
-                constants[2].i = 1;
-                constants[3].i = c_virtual;
-                constants[4].i = cstep_virtual;
+                square_workspace.create(size_virtual, 1, c_virtual, 4u * elempack, elempack, opt.workspace_vkallocator);
             }
             else
             {
-                constants[0].i = std::min(3, dims);
-                constants[1].i = w;
-                constants[2].i = h;
-                constants[3].i = c;
-                constants[4].i = cstep;
+                square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            }
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = bottom_top_blob;
+                bindings[1] = mean_workspace;
+                bindings[2] = square_workspace;
+
+                std::vector<vk_constant_type> constants(11);
+                if (dims == 1 || dims == 2)
+                {
+                    constants[0].i = 3;
+                    constants[1].i = size_virtual;
+                    constants[2].i = 1;
+                    constants[3].i = c_virtual;
+                    constants[4].i = cstep_virtual;
+                }
+                else
+                {
+                    constants[0].i = std::min(3, dims);
+                    constants[1].i = w;
+                    constants[2].i = h;
+                    constants[3].i = c;
+                    constants[4].i = cstep;
+                }
+                constants[5].i = square_workspace.dims;
+                constants[6].i = square_workspace.w;
+                constants[7].i = square_workspace.h;
+                constants[8].i = square_workspace.c;
+                constants[9].i = square_workspace.cstep;
+                constants[10].i = channels_g;
+
+                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_sub_mean_square_pack4 : pipeline_groupnorm_sub_mean_square;
+
+                cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
             }
-            constants[5].i = square_workspace.dims;
-            constants[6].i = square_workspace.w;
-            constants[7].i = square_workspace.h;
-            constants[8].i = square_workspace.c;
-            constants[9].i = square_workspace.cstep;
-            constants[10].i = channels_g;
 
-            const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_sub_mean_square_pack4 : pipeline_groupnorm_sub_mean_square;
+            // reduce square
+            VkMat sqsum_workspace = square_workspace;
+            sqsum_workspace.w = sqsum_workspace.w * sqsum_workspace.h;
+            sqsum_workspace.h = 1;
 
-            cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
-        }
+            int pb = 0;
+            while (sqsum_workspace.w > 4)
+            {
+                int reduced_w = (sqsum_workspace.w + 3) / 4;
+                int reduced_c = sqsum_workspace.c;
 
-        // reduce square
-        VkMat sqsum_workspace = square_workspace;
-        sqsum_workspace.w = sqsum_workspace.w * sqsum_workspace.h;
-        sqsum_workspace.h = 1;
+                VkMat sqsum_workspace_reduced;
+                sqsum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-        int pb = 0;
-        while (sqsum_workspace.w > 4)
-        {
-            int reduced_w = (sqsum_workspace.w + 3) / 4;
-            int reduced_c = sqsum_workspace.c;
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sqsum_workspace;
+                    bindings[1] = sqsum_workspace_reduced;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sqsum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sqsum_workspace.c;
+                    constants[3].i = sqsum_workspace.cstep;
+                    constants[4].i = sqsum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sqsum_workspace_reduced.c;
+                    constants[7].i = sqsum_workspace_reduced.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2];
 
-            VkMat sqsum_workspace_reduced;
-            sqsum_workspace_reduced.create(reduced_w, 1, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
 
+                    pb++;
+                }
+
+                sqsum_workspace = sqsum_workspace_reduced;
+            }
+
+            // reduce var across channels within each group
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sqsum_workspace;
-                bindings[1] = sqsum_workspace_reduced;
+                bindings[1] = var_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(7);
                 constants[0].i = sqsum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sqsum_workspace.c;
                 constants[3].i = sqsum_workspace.cstep;
-                constants[4].i = sqsum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sqsum_workspace_reduced.c;
-                constants[7].i = sqsum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_groupnorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = (float)(channels_g * size_virtual);
+                constants[5].i = group;
+                constants[6].i = channels_g;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
             }
-
-            sqsum_workspace = sqsum_workspace_reduced;
-        }
-
-        // reduce var across channels within each group
-        {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sqsum_workspace;
-            bindings[1] = var_workspace;
-
-            std::vector<vk_constant_type> constants(7);
-            constants[0].i = sqsum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sqsum_workspace.c;
-            constants[3].i = sqsum_workspace.cstep;
-            constants[4].f = (float)(channels_g * size_virtual);
-            constants[5].i = group;
-            constants[6].i = channels_g;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_groupnorm_reduce_mean_pack4 : pipeline_groupnorm_reduce_mean;
-
-            cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
         }
     }
 
diff --git a/src/layer/vulkan/groupnorm_vulkan.h b/src/layer/vulkan/groupnorm_vulkan.h
index d850b38b637e..4d8656104b36 100644
--- a/src/layer/vulkan/groupnorm_vulkan.h
+++ b/src/layer/vulkan/groupnorm_vulkan.h
@@ -38,6 +38,9 @@ class GroupNorm_vulkan : public GroupNorm
     Pipeline* pipeline_groupnorm_sub_mean_square_pack4;
     Pipeline* pipeline_groupnorm_coeffs_pack4;
     Pipeline* pipeline_groupnorm_norm_pack4;
+
+    Pipeline* pipeline_groupnorm_reduce_subgroup;
+    Pipeline* pipeline_groupnorm_reduce_subgroup_pack4;
 };
 
 } // namespace ncnn
diff --git a/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp b/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp
new file mode 100644
index 000000000000..75558e9d8f47
--- /dev/null
+++ b/src/layer/vulkan/shader/groupnorm_reduce_subgroup.comp
@@ -0,0 +1,181 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+// Per-group mean and variance for GroupNorm on elempack=1 data.
+// One workgroup of 256 invocations reduces one group (gl_WorkGroupID.y).
+// Sums are accumulated and reduced in fp32 only, so no 16-bit subgroup
+// arithmetic (and no extended-types extension) is needed.
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_area; // elements per channel
+    int channels_g; // channels per group
+    int cstep;      // element stride between consecutive channels
+    int group;      // number of groups (not read by this shader)
+} p;
+
+// scratch for the cross-subgroup stages of workgroup_sum()
+shared float sdata[256];
+
+// Sums v over the 256 invocations of the workgroup and returns the total to
+// every invocation. Uses barrier(), so call it from workgroup-uniform control
+// flow only.
+//
+// Every stage writes to slots that no subgroup reads during that stage, so a
+// fast subgroup cannot overwrite partial sums a slower one has yet to load.
+//
+// NOTE(review): the compile-time branches assume the runtime subgroup size
+// equals ncnn_subgroupSize and divides 256 - confirm for devices that can run
+// with a variable subgroup size.
+float workgroup_sum(float v)
+{
+    const int sgid = int(gl_SubgroupID);
+    const int lane = int(gl_SubgroupInvocationID);
+
+    // stage 0: one partial sum per subgroup, stored at sdata[gl_SubgroupID]
+    const float sg_total = subgroupAdd(v);
+    if (subgroupElect())
+    {
+        sdata[sgid] = sg_total;
+    }
+
+    barrier();
+
+#if ncnn_subgroupSize >= 16
+    // at most 16 partials: subgroup 0 folds them in a single step
+    if (sgid == 0)
+    {
+        const int num_sg = int(gl_NumSubgroups);
+        const float r = subgroupAdd(lane < num_sg ? sdata[lane] : 0.f);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    // 32 partials in [0,32) -> 4 in [32,36) -> total in sdata[0]
+    if (sgid < 4)
+    {
+        const float r = subgroupAdd(sdata[sgid * 8 + lane]);
+        if (subgroupElect())
+        {
+            sdata[32 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid == 0)
+    {
+        const float r = subgroupAdd(lane < 4 ? sdata[32 + lane] : 0.f);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    // 64 partials in [0,64) -> 16 in [64,80) -> 4 in [80,84) -> total in sdata[0]
+    if (sgid < 16)
+    {
+        const float r = subgroupAdd(sdata[sgid * 4 + lane]);
+        if (subgroupElect())
+        {
+            sdata[64 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid < 4)
+    {
+        const float r = subgroupAdd(sdata[64 + sgid * 4 + lane]);
+        if (subgroupElect())
+        {
+            sdata[80 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid == 0)
+    {
+        const float r = subgroupAdd(sdata[80 + lane]);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#else
+    // smaller subgroups: shared-memory tree over the 256 / ncnn_subgroupSize partials
+    {
+        const int tid = int(gl_LocalInvocationID.x);
+
+        for (int stride = (256 / ncnn_subgroupSize) / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata[tid] = sdata[tid] + sdata[tid + stride];
+            }
+            barrier();
+        }
+    }
+#endif
+
+    barrier();
+
+    const float total = sdata[0];
+
+    // hold sdata[0] until every invocation has read it; the next call begins
+    // by storing new partial sums at sdata[gl_SubgroupID]
+    barrier();
+
+    return total;
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const int group_size = p.channels_g * p.group_area;
+    const float area = float(group_size);
+    const int channel_start = group_id * p.channels_g;
+
+    // Phase 1: sum -> mean
+    float sum = 0.f;
+    for (int t = tid; t < group_size; t += 256)
+    {
+        const int q = t / p.group_area; // channel within the group
+        const int s = t % p.group_area; // element within the channel
+        sum += float(buffer_ld1(bottom_top_blob_data, (channel_start + q) * p.cstep + s));
+    }
+
+    const float mean_val = workgroup_sum(sum) / area;
+
+    // Phase 2: sum of squared deviations from the mean -> var
+    float sqsum = 0.f;
+    for (int t = tid; t < group_size; t += 256)
+    {
+        const int q = t / p.group_area;
+        const int s = t % p.group_area;
+        const float d = float(buffer_ld1(bottom_top_blob_data, (channel_start + q) * p.cstep + s)) - mean_val;
+        sqsum += d * d;
+    }
+
+    const float var_val = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[group_id] = mean_val;
+        var_data[group_id] = var_val;
+    }
+}
diff --git a/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp
new file mode 100644
index 000000000000..dc8d0767329d
--- /dev/null
+++ b/src/layer/vulkan/shader/groupnorm_reduce_subgroup_pack4.comp
@@ -0,0 +1,201 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+// Per-group mean and variance for GroupNorm on elempack=4 data.
+// One workgroup of 256 invocations reduces one group (gl_WorkGroupID.y).
+// A group may begin or end inside a packed channel, so each vec4 component is
+// tested against the group's channel range before it contributes.
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int group_area; // vec4 elements per packed channel
+    int channels_g; // unpacked channels per group
+    int cstep;      // vec4 stride between consecutive packed channels
+    int group;      // number of groups (not read by this shader)
+} p;
+
+// scratch for the cross-subgroup stages of workgroup_sum()
+shared float sdata[256];
+
+// Sums v over the 256 invocations of the workgroup and returns the total to
+// every invocation. Uses barrier(), so call it from workgroup-uniform control
+// flow only.
+//
+// Every stage writes to slots that no subgroup reads during that stage, so a
+// fast subgroup cannot overwrite partial sums a slower one has yet to load.
+//
+// NOTE(review): the compile-time branches assume the runtime subgroup size
+// equals ncnn_subgroupSize and divides 256 - confirm for devices that can run
+// with a variable subgroup size.
+float workgroup_sum(float v)
+{
+    const int sgid = int(gl_SubgroupID);
+    const int lane = int(gl_SubgroupInvocationID);
+
+    // stage 0: one partial sum per subgroup, stored at sdata[gl_SubgroupID]
+    const float sg_total = subgroupAdd(v);
+    if (subgroupElect())
+    {
+        sdata[sgid] = sg_total;
+    }
+
+    barrier();
+
+#if ncnn_subgroupSize >= 16
+    // at most 16 partials: subgroup 0 folds them in a single step
+    if (sgid == 0)
+    {
+        const int num_sg = int(gl_NumSubgroups);
+        const float r = subgroupAdd(lane < num_sg ? sdata[lane] : 0.f);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    // 32 partials in [0,32) -> 4 in [32,36) -> total in sdata[0]
+    if (sgid < 4)
+    {
+        const float r = subgroupAdd(sdata[sgid * 8 + lane]);
+        if (subgroupElect())
+        {
+            sdata[32 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid == 0)
+    {
+        const float r = subgroupAdd(lane < 4 ? sdata[32 + lane] : 0.f);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    // 64 partials in [0,64) -> 16 in [64,80) -> 4 in [80,84) -> total in sdata[0]
+    if (sgid < 16)
+    {
+        const float r = subgroupAdd(sdata[sgid * 4 + lane]);
+        if (subgroupElect())
+        {
+            sdata[64 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid < 4)
+    {
+        const float r = subgroupAdd(sdata[64 + sgid * 4 + lane]);
+        if (subgroupElect())
+        {
+            sdata[80 + sgid] = r;
+        }
+    }
+
+    barrier();
+
+    if (sgid == 0)
+    {
+        const float r = subgroupAdd(sdata[80 + lane]);
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#else
+    // smaller subgroups: shared-memory tree over the 256 / ncnn_subgroupSize partials
+    {
+        const int tid = int(gl_LocalInvocationID.x);
+
+        for (int stride = (256 / ncnn_subgroupSize) / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata[tid] = sdata[tid] + sdata[tid + stride];
+            }
+            barrier();
+        }
+    }
+#endif
+
+    barrier();
+
+    const float total = sdata[0];
+
+    // hold sdata[0] until every invocation has read it; the next call begins
+    // by storing new partial sums at sdata[gl_SubgroupID]
+    barrier();
+
+    return total;
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int group_id = int(gl_WorkGroupID.y);
+
+    const float area = float(p.channels_g * p.group_area);
+    const int channel_start = group_id * p.channels_g;
+    const int channel_end = channel_start + p.channels_g;
+    const int packed_start = channel_start / 4;
+    const int packed_end = (channel_end - 1) / 4; // inclusive
+
+    // Phase 1: sum -> mean
+    float sum = 0.f;
+    for (int s = tid; s < p.group_area; s += 256)
+    {
+        for (int pc = packed_start; pc <= packed_end; pc++)
+        {
+            const vec4 v = vec4(buffer_ld4(bottom_top_blob_data, pc * p.cstep + s));
+            for (int k = 0; k < 4; k++)
+            {
+                const int ch = pc * 4 + k;
+                if (ch >= channel_start && ch < channel_end)
+                {
+                    sum += v[k];
+                }
+            }
+        }
+    }
+
+    const float mean_val = workgroup_sum(sum) / area;
+
+    // Phase 2: sum of squared deviations from the mean -> var
+    float sqsum = 0.f;
+    for (int s = tid; s < p.group_area; s += 256)
+    {
+        for (int pc = packed_start; pc <= packed_end; pc++)
+        {
+            const vec4 v = vec4(buffer_ld4(bottom_top_blob_data, pc * p.cstep + s));
+            for (int k = 0; k < 4; k++)
+            {
+                const int ch = pc * 4 + k;
+                if (ch >= channel_start && ch < channel_end)
+                {
+                    const float d = v[k] - mean_val;
+                    sqsum += d * d;
+                }
+            }
+        }
+    }
+
+    const float var_val = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[group_id] = mean_val;
+        var_data[group_id] = var_val;
+    }
+}
diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt
index 10c0535d8087..1a9dcc724936 100644
--- a/tests/perf/CMakeLists.txt
+++ b/tests/perf/CMakeLists.txt
@@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp)
 ncnn_add_layer_perf(Concat)
 ncnn_add_layer_perf(Sigmoid)
 ncnn_add_layer_perf(BatchNorm)
+ncnn_add_layer_perf(GroupNorm)
 
 # SDPA perf tests (decode and prefill phases)
 if(WITH_LAYER_sdpa)
diff --git a/tests/perf/perf_groupnorm.cpp b/tests/perf/perf_groupnorm.cpp
new file mode 100644
index 000000000000..e41f304a9b50
--- /dev/null
+++ b/tests/perf/perf_groupnorm.cpp
@@ -0,0 +1,50 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "perfutil.h"
+
+// Runs the GroupNorm perf case for one w x h x c input split into `group` groups.
+static void perf_groupnorm(int w, int h, int c, int group)
+{
+    ncnn::ParamDict params;
+    params.set(0, group);
+    params.set(1, c);
+    params.set(2, 1e-5f);
+    params.set(3, 1);
+
+    // NOTE(review): PerfMat(c, v) presumably yields c values filled with v - confirm in perfutil.h
+    std::vector<ncnn::Mat> weight_blobs(2);
+    weight_blobs[0] = PerfMat(c, 1.0f);
+    weight_blobs[1] = PerfMat(c, 0.0f);
+
+    perf_layer("GroupNorm", params, weight_blobs, PerfMat(w, h, c), NULL);
+}
+
+int main()
+{
+    // each row is { w, h, c, group }, run in the order listed
+    static const int cases[][4] = {
+        // Stable Diffusion-like shapes
+        {64, 64, 128, 32},
+        {32, 32, 256, 32},
+        {16, 16, 512, 32},
+        {8, 8, 512, 32},
+
+        // Image-like shapes
+        {224, 224, 3, 3},
+        {224, 224, 64, 32},
+
+        // 1D / LLM-like
+        {4096, 1, 1, 1},
+        {512, 1, 1, 1},
+    };
+
+    const int case_count = (int)(sizeof(cases) / sizeof(cases[0]));
+    for (int i = 0; i < case_count; i++)
+    {
+        const int* shape = cases[i];
+        perf_groupnorm(shape[0], shape[1], shape[2], shape[3]);
+    }
+
+    return 0;
+}