From b51bcfc30807171450e8c3b4be1a9ee828ceb09c Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Fri, 29 May 2026 14:42:26 +0800
Subject: [PATCH 1/3] InstanceNorm Vulkan subgroup reduce optimization

Add subgroup reduction fast-path for InstanceNorm Vulkan backend.
Replaces the multi-pass reduce chain (sum -> mean -> sub -> square -> reduce -> var)
with a single dispatch that assigns one workgroup per channel and reduces with subgroupAdd.

Changes:
- New shader: instancenorm_reduce_subgroup.comp (pack1)
- New shader: instancenorm_reduce_subgroup_pack4.comp (pack4)
- C++ dispatch uses w=1, h=c, c=1 to avoid dispatching over spatial dims
- Falls back to existing reduce chain when subgroup ops unavailable
- Added perf benchmark: tests/perf/perf_instancenorm.cpp

NVIDIA RTX 4060 Laptop (gpu-1) per-op speedup:

| shape          | precision | baseline (us) | optimized (us) | speedup |
|----------------|-----------|---------------|----------------|---------|
| [64,64,128]    | fp32      |         35.64 |          12.08 |    3.0x |
| [64,64,128]    | fp16ps    |         34.79 |           8.77 |    4.0x |
| [64,64,128]    | fp16psa   |         34.71 |           8.74 |    4.0x |
| [64,64,128]    | bf16ps    |         35.24 |           9.50 |    3.7x |
| [32,32,256]    | fp32      |         29.80 |          59.70 |    0.5x |
| [32,32,256]    | fp16ps    |         26.00 |           8.70 |    3.0x |
| [32,32,256]    | fp16psa   |         26.70 |           8.50 |    3.1x |
| [32,32,256]    | bf16ps    |         26.50 |           9.00 |    2.9x |
| [16,16,512]    | fp32      |         19.64 |          62.40 |    0.3x |
| [16,16,512]    | fp16ps    |         19.12 |           9.10 |    2.1x |
| [16,16,512]    | fp16psa   |         19.13 |           9.10 |    2.1x |
| [16,16,512]    | bf16ps    |         19.34 |           9.70 |    2.0x |
| [8,8,512]      | fp32      |         18.10 |           8.40 |    2.2x |
| [8,8,512]      | fp16ps    |         17.70 |           9.00 |    2.0x |
| [8,8,512]      | fp16psa   |         15.30 |           9.20 |    1.7x |
| [8,8,512]      | bf16ps    |         16.20 |           8.30 |    2.0x |
| [224,224,64]   | fp32      |        130.00 |          52.90 |    2.5x |
| [224,224,64]   | fp16ps    |         83.50 |          36.30 |    2.3x |
| [224,224,64]   | fp16psa   |         83.90 |          36.50 |    2.3x |
| [224,224,64]   | bf16ps    |         91.50 |          40.50 |    2.3x |
| [224,224,3]    | fp32      |         38.27 |          22.70 |    1.7x |
| [224,224,3]    | fp16ps    |         37.99 |          18.00 |    2.1x |
| [224,224,3]    | fp16psa   |         37.80 |          15.90 |    2.4x |
| [224,224,3]    | bf16ps    |         37.83 |          16.20 |    2.3x |
| [4096,1,1]     | fp32      |         24.87 |           6.76 |    3.7x |
| [4096,1,1]     | fp16ps    |         24.76 |           6.07 |    4.1x |
| [4096,1,1]     | fp16psa   |         24.76 |           6.03 |    4.1x |
| [4096,1,1]     | bf16ps    |         24.83 |           6.22 |    4.0x |
| [512,1,1]      | fp32      |         74.40 |           5.91 |   12.6x |
| [512,1,1]      | fp16ps    |         35.10 |           5.20 |    6.8x |
| [512,1,1]      | fp16psa   |         27.70 |           5.22 |    5.3x |
| [512,1,1]      | bf16ps    |         23.30 |           5.21 |    4.5x |

Note: the fp32 pack1 path regresses on small spatial sizes (e.g. [32,32,256] and
[16,16,512], 0.5x and 0.3x in the table above), likely due to under-utilized 256-thread
workgroups. The fp16ps/fp16psa/bf16ps paths use pack4 and show consistent speedups across all tested shapes.
---
 src/layer/vulkan/instancenorm_vulkan.cpp      | 337 +++++++++-------
 src/layer/vulkan/instancenorm_vulkan.h        |   3 +
 .../shader/instancenorm_reduce_subgroup.comp  | 348 +++++++++++++++++
 .../instancenorm_reduce_subgroup_pack4.comp   | 362 ++++++++++++++++++
 tests/perf/CMakeLists.txt                     |   1 +
 tests/perf/perf_instancenorm.cpp              |  37 ++
 6 files changed, 941 insertions(+), 147 deletions(-)
 create mode 100644 src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
 create mode 100644 src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
 create mode 100644 tests/perf/perf_instancenorm.cpp

diff --git a/src/layer/vulkan/instancenorm_vulkan.cpp b/src/layer/vulkan/instancenorm_vulkan.cpp
index 17cd106f3fcd..b35a97ad78ec 100644
--- a/src/layer/vulkan/instancenorm_vulkan.cpp
+++ b/src/layer/vulkan/instancenorm_vulkan.cpp
@@ -20,6 +20,9 @@ InstanceNorm_vulkan::InstanceNorm_vulkan()
     pipeline_instancenorm_coeffs = 0;
     pipeline_instancenorm_norm = 0;
 
+    pipeline_instancenorm_reduce_subgroup = 0;
+    pipeline_instancenorm_reduce_subgroup_pack4 = 0;
+
     pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
     pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = 0;
     pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = 0;
@@ -239,6 +242,17 @@ int InstanceNorm_vulkan::create_pipeline(const Option& opt)
         }
     }
 
+    // subgroup fast path: the shaders assume a 256-wide workgroup, so the device limits must allow it (else the pipelines stay null and forward_inplace uses the reduce chain)
+    if ((vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) && vkdev->info.max_workgroup_invocations() >= 256 && vkdev->info.max_workgroup_size_x() >= 256)
+    {
+        pipeline_instancenorm_reduce_subgroup = new Pipeline(vkdev);
+        pipeline_instancenorm_reduce_subgroup->set_local_size_xyz(256, 1, 1);
+        pipeline_instancenorm_reduce_subgroup->create(LayerShaderType::instancenorm_reduce_subgroup, opt, std::vector<vk_specialization_type>());
+        pipeline_instancenorm_reduce_subgroup_pack4 = new Pipeline(vkdev);
+        pipeline_instancenorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1);
+        pipeline_instancenorm_reduce_subgroup_pack4->create(LayerShaderType::instancenorm_reduce_subgroup_pack4, opt, std::vector<vk_specialization_type>());
+    }
+
     return 0;
 }
 
@@ -264,6 +278,11 @@ int InstanceNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_instancenorm_norm;
     pipeline_instancenorm_norm = 0;
 
+    delete pipeline_instancenorm_reduce_subgroup;
+    pipeline_instancenorm_reduce_subgroup = 0;
+    delete pipeline_instancenorm_reduce_subgroup_pack4;
+    pipeline_instancenorm_reduce_subgroup_pack4 = 0;
+
     delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4;
     pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
 
@@ -308,200 +327,224 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
     size_t elemsize = bottom_top_blob.elemsize;
     int elempack = bottom_top_blob.elempack;
 
-    // mean
+    // mean and var
     VkMat mean_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
+    VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+    const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_instancenorm_reduce_subgroup_pack4 : pipeline_instancenorm_reduce_subgroup;
+    if (pipeline_reduce_subgroup)
+    {
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = mean_workspace;
+        bindings[2] = var_workspace;
+
+        std::vector<vk_constant_type> constants(2);
+        constants[0].i = bottom_top_blob.cstep;
+        constants[1].i = size;
+
+        VkMat dispatcher;
+        dispatcher.w = 1;
+        dispatcher.h = c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher);
+    }
+    else
     {
-        // reduce sum
-        VkMat sum_workspace;
+        // mean
         {
-            int reduced_w = (size + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = bottom_top_blob.c;
+            // reduce sum
+            VkMat sum_workspace;
+            {
+                int reduced_w = (size + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = bottom_top_blob.c;
+
+                sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = bottom_top_blob;
+                    bindings[1] = sum_workspace;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = size;
+                    constants[1].i = 1;
+                    constants[2].i = bottom_top_blob.c;
+                    constants[3].i = bottom_top_blob.cstep;
+                    constants[4].i = sum_workspace.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace.c;
+                    constants[7].i = sum_workspace.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
+
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
+                }
+            }
 
-            sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            int pb = 0;
+            while (sum_workspace.w > 4)
             {
-                std::vector<VkMat> bindings(2);
-                bindings[0] = bottom_top_blob;
-                bindings[1] = sum_workspace;
+                int reduced_w = (sum_workspace.w + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = sum_workspace.c;
 
-                std::vector<vk_constant_type> constants(8);
-                constants[0].i = size;
-                constants[1].i = 1;
-                constants[2].i = bottom_top_blob.c;
-                constants[3].i = bottom_top_blob.cstep;
-                constants[4].i = sum_workspace.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace.c;
-                constants[7].i = sum_workspace.cstep;
+                VkMat sum_workspace_reduced;
+                sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sum_workspace;
+                    bindings[1] = sum_workspace_reduced;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
-            }
-        }
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sum_workspace.c;
+                    constants[3].i = sum_workspace.cstep;
+                    constants[4].i = sum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace_reduced.c;
+                    constants[7].i = sum_workspace_reduced.cstep;
 
-        int pb = 0;
-        while (sum_workspace.w > 4)
-        {
-            int reduced_w = (sum_workspace.w + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = sum_workspace.c;
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
 
-            VkMat sum_workspace_reduced;
-            sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
+
+                    pb++;
+                }
+
+                sum_workspace = sum_workspace_reduced;
+            }
 
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sum_workspace;
-                bindings[1] = sum_workspace_reduced;
+                bindings[1] = mean_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(5);
                 constants[0].i = sum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sum_workspace.c;
                 constants[3].i = sum_workspace.cstep;
-                constants[4].i = sum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace_reduced.c;
-                constants[7].i = sum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = size;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
             }
-
-            sum_workspace = sum_workspace_reduced;
         }
 
+        // var
         {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sum_workspace;
-            bindings[1] = mean_workspace;
+            // sub mean and square
+            VkMat square_workspace;
+            square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = bottom_top_blob;
+                bindings[1] = mean_workspace;
+                bindings[2] = square_workspace;
+
+                std::vector<vk_constant_type> constants(10);
+                constants[0].i = std::min(3, bottom_top_blob.dims);
+                constants[1].i = bottom_top_blob.w;
+                constants[2].i = h;
+                constants[3].i = bottom_top_blob.c;
+                constants[4].i = bottom_top_blob.cstep;
+                constants[5].i = square_workspace.dims;
+                constants[6].i = square_workspace.w;
+                constants[7].i = square_workspace.h;
+                constants[8].i = square_workspace.c;
+                constants[9].i = square_workspace.cstep;
+
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square;
+
+                cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
+            }
 
-            std::vector<vk_constant_type> constants(5);
-            constants[0].i = sum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sum_workspace.c;
-            constants[3].i = sum_workspace.cstep;
-            constants[4].f = size;
+            // reduce square
+            VkMat sqsum_workspace;
+            {
+                int reduced_w = (size + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = square_workspace.c;
+
+                sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = square_workspace;
+                    bindings[1] = sqsum_workspace;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = size;
+                    constants[1].i = 1;
+                    constants[2].i = square_workspace.c;
+                    constants[3].i = square_workspace.cstep;
+                    constants[4].i = sqsum_workspace.w;
+                    constants[5].i = 1;
+                    constants[6].i = sqsum_workspace.c;
+                    constants[7].i = sqsum_workspace.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0];
+
+                    cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace);
+                }
+            }
 
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
+            int pb = 1;
+            while (sqsum_workspace.w > 4)
+            {
+                int reduced_w = (sqsum_workspace.w + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = sqsum_workspace.c;
 
-            cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
-        }
-    }
+                VkMat sqsum_workspace_reduced;
+                sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-    // var
-    VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
-    {
-        // sub mean and square
-        VkMat square_workspace;
-        square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
-        {
-            std::vector<VkMat> bindings(3);
-            bindings[0] = bottom_top_blob;
-            bindings[1] = mean_workspace;
-            bindings[2] = square_workspace;
-
-            std::vector<vk_constant_type> constants(10);
-            constants[0].i = std::min(3, bottom_top_blob.dims);
-            constants[1].i = bottom_top_blob.w;
-            constants[2].i = h;
-            constants[3].i = bottom_top_blob.c;
-            constants[4].i = bottom_top_blob.cstep;
-            constants[5].i = square_workspace.dims;
-            constants[6].i = square_workspace.w;
-            constants[7].i = square_workspace.h;
-            constants[8].i = square_workspace.c;
-            constants[9].i = square_workspace.cstep;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square;
-
-            cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
-        }
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sqsum_workspace;
+                    bindings[1] = sqsum_workspace_reduced;
 
-        // reduce square
-        VkMat sqsum_workspace;
-        {
-            int reduced_w = (size + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = square_workspace.c;
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sqsum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sqsum_workspace.c;
+                    constants[3].i = sqsum_workspace.cstep;
+                    constants[4].i = sqsum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sqsum_workspace_reduced.c;
+                    constants[7].i = sqsum_workspace_reduced.cstep;
 
-            sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
 
-            {
-                std::vector<VkMat> bindings(2);
-                bindings[0] = square_workspace;
-                bindings[1] = sqsum_workspace;
+                    cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
 
-                std::vector<vk_constant_type> constants(8);
-                constants[0].i = size;
-                constants[1].i = 1;
-                constants[2].i = square_workspace.c;
-                constants[3].i = square_workspace.cstep;
-                constants[4].i = sqsum_workspace.w;
-                constants[5].i = 1;
-                constants[6].i = sqsum_workspace.c;
-                constants[7].i = sqsum_workspace.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0];
+                    pb++;
+                }
 
-                cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace);
+                sqsum_workspace = sqsum_workspace_reduced;
             }
-        }
-
-        int pb = 1;
-        while (sqsum_workspace.w > 4)
-        {
-            int reduced_w = (sqsum_workspace.w + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = sqsum_workspace.c;
-
-            VkMat sqsum_workspace_reduced;
-            sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sqsum_workspace;
-                bindings[1] = sqsum_workspace_reduced;
+                bindings[1] = var_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(5);
                 constants[0].i = sqsum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sqsum_workspace.c;
                 constants[3].i = sqsum_workspace.cstep;
-                constants[4].i = sqsum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sqsum_workspace_reduced.c;
-                constants[7].i = sqsum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = size;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
             }
-
-            sqsum_workspace = sqsum_workspace_reduced;
-        }
-
-        {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sqsum_workspace;
-            bindings[1] = var_workspace;
-
-            std::vector<vk_constant_type> constants(5);
-            constants[0].i = sqsum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sqsum_workspace.c;
-            constants[3].i = sqsum_workspace.cstep;
-            constants[4].f = size;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
-
-            cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
         }
     }
 
diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h
index e2a27de85129..edec5a918e1c 100644
--- a/src/layer/vulkan/instancenorm_vulkan.h
+++ b/src/layer/vulkan/instancenorm_vulkan.h
@@ -32,6 +32,9 @@ class InstanceNorm_vulkan : public InstanceNorm
     Pipeline* pipeline_instancenorm_coeffs;
     Pipeline* pipeline_instancenorm_norm;
 
+    Pipeline* pipeline_instancenorm_reduce_subgroup;
+    Pipeline* pipeline_instancenorm_reduce_subgroup_pack4;
+
     Pipeline* pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4;
     Pipeline* pipeline_instancenorm_reduce_sum4_fp32_pack4[2];
     Pipeline* pipeline_instancenorm_reduce_mean_pack4;
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
new file mode 100644
index 000000000000..91117d4c121e
--- /dev/null
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
@@ -0,0 +1,348 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_subgroup_arithmetic
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_subgroup_extended_types_float16 : require
+#endif
+#endif
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int cstep;
+    int size;
+} p;
+
+shared float sdata[256];
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int channel_id = int(gl_WorkGroupID.y);
+    // one workgroup reduces one channel; tid strides over its p.size elements in steps of 256
+    const float area = float(p.size);
+    const int base_offset = channel_id * p.cstep;
+
+    // Phase 1: compute sum -> mean (accumulated in fp32, matching the pack4 shader)
+    float sum = 0.f;
+    for (int t = tid; t < p.size; t += 256)
+    {
+        int v_offset = base_offset + t;
+        afp v = buffer_ld1(bottom_top_blob_data, v_offset);
+        sum += float(v);
+    }
+
+#if ncnn_subgroup_arithmetic
+    float sg_sum = subgroupAdd(sum);
+    if (subgroupElect())
+    {
+        sdata[int(gl_SubgroupID)] = sg_sum;
+    }
+
+    barrier();
+
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        float v = lane < num_sg ? sdata[lane] : 0.f;
+        float r_sum = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r_sum;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // store past the 32 first-level partials: other subgroups may still be reading sdata[0..31]
+        if (subgroupElect())
+        {
+            sdata[32 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v = lane < 4 ? sdata[32 + lane] : 0.f;
+        float r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // store past the 64 first-level partials: other subgroups may still be reading sdata[0..63]
+        if (subgroupElect())
+        {
+            sdata[64 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = 64 + int(gl_SubgroupID) * 4;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // third-level partials get their own slots so they never alias sdata[64..79]
+        if (subgroupElect())
+        {
+            sdata[128 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v = sdata[128 + lane];
+        float r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize;
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata[tid] = sdata[tid] + sdata[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            sdata[0] = sdata[0];
+        }
+    }
+#endif
+
+    barrier();
+
+    // Every invocation derives the mean from the same sdata[0], so no broadcast is
+    // needed; the barrier() keeps phase 2 from overwriting sdata[0] before all have read it.
+    const float mean_val = sdata[0] / area;
+    if (tid == 0)
+    {
+        mean_data[channel_id] = mean_val;
+    }
+    barrier();
+
+#else
+    sdata[tid] = sum;
+    barrier();
+
+    for (int stride = 128; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+        {
+            sdata[tid] = sdata[tid] + sdata[tid + stride];
+        }
+        barrier();
+    }
+
+    // Every invocation derives the mean from the same sdata[0], so no broadcast is
+    // needed; the barrier() keeps phase 2 from overwriting sdata[0] before all have read it.
+    const float mean_val = sdata[0] / area;
+    if (tid == 0)
+    {
+        mean_data[channel_id] = mean_val;
+    }
+    barrier();
+#endif
+
+    // Phase 2: compute sqsum -> var (squared deviations from mean_val, accumulated in fp32)
+    float sqsum = 0.f;
+    for (int t = tid; t < p.size; t += 256)
+    {
+        int v_offset = base_offset + t;
+        afp v = buffer_ld1(bottom_top_blob_data, v_offset);
+        float d = float(v) - mean_val;
+        sqsum += d * d;
+    }
+
+#if ncnn_subgroup_arithmetic
+    float sg_sqsum = subgroupAdd(sqsum);
+    if (subgroupElect())
+    {
+        sdata[int(gl_SubgroupID)] = sg_sqsum;
+    }
+
+    barrier();
+
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        float v = lane < num_sg ? sdata[lane] : 0.f;
+        float r_sqsum = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r_sqsum;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // store past the 32 first-level partials: other subgroups may still be reading sdata[0..31]
+        if (subgroupElect())
+        {
+            sdata[32 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v = lane < 4 ? sdata[32 + lane] : 0.f;
+        float r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // store past the 64 first-level partials: other subgroups may still be reading sdata[0..63]
+        if (subgroupElect())
+        {
+            sdata[64 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = 64 + int(gl_SubgroupID) * 4;
+
+        float v = sdata[base + lane];
+        float r = subgroupAdd(v);
+        // third-level partials get their own slots so they never alias sdata[64..79]
+        if (subgroupElect())
+        {
+            sdata[128 + int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        float v = sdata[128 + lane];
+        float r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata[0] = r;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize;
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata[tid] = sdata[tid] + sdata[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            sdata[0] = sdata[0];
+        }
+    }
+#endif
+
+    barrier();
+
+    float var_val = sdata[0] / area;
+    if (tid == 0)
+    {
+        var_data[channel_id] = var_val;
+    }
+
+#else
+    sdata[tid] = sqsum;
+    barrier();
+
+    for (int stride = 128; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+        {
+            sdata[tid] = sdata[tid] + sdata[tid + stride];
+        }
+        barrier();
+    }
+
+    float var_val = sdata[0] / area;
+    if (tid == 0)
+    {
+        var_data[channel_id] = var_val;
+    }
+#endif
+}
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
new file mode 100644
index 000000000000..f0ad4e3042a5
--- /dev/null
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
@@ -0,0 +1,362 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#if ncnn_subgroup_arithmetic
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+#if NCNN_fp16_storage
+#extension GL_EXT_shader_subgroup_extended_types_float16 : require
+#endif
+#endif
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int cstep;
+    int size;
+} p;
+
+shared vec4 sdata_v4[64]; // workgroup scratch. NOTE(review): only 64 slots, yet the non-subgroup fallback indexes it by tid -- confirm bounds
+// Per packed channel (gl_WorkGroupID.y): sum p.size vec4 elements, derive the mean, then the mean squared deviation, per component.
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x); // loops below stride by 256, so 256 invocations along x are assumed -- TODO confirm
+    const int packed_channel_id = int(gl_WorkGroupID.y);
+    const int actual_channel_base = packed_channel_id * 4; // first of the 4 scalar slots written in mean_data / var_data
+
+    const float area = float(p.size); // divisor for both mean and variance
+    const int base_offset = packed_channel_id * p.cstep; // index of this packed channel's first vec4
+
+    // Phase 1: compute sum -> mean
+    vec4 sum = vec4(0.f);
+    for (int t = tid; t < p.size; t += 256)
+    {
+        int v_offset = base_offset + t;
+        vec4 v = vec4(buffer_ld4(bottom_top_blob_data, v_offset));
+        sum += v;
+    }
+
+#if ncnn_subgroup_arithmetic
+    vec4 sg_sum = subgroupAdd(sum);
+    if (subgroupElect())
+    {
+        sdata_v4[int(gl_SubgroupID)] = sg_sum; // one partial sum per subgroup
+    }
+
+    barrier();
+    // second level: fold the per-subgroup partials held in sdata_v4 into sdata_v4[0]
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        vec4 v = lane < num_sg ? sdata_v4[lane] : vec4(0.f); // lanes beyond the subgroup count add zero
+        vec4 r_sum = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r_sum;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8; // subgroups 0..3 each fold 8 consecutive partials
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r; // NOTE(review): slots 1..3 are also read by subgroup 0 above with no barrier in between -- possible race, confirm
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v = lane < 4 ? sdata_v4[lane] : vec4(0.f);
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4; // subgroups 0..15 each fold 4 consecutive partials
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r; // NOTE(review): written slot may still be unread by a lower subgroup -- possible race, confirm
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r; // NOTE(review): written slot may still be unread by a lower subgroup -- possible race, confirm
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v = sdata_v4[lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize; // NOTE(review): above 64 (size of sdata_v4) when ncnn_subgroupSize < 4 -- confirm
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            sdata_v4[0] = sdata_v4[0]; // self-assignment, no effect
+        }
+    }
+#endif
+
+    barrier();
+
+    vec4 mean_v4 = sdata_v4[0] / area; // every invocation reads the reduced total
+    if (tid == 0)
+    {
+        sdata_v4[0] = mean_v4; // replaces the total with the mean; re-read by all invocations after the barrier
+        mean_data[actual_channel_base + 0] = mean_v4.r;
+        mean_data[actual_channel_base + 1] = mean_v4.g;
+        mean_data[actual_channel_base + 2] = mean_v4.b;
+        mean_data[actual_channel_base + 3] = mean_v4.a;
+    }
+    barrier();
+    mean_v4 = sdata_v4[0];
+
+#else
+    // non-subgroup fallback: use shared memory scalar tree reduce per component
+    sdata_v4[tid] = sum;
+    barrier();
+
+    for (int stride = 128; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+        {
+            sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
+        }
+        barrier();
+    }
+
+    vec4 mean_v4 = sdata_v4[0] / area;
+    if (tid == 0)
+    {
+        sdata_v4[0] = mean_v4;
+        mean_data[actual_channel_base + 0] = mean_v4.r;
+        mean_data[actual_channel_base + 1] = mean_v4.g;
+        mean_data[actual_channel_base + 2] = mean_v4.b;
+        mean_data[actual_channel_base + 3] = mean_v4.a;
+    }
+    barrier();
+    mean_v4 = sdata_v4[0];
+#endif
+
+    // Phase 2: compute sqsum -> var
+    vec4 sqsum = vec4(0.f);
+    for (int t = tid; t < p.size; t += 256)
+    {
+        int v_offset = base_offset + t;
+        vec4 v = vec4(buffer_ld4(bottom_top_blob_data, v_offset));
+        vec4 d = v - mean_v4;
+        sqsum += d * d;
+    }
+
+#if ncnn_subgroup_arithmetic
+    vec4 sg_sqsum = subgroupAdd(sqsum);
+    if (subgroupElect())
+    {
+        sdata_v4[int(gl_SubgroupID)] = sg_sqsum; // one partial per subgroup, reusing the phase 1 scratch
+    }
+
+    barrier();
+    // second level again, same structure as phase 1 (the NOTE(review) remarks there apply here too)
+#if ncnn_subgroupSize >= 16
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int num_sg = int(gl_NumSubgroups);
+
+        vec4 v = lane < num_sg ? sdata_v4[lane] : vec4(0.f);
+        vec4 r_sqsum = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r_sqsum;
+        }
+    }
+#elif ncnn_subgroupSize == 8
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 8;
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v = lane < 4 ? sdata_v4[lane] : vec4(0.f);
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r;
+        }
+    }
+#elif ncnn_subgroupSize == 4
+    if (int(gl_SubgroupID) < 16)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) < 4)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+        const int base = int(gl_SubgroupID) * 4;
+
+        vec4 v = sdata_v4[base + lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[int(gl_SubgroupID)] = r;
+        }
+    }
+
+    barrier();
+
+    if (int(gl_SubgroupID) == 0)
+    {
+        const int lane = int(gl_SubgroupInvocationID);
+
+        vec4 v = sdata_v4[lane];
+        vec4 r = subgroupAdd(v);
+
+        if (subgroupElect())
+        {
+            sdata_v4[0] = r;
+        }
+    }
+#else
+    {
+        const int num_sg = 256 / ncnn_subgroupSize;
+
+        for (int stride = num_sg / 2; stride > 0; stride >>= 1)
+        {
+            if (tid < stride)
+            {
+                sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
+            }
+            barrier();
+        }
+
+        if (tid == 0)
+        {
+            sdata_v4[0] = sdata_v4[0]; // self-assignment, no effect
+        }
+    }
+#endif
+
+    barrier();
+
+    vec4 var_v4 = sdata_v4[0] / area; // divides by p.size, not p.size - 1
+    if (tid == 0)
+    {
+        var_data[actual_channel_base + 0] = var_v4.r;
+        var_data[actual_channel_base + 1] = var_v4.g;
+        var_data[actual_channel_base + 2] = var_v4.b;
+        var_data[actual_channel_base + 3] = var_v4.a;
+    }
+
+#else
+    sdata_v4[tid] = sqsum;
+    barrier();
+
+    for (int stride = 128; stride > 0; stride >>= 1)
+    {
+        if (tid < stride)
+        {
+            sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
+        }
+        barrier();
+    }
+
+    vec4 var_v4 = sdata_v4[0] / area;
+    if (tid == 0)
+    {
+        var_data[actual_channel_base + 0] = var_v4.r;
+        var_data[actual_channel_base + 1] = var_v4.g;
+        var_data[actual_channel_base + 2] = var_v4.b;
+        var_data[actual_channel_base + 3] = var_v4.a;
+    }
+#endif
+}
diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt
index 10c0535d8087..06ed1e80c966 100644
--- a/tests/perf/CMakeLists.txt
+++ b/tests/perf/CMakeLists.txt
@@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp)
 ncnn_add_layer_perf(Concat)
 ncnn_add_layer_perf(Sigmoid)
 ncnn_add_layer_perf(BatchNorm)
+ncnn_add_layer_perf(InstanceNorm)
 
 # SDPA perf tests (decode and prefill phases)
 if(WITH_LAYER_sdpa)
diff --git a/tests/perf/perf_instancenorm.cpp b/tests/perf/perf_instancenorm.cpp
new file mode 100644
index 000000000000..5b22f3acb387
--- /dev/null
+++ b/tests/perf/perf_instancenorm.cpp
@@ -0,0 +1,37 @@
+// Copyright 2026 Futz12 <pchar.cn>
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "perfutil.h"
+
+static void perf_instancenorm(int w, int h, int c)
+{
+    // two weight blobs, built by PerfMat(c, 1.0f) and PerfMat(c, 0.0f) in that order
+    std::vector<ncnn::Mat> blobs(2);
+    blobs[0] = PerfMat(c, 1.0f);
+    blobs[1] = PerfMat(c, 0.0f);
+    // param ids 0 / 1 / 2 (presumably channels, eps, affine -- confirm against the layer)
+    ncnn::ParamDict params;
+    params.set(0, c);
+    params.set(1, 1e-5f);
+    params.set(2, 1);
+    perf_layer("InstanceNorm", params, blobs, PerfMat(w, h, c), "channels=%d", c);
+}
+
+int main()
+{
+    // each row is {w, h, c}; rows are benchmarked in the listed order
+    static const int shapes[8][3] = {
+        {64, 64, 128}, // StyleGAN / diffusion representative shapes
+        {32, 32, 256},
+        {16, 16, 512},
+        {8, 8, 512},
+        {224, 224, 64}, // Larger spatial
+        {224, 224, 3},
+        {4096, 1, 1}, // LLM-style degenerate case
+        {512, 1, 1}
+    };
+
+    for (int i = 0; i < 8; i++)
+        perf_instancenorm(shapes[i][0], shapes[i][1], shapes[i][2]);
+    return 0;
+}

From 1358f9946f346958b0051acdc4de1f3fc2236679 Mon Sep 17 00:00:00 2001
From: futz12 <1391525377@qq.com>
Date: Fri, 29 May 2026 16:44:47 +0800
Subject: [PATCH 2/3] vulkan: remove dead #else branches in instancenorm
 subgroup reduce shaders

---
 .../shader/instancenorm_reduce_subgroup.comp  | 46 +---------------
 .../instancenorm_reduce_subgroup_pack4.comp   | 53 +------------------
 2 files changed, 2 insertions(+), 97 deletions(-)

diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
index 91117d4c121e..c91b0673e9f4 100644
--- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
@@ -3,13 +3,12 @@
 
 #version 450
 
-#if ncnn_subgroup_arithmetic
 #extension GL_KHR_shader_subgroup_basic : enable
 #extension GL_KHR_shader_subgroup_arithmetic : enable
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_subgroup_extended_types_float16 : require
 #endif
-#endif
+
 
 layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
@@ -40,7 +39,6 @@ void main()
         sum += v;
     }
 
-#if ncnn_subgroup_arithmetic
     afp sg_sum = subgroupAdd(sum);
     if (subgroupElect())
     {
@@ -168,28 +166,6 @@ void main()
     barrier();
     mean_val = sdata[0];
 
-#else
-    sdata[tid] = float(sum);
-    barrier();
-
-    for (int stride = 128; stride > 0; stride >>= 1)
-    {
-        if (tid < stride)
-        {
-            sdata[tid] = sdata[tid] + sdata[tid + stride];
-        }
-        barrier();
-    }
-
-    float mean_val = sdata[0] / area;
-    if (tid == 0)
-    {
-        sdata[0] = mean_val;
-        mean_data[channel_id] = mean_val;
-    }
-    barrier();
-    mean_val = sdata[0];
-#endif
 
     // Phase 2: compute sqsum -> var
     afp sqsum = afp(0.f);
@@ -201,7 +177,6 @@ void main()
         sqsum += d * d;
     }
 
-#if ncnn_subgroup_arithmetic
     afp sg_sqsum = subgroupAdd(sqsum);
     if (subgroupElect())
     {
@@ -326,23 +301,4 @@ void main()
         var_data[channel_id] = var_val;
     }
 
-#else
-    sdata[tid] = float(sqsum);
-    barrier();
-
-    for (int stride = 128; stride > 0; stride >>= 1)
-    {
-        if (tid < stride)
-        {
-            sdata[tid] = sdata[tid] + sdata[tid + stride];
-        }
-        barrier();
-    }
-
-    float var_val = sdata[0] / area;
-    if (tid == 0)
-    {
-        var_data[channel_id] = var_val;
-    }
-#endif
 }
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
index f0ad4e3042a5..b6476f65d439 100644
--- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
@@ -3,13 +3,12 @@
 
 #version 450
 
-#if ncnn_subgroup_arithmetic
 #extension GL_KHR_shader_subgroup_basic : enable
 #extension GL_KHR_shader_subgroup_arithmetic : enable
 #if NCNN_fp16_storage
 #extension GL_EXT_shader_subgroup_extended_types_float16 : require
 #endif
-#endif
+
 
 layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
 layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
@@ -41,7 +40,6 @@ void main()
         sum += v;
     }
 
-#if ncnn_subgroup_arithmetic
     vec4 sg_sum = subgroupAdd(sum);
     if (subgroupElect())
     {
@@ -172,32 +170,6 @@ void main()
     barrier();
     mean_v4 = sdata_v4[0];
 
-#else
-    // non-subgroup fallback: use shared memory scalar tree reduce per component
-    sdata_v4[tid] = sum;
-    barrier();
-
-    for (int stride = 128; stride > 0; stride >>= 1)
-    {
-        if (tid < stride)
-        {
-            sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
-        }
-        barrier();
-    }
-
-    vec4 mean_v4 = sdata_v4[0] / area;
-    if (tid == 0)
-    {
-        sdata_v4[0] = mean_v4;
-        mean_data[actual_channel_base + 0] = mean_v4.r;
-        mean_data[actual_channel_base + 1] = mean_v4.g;
-        mean_data[actual_channel_base + 2] = mean_v4.b;
-        mean_data[actual_channel_base + 3] = mean_v4.a;
-    }
-    barrier();
-    mean_v4 = sdata_v4[0];
-#endif
 
     // Phase 2: compute sqsum -> var
     vec4 sqsum = vec4(0.f);
@@ -209,7 +181,6 @@ void main()
         sqsum += d * d;
     }
 
-#if ncnn_subgroup_arithmetic
     vec4 sg_sqsum = subgroupAdd(sqsum);
     if (subgroupElect())
     {
@@ -337,26 +308,4 @@ void main()
         var_data[actual_channel_base + 3] = var_v4.a;
     }
 
-#else
-    sdata_v4[tid] = sqsum;
-    barrier();
-
-    for (int stride = 128; stride > 0; stride >>= 1)
-    {
-        if (tid < stride)
-        {
-            sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride];
-        }
-        barrier();
-    }
-
-    vec4 var_v4 = sdata_v4[0] / area;
-    if (tid == 0)
-    {
-        var_data[actual_channel_base + 0] = var_v4.r;
-        var_data[actual_channel_base + 1] = var_v4.g;
-        var_data[actual_channel_base + 2] = var_v4.b;
-        var_data[actual_channel_base + 3] = var_v4.a;
-    }
-#endif
 }

From 1cad3e0e1a77a2dd0761f7fbd5351e5d3d907baa Mon Sep 17 00:00:00 2001
From: futz12 <56149058+futz12@users.noreply.github.com>
Date: Fri, 29 May 2026 08:47:17 +0000
Subject: [PATCH 3/3] apply code-format changes

---
 src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp      | 3 ---
 .../vulkan/shader/instancenorm_reduce_subgroup_pack4.comp      | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
index c91b0673e9f4..6ae8004298ab 100644
--- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
@@ -9,7 +9,6 @@
 #extension GL_EXT_shader_subgroup_extended_types_float16 : require
 #endif
 
-
 layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
 layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
 layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
@@ -166,7 +165,6 @@ void main()
     barrier();
     mean_val = sdata[0];
 
-
     // Phase 2: compute sqsum -> var
     afp sqsum = afp(0.f);
     for (int t = tid; t < p.size; t += 256)
@@ -300,5 +298,4 @@ void main()
     {
         var_data[channel_id] = var_val;
     }
-
 }
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
index b6476f65d439..ab16c2e8f1ab 100644
--- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
@@ -9,7 +9,6 @@
 #extension GL_EXT_shader_subgroup_extended_types_float16 : require
 #endif
 
-
 layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
 layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
 layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
@@ -170,7 +169,6 @@ void main()
     barrier();
     mean_v4 = sdata_v4[0];
 
-
     // Phase 2: compute sqsum -> var
     vec4 sqsum = vec4(0.f);
     for (int t = tid; t < p.size; t += 256)
@@ -307,5 +305,4 @@ void main()
         var_data[actual_channel_base + 2] = var_v4.b;
         var_data[actual_channel_base + 3] = var_v4.a;
     }
-
 }