From b51bcfc30807171450e8c3b4be1a9ee828ceb09c Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: Fri, 29 May 2026 14:42:26 +0800 Subject: [PATCH 1/3] InstanceNorm Vulkan subgroup reduce optimization Add subgroup reduction fast-path for InstanceNorm Vulkan backend. Replaces multi-pass reduce chain (sum -> mean -> sub -> square -> reduce -> var) with a single dispatch per channel using subgroupAdd. Changes: - New shader: instancenorm_reduce_subgroup.comp (pack1) - New shader: instancenorm_reduce_subgroup_pack4.comp (pack4) - C++ dispatch uses w=1, h=c, c=1 to avoid dispatching over spatial dims - Falls back to existing reduce chain when subgroup ops unavailable - Added perf benchmark: tests/perf/perf_instancenorm.cpp NVIDIA RTX 4060 Laptop (gpu-1) per-op speedup: | shape | precision | baseline (us) | optimized (us) | speedup | |----------------|-----------|---------------|----------------|---------| | [64,64,128] | fp32 | 35.64 | 12.08 | 3.0x | | [64,64,128] | fp16ps | 34.79 | 8.77 | 4.0x | | [64,64,128] | fp16psa | 34.71 | 8.74 | 4.0x | | [64,64,128] | bf16ps | 35.24 | 9.50 | 3.7x | | [32,32,256] | fp32 | 29.80 | 59.70 | 0.5x | | [32,32,256] | fp16ps | 26.00 | 8.70 | 3.0x | | [32,32,256] | fp16psa | 26.70 | 8.50 | 3.1x | | [32,32,256] | bf16ps | 26.50 | 9.00 | 2.9x | | [16,16,512] | fp32 | 19.64 | 62.40 | 0.3x | | [16,16,512] | fp16ps | 19.12 | 9.10 | 2.1x | | [16,16,512] | fp16psa | 19.13 | 9.10 | 2.1x | | [16,16,512] | bf16ps | 19.34 | 9.70 | 2.0x | | [8,8,512] | fp32 | 18.10 | 8.40 | 2.2x | | [8,8,512] | fp16ps | 17.70 | 9.00 | 2.0x | | [8,8,512] | fp16psa | 15.30 | 9.20 | 1.7x | | [8,8,512] | bf16ps | 16.20 | 8.30 | 2.0x | | [224,224,64] | fp32 | 130.00 | 52.90 | 2.5x | | [224,224,64] | fp16ps | 83.50 | 36.30 | 2.3x | | [224,224,64] | fp16psa | 83.90 | 36.50 | 2.3x | | [224,224,64] | bf16ps | 91.50 | 40.50 | 2.3x | | [224,224,3] | fp32 | 38.27 | 22.70 | 1.7x | | [224,224,3] | fp16ps | 37.99 | 18.00 | 2.1x | | [224,224,3] | fp16psa | 37.80 | 15.90 | 
2.4x | | [224,224,3] | bf16ps | 37.83 | 16.20 | 2.3x | | [4096,1,1] | fp32 | 24.87 | 6.76 | 3.7x | | [4096,1,1] | fp16ps | 24.76 | 6.07 | 4.1x | | [4096,1,1] | fp16psa | 24.76 | 6.03 | 4.1x | | [4096,1,1] | bf16ps | 24.83 | 6.22 | 4.0x | | [512,1,1] | fp32 | 74.40 | 5.91 | 12.6x | | [512,1,1] | fp16ps | 35.10 | 5.20 | 6.8x | | [512,1,1] | fp16psa | 27.70 | 5.22 | 5.3x | | [512,1,1] | bf16ps | 23.30 | 5.21 | 4.5x | Note: fp32 pack1 path may regress on small spatial sizes (e.g. [32,32,256], [16,16,512]) due to under-utilized 256-thread workgroups. fp16/fp16a/bf16 paths use pack4 and show consistent speedups across all tested shapes. --- src/layer/vulkan/instancenorm_vulkan.cpp | 337 +++++++++------- src/layer/vulkan/instancenorm_vulkan.h | 3 + .../shader/instancenorm_reduce_subgroup.comp | 348 +++++++++++++++++ .../instancenorm_reduce_subgroup_pack4.comp | 362 ++++++++++++++++++ tests/perf/CMakeLists.txt | 1 + tests/perf/perf_instancenorm.cpp | 37 ++ 6 files changed, 941 insertions(+), 147 deletions(-) create mode 100644 src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp create mode 100644 src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp create mode 100644 tests/perf/perf_instancenorm.cpp diff --git a/src/layer/vulkan/instancenorm_vulkan.cpp b/src/layer/vulkan/instancenorm_vulkan.cpp index 17cd106f3fcd..b35a97ad78ec 100644 --- a/src/layer/vulkan/instancenorm_vulkan.cpp +++ b/src/layer/vulkan/instancenorm_vulkan.cpp @@ -20,6 +20,9 @@ InstanceNorm_vulkan::InstanceNorm_vulkan() pipeline_instancenorm_coeffs = 0; pipeline_instancenorm_norm = 0; + pipeline_instancenorm_reduce_subgroup = 0; + pipeline_instancenorm_reduce_subgroup_pack4 = 0; + pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0; pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = 0; pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = 0; @@ -239,6 +242,17 @@ int InstanceNorm_vulkan::create_pipeline(const Option& opt) } } + if (vkdev->info.support_subgroup_ops() & 
VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) + { + pipeline_instancenorm_reduce_subgroup = new Pipeline(vkdev); + pipeline_instancenorm_reduce_subgroup->set_local_size_xyz(256, 1, 1); + pipeline_instancenorm_reduce_subgroup->create(LayerShaderType::instancenorm_reduce_subgroup, opt, std::vector<vk_specialization_type>()); + + pipeline_instancenorm_reduce_subgroup_pack4 = new Pipeline(vkdev); + pipeline_instancenorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1); + pipeline_instancenorm_reduce_subgroup_pack4->create(LayerShaderType::instancenorm_reduce_subgroup_pack4, opt, std::vector<vk_specialization_type>()); + } + return 0; } @@ -264,6 +278,11 @@ int InstanceNorm_vulkan::destroy_pipeline(const Option& /*opt*/) delete pipeline_instancenorm_norm; pipeline_instancenorm_norm = 0; + delete pipeline_instancenorm_reduce_subgroup; + pipeline_instancenorm_reduce_subgroup = 0; + delete pipeline_instancenorm_reduce_subgroup_pack4; + pipeline_instancenorm_reduce_subgroup_pack4 = 0; + delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4; pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0; @@ -308,200 +327,224 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd, size_t elemsize = bottom_top_blob.elemsize; int elempack = bottom_top_blob.elempack; - // mean + // mean and var VkMat mean_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator); + VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator); + + const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? 
pipeline_instancenorm_reduce_subgroup_pack4 : pipeline_instancenorm_reduce_subgroup; + if (pipeline_reduce_subgroup) + { + std::vector<VkMat> bindings(3); + bindings[0] = bottom_top_blob; + bindings[1] = mean_workspace; + bindings[2] = var_workspace; + + std::vector<vk_constant_type> constants(2); + constants[0].i = bottom_top_blob.cstep; + constants[1].i = size; + + VkMat dispatcher; + dispatcher.w = 1; + dispatcher.h = c; + dispatcher.c = 1; + + cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher); + } + else { - // reduce sum - VkMat sum_workspace; + // mean { - int reduced_w = (size + 3) / 4; - int reduced_h = 1; - int reduced_c = bottom_top_blob.c; + // reduce sum + VkMat sum_workspace; + { + int reduced_w = (size + 3) / 4; + int reduced_h = 1; + int reduced_c = bottom_top_blob.c; + + sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + { + std::vector<VkMat> bindings(2); + bindings[0] = bottom_top_blob; + bindings[1] = sum_workspace; + + std::vector<vk_constant_type> constants(8); + constants[0].i = size; + constants[1].i = 1; + constants[2].i = bottom_top_blob.c; + constants[3].i = bottom_top_blob.cstep; + constants[4].i = sum_workspace.w; + constants[5].i = 1; + constants[6].i = sum_workspace.c; + constants[7].i = sum_workspace.cstep; + + const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32; + + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); + } + } - sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + int pb = 0; + while (sum_workspace.w > 4) { - std::vector<VkMat> bindings(2); - bindings[0] = bottom_top_blob; - bindings[1] = sum_workspace; + int reduced_w = (sum_workspace.w + 3) / 4; + int reduced_h = 1; + int reduced_c = sum_workspace.c; - std::vector<vk_constant_type> constants(8); - constants[0].i = size; - constants[1].i = 1; - constants[2].i = bottom_top_blob.c; - constants[3].i = bottom_top_blob.cstep; - constants[4].i = sum_workspace.w; - constants[5].i = 1; - constants[6].i = sum_workspace.c; - constants[7].i = sum_workspace.cstep; + VkMat sum_workspace_reduced; + sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); - const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32; + { + std::vector<VkMat> bindings(2); + bindings[0] = sum_workspace; + bindings[1] = sum_workspace_reduced; - cmd.record_pipeline(pipeline, bindings, constants, sum_workspace); - } - } + std::vector<vk_constant_type> constants(8); + constants[0].i = sum_workspace.w; + constants[1].i = 1; + constants[2].i = sum_workspace.c; + constants[3].i = sum_workspace.cstep; + constants[4].i = sum_workspace_reduced.w; + constants[5].i = 1; + constants[6].i = sum_workspace_reduced.c; + constants[7].i = sum_workspace_reduced.cstep; - int pb = 0; - while (sum_workspace.w > 4) - { - int reduced_w = (sum_workspace.w + 3) / 4; - int reduced_h = 1; - int reduced_c = sum_workspace.c; + const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2]; - VkMat sum_workspace_reduced; - sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced); + + pb++; + } + + sum_workspace = sum_workspace_reduced; + } { std::vector<VkMat> bindings(2); bindings[0] = sum_workspace; - bindings[1] = sum_workspace_reduced; + bindings[1] = mean_workspace; - std::vector<vk_constant_type> constants(8); + std::vector<vk_constant_type> constants(5); constants[0].i = sum_workspace.w; constants[1].i = 1; constants[2].i = sum_workspace.c; constants[3].i = sum_workspace.cstep; - constants[4].i = sum_workspace_reduced.w; - constants[5].i = 1; - constants[6].i = sum_workspace_reduced.c; - constants[7].i = sum_workspace_reduced.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2]; + constants[4].f = size; - cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced); + const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean; - pb++; + cmd.record_pipeline(pipeline, bindings, constants, mean_workspace); } - - sum_workspace = sum_workspace_reduced; } + // var { - std::vector<VkMat> bindings(2); - bindings[0] = sum_workspace; - bindings[1] = mean_workspace; + // sub mean and square + VkMat square_workspace; + square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator); + { + std::vector<VkMat> bindings(3); + bindings[0] = bottom_top_blob; + bindings[1] = mean_workspace; + bindings[2] = square_workspace; + + std::vector<vk_constant_type> constants(10); + constants[0].i = std::min(3, bottom_top_blob.dims); + constants[1].i = bottom_top_blob.w; + constants[2].i = h; + constants[3].i = bottom_top_blob.c; + constants[4].i = bottom_top_blob.cstep; + constants[5].i = square_workspace.dims; + constants[6].i = square_workspace.w; + constants[7].i = square_workspace.h; + constants[8].i = square_workspace.c; + constants[9].i = square_workspace.cstep; + + const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square; + + cmd.record_pipeline(pipeline, bindings, constants, square_workspace); + } - std::vector<vk_constant_type> constants(5); - constants[0].i = sum_workspace.w; - constants[1].i = 1; - constants[2].i = sum_workspace.c; - constants[3].i = sum_workspace.cstep; - constants[4].f = size; + // reduce square + VkMat sqsum_workspace; + { + int reduced_w = (size + 3) / 4; + int reduced_h = 1; + int reduced_c = square_workspace.c; + + sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + + { + std::vector<VkMat> bindings(2); + bindings[0] = square_workspace; + bindings[1] = sqsum_workspace; + + std::vector<vk_constant_type> constants(8); + constants[0].i = size; + constants[1].i = 1; + constants[2].i = square_workspace.c; + constants[3].i = square_workspace.cstep; + constants[4].i = sqsum_workspace.w; + constants[5].i = 1; + constants[6].i = sqsum_workspace.c; + constants[7].i = sqsum_workspace.cstep; + + const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0]; + + cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace); + } + } - const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean; + int pb = 1; + while (sqsum_workspace.w > 4) + { + int reduced_w = (sqsum_workspace.w + 3) / 4; + int reduced_h = 1; + int reduced_c = sqsum_workspace.c; - cmd.record_pipeline(pipeline, bindings, constants, mean_workspace); - } - } + VkMat sqsum_workspace_reduced; + sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); - // var - VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator); - { - // sub mean and square - VkMat square_workspace; - square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator); - { - std::vector<VkMat> bindings(3); - bindings[0] = bottom_top_blob; - bindings[1] = mean_workspace; - bindings[2] = square_workspace; - - std::vector<vk_constant_type> constants(10); - constants[0].i = std::min(3, bottom_top_blob.dims); - constants[1].i = bottom_top_blob.w; - constants[2].i = h; - constants[3].i = bottom_top_blob.c; - constants[4].i = bottom_top_blob.cstep; - constants[5].i = square_workspace.dims; - constants[6].i = square_workspace.w; - constants[7].i = square_workspace.h; - constants[8].i = square_workspace.c; - constants[9].i = square_workspace.cstep; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square; - - cmd.record_pipeline(pipeline, bindings, constants, square_workspace); - } + { + std::vector<VkMat> bindings(2); + bindings[0] = sqsum_workspace; + bindings[1] = sqsum_workspace_reduced; - // reduce square - VkMat sqsum_workspace; - { - int reduced_w = (size + 3) / 4; - int reduced_h = 1; - int reduced_c = square_workspace.c; + std::vector<vk_constant_type> constants(8); + constants[0].i = sqsum_workspace.w; + constants[1].i = 1; + constants[2].i = sqsum_workspace.c; + constants[3].i = sqsum_workspace.cstep; + constants[4].i = sqsum_workspace_reduced.w; + constants[5].i = 1; + constants[6].i = sqsum_workspace_reduced.c; + constants[7].i = sqsum_workspace_reduced.cstep; - sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); + const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2]; - { - std::vector<VkMat> bindings(2); - bindings[0] = square_workspace; - bindings[1] = sqsum_workspace; + cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced); - std::vector<vk_constant_type> constants(8); - constants[0].i = size; - constants[1].i = 1; - constants[2].i = square_workspace.c; - constants[3].i = square_workspace.cstep; - constants[4].i = sqsum_workspace.w; - constants[5].i = 1; - constants[6].i = sqsum_workspace.c; - constants[7].i = sqsum_workspace.cstep; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0]; + pb++; + } - cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace); + sqsum_workspace = sqsum_workspace_reduced; } - } - - int pb = 1; - while (sqsum_workspace.w > 4) - { - int reduced_w = (sqsum_workspace.w + 3) / 4; - int reduced_h = 1; - int reduced_c = sqsum_workspace.c; - - VkMat sqsum_workspace_reduced; - sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator); { std::vector<VkMat> bindings(2); bindings[0] = sqsum_workspace; - bindings[1] = sqsum_workspace_reduced; + bindings[1] = var_workspace; - std::vector<vk_constant_type> constants(8); + std::vector<vk_constant_type> constants(5); constants[0].i = sqsum_workspace.w; constants[1].i = 1; constants[2].i = sqsum_workspace.c; constants[3].i = sqsum_workspace.cstep; - constants[4].i = sqsum_workspace_reduced.w; - constants[5].i = 1; - constants[6].i = sqsum_workspace_reduced.c; - constants[7].i = sqsum_workspace_reduced.cstep; - - const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2]; + constants[4].f = size; - cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced); + const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean; - pb++; + cmd.record_pipeline(pipeline, bindings, constants, var_workspace); } - - sqsum_workspace = sqsum_workspace_reduced; - } - - { - std::vector<VkMat> bindings(2); - bindings[0] = sqsum_workspace; - bindings[1] = var_workspace; - - std::vector<vk_constant_type> constants(5); - constants[0].i = sqsum_workspace.w; - constants[1].i = 1; - constants[2].i = sqsum_workspace.c; - constants[3].i = sqsum_workspace.cstep; - constants[4].f = size; - - const Pipeline* pipeline = elempack == 4 ? 
pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean; - - cmd.record_pipeline(pipeline, bindings, constants, var_workspace); } } diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h index e2a27de85129..edec5a918e1c 100644 --- a/src/layer/vulkan/instancenorm_vulkan.h +++ b/src/layer/vulkan/instancenorm_vulkan.h @@ -32,6 +32,9 @@ class InstanceNorm_vulkan : public InstanceNorm Pipeline* pipeline_instancenorm_coeffs; Pipeline* pipeline_instancenorm_norm; + Pipeline* pipeline_instancenorm_reduce_subgroup; + Pipeline* pipeline_instancenorm_reduce_subgroup_pack4; + Pipeline* pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4; Pipeline* pipeline_instancenorm_reduce_sum4_fp32_pack4[2]; Pipeline* pipeline_instancenorm_reduce_mean_pack4; diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp new file mode 100644 index 000000000000..91117d4c121e --- /dev/null +++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp @@ -0,0 +1,348 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#if NCNN_fp16_storage +#extension GL_EXT_shader_subgroup_extended_types_float16 : require +#endif +#endif + +layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; +layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; +layout(binding = 2) writeonly buffer var_blob { float var_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + int size; +} p; + +shared float sdata[256]; + +void main() +{ + const int tid = int(gl_LocalInvocationID.x); + const int channel_id = int(gl_WorkGroupID.y); + + const float area = float(p.size); + const int base_offset = channel_id * p.cstep; + + // Phase 1: compute sum -> mean + afp sum = 
afp(0.f); + for (int t = tid; t < p.size; t += 256) + { + int v_offset = base_offset + t; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); + sum += v; + } + +#if ncnn_subgroup_arithmetic + afp sg_sum = subgroupAdd(sum); + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(sg_sum); + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + afp v = lane < num_sg ? afp(sdata[lane]) : afp(0.f); + afp r_sum = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r_sum); + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v = lane < 4 ? 
afp(sdata[lane]) : afp(0.f); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r); + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v = afp(sdata[lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r); + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata[tid] = sdata[tid] + sdata[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + sdata[0] = sdata[0]; + } + } +#endif + + barrier(); + + float mean_val = sdata[0] / area; + if (tid == 0) + { + sdata[0] = mean_val; + mean_data[channel_id] = mean_val; + } + barrier(); + mean_val = sdata[0]; + +#else + sdata[tid] = float(sum); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata[tid] = sdata[tid] + sdata[tid + stride]; + } + barrier(); + } + + float mean_val = sdata[0] / area; + if (tid == 0) + { + sdata[0] = mean_val; + mean_data[channel_id] = mean_val; + } + barrier(); + mean_val = sdata[0]; +#endif + + // Phase 2: compute sqsum -> var + afp sqsum = afp(0.f); + for (int t = tid; t < p.size; t += 256) + { + int v_offset = base_offset + t; + afp v = buffer_ld1(bottom_top_blob_data, v_offset); + afp d = v - afp(mean_val); + sqsum += d * d; + } + +#if 
ncnn_subgroup_arithmetic + afp sg_sqsum = subgroupAdd(sqsum); + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(sg_sqsum); + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + afp v = lane < num_sg ? afp(sdata[lane]) : afp(0.f); + afp r_sqsum = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r_sqsum); + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v = lane < 4 ? afp(sdata[lane]) : afp(0.f); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r); + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + afp v = afp(sdata[base + lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[int(gl_SubgroupID)] = float(r); + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + afp v = afp(sdata[lane]); + afp r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata[0] = float(r); + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata[tid] = sdata[tid] + sdata[tid + stride]; + } + 
barrier(); + } + + if (tid == 0) + { + sdata[0] = sdata[0]; + } + } +#endif + + barrier(); + + float var_val = sdata[0] / area; + if (tid == 0) + { + var_data[channel_id] = var_val; + } + +#else + sdata[tid] = float(sqsum); + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata[tid] = sdata[tid] + sdata[tid + stride]; + } + barrier(); + } + + float var_val = sdata[0] / area; + if (tid == 0) + { + var_data[channel_id] = var_val; + } +#endif +} diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp new file mode 100644 index 000000000000..f0ad4e3042a5 --- /dev/null +++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp @@ -0,0 +1,362 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#version 450 + +#if ncnn_subgroup_arithmetic +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_KHR_shader_subgroup_arithmetic : enable +#if NCNN_fp16_storage +#extension GL_EXT_shader_subgroup_extended_types_float16 : require +#endif +#endif + +layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; +layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; +layout(binding = 2) writeonly buffer var_blob { float var_data[]; }; + +layout(push_constant) uniform parameter +{ + int cstep; + int size; +} p; + +shared vec4 sdata_v4[64]; + +void main() +{ + const int tid = int(gl_LocalInvocationID.x); + const int packed_channel_id = int(gl_WorkGroupID.y); + const int actual_channel_base = packed_channel_id * 4; + + const float area = float(p.size); + const int base_offset = packed_channel_id * p.cstep; + + // Phase 1: compute sum -> mean + vec4 sum = vec4(0.f); + for (int t = tid; t < p.size; t += 256) + { + int v_offset = base_offset + t; + vec4 v = vec4(buffer_ld4(bottom_top_blob_data, v_offset)); + sum += v; + } + +#if ncnn_subgroup_arithmetic + vec4 sg_sum = 
subgroupAdd(sum); + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = sg_sum; + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + vec4 v = lane < num_sg ? sdata_v4[lane] : vec4(0.f); + vec4 r_sum = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r_sum; + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v = lane < 4 ? sdata_v4[lane] : vec4(0.f); + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r; + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v = sdata_v4[lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r; + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + sdata_v4[0] = sdata_v4[0]; 
+ } + } +#endif + + barrier(); + + vec4 mean_v4 = sdata_v4[0] / area; + if (tid == 0) + { + sdata_v4[0] = mean_v4; + mean_data[actual_channel_base + 0] = mean_v4.r; + mean_data[actual_channel_base + 1] = mean_v4.g; + mean_data[actual_channel_base + 2] = mean_v4.b; + mean_data[actual_channel_base + 3] = mean_v4.a; + } + barrier(); + mean_v4 = sdata_v4[0]; + +#else + // non-subgroup fallback: use shared memory scalar tree reduce per component + sdata_v4[tid] = sum; + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; + } + barrier(); + } + + vec4 mean_v4 = sdata_v4[0] / area; + if (tid == 0) + { + sdata_v4[0] = mean_v4; + mean_data[actual_channel_base + 0] = mean_v4.r; + mean_data[actual_channel_base + 1] = mean_v4.g; + mean_data[actual_channel_base + 2] = mean_v4.b; + mean_data[actual_channel_base + 3] = mean_v4.a; + } + barrier(); + mean_v4 = sdata_v4[0]; +#endif + + // Phase 2: compute sqsum -> var + vec4 sqsum = vec4(0.f); + for (int t = tid; t < p.size; t += 256) + { + int v_offset = base_offset + t; + vec4 v = vec4(buffer_ld4(bottom_top_blob_data, v_offset)); + vec4 d = v - mean_v4; + sqsum += d * d; + } + +#if ncnn_subgroup_arithmetic + vec4 sg_sqsum = subgroupAdd(sqsum); + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = sg_sqsum; + } + + barrier(); + +#if ncnn_subgroupSize >= 16 + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + const int num_sg = int(gl_NumSubgroups); + + vec4 v = lane < num_sg ? 
sdata_v4[lane] : vec4(0.f); + vec4 r_sqsum = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r_sqsum; + } + } +#elif ncnn_subgroupSize == 8 + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 8; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v = lane < 4 ? sdata_v4[lane] : vec4(0.f); + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r; + } + } +#elif ncnn_subgroupSize == 4 + if (int(gl_SubgroupID) < 16) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) < 4) + { + const int lane = int(gl_SubgroupInvocationID); + const int base = int(gl_SubgroupID) * 4; + + vec4 v = sdata_v4[base + lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[int(gl_SubgroupID)] = r; + } + } + + barrier(); + + if (int(gl_SubgroupID) == 0) + { + const int lane = int(gl_SubgroupInvocationID); + + vec4 v = sdata_v4[lane]; + vec4 r = subgroupAdd(v); + + if (subgroupElect()) + { + sdata_v4[0] = r; + } + } +#else + { + const int num_sg = 256 / ncnn_subgroupSize; + + for (int stride = num_sg / 2; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; + } + barrier(); + } + + if (tid == 0) + { + sdata_v4[0] = sdata_v4[0]; + } + } +#endif + + barrier(); + + vec4 var_v4 = sdata_v4[0] / area; + if (tid == 0) + { + var_data[actual_channel_base + 0] = var_v4.r; + var_data[actual_channel_base + 1] = var_v4.g; + var_data[actual_channel_base + 2] = var_v4.b; + var_data[actual_channel_base + 3] = var_v4.a; + } + 
+#else + sdata_v4[tid] = sqsum; + barrier(); + + for (int stride = 128; stride > 0; stride >>= 1) + { + if (tid < stride) + { + sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; + } + barrier(); + } + + vec4 var_v4 = sdata_v4[0] / area; + if (tid == 0) + { + var_data[actual_channel_base + 0] = var_v4.r; + var_data[actual_channel_base + 1] = var_v4.g; + var_data[actual_channel_base + 2] = var_v4.b; + var_data[actual_channel_base + 3] = var_v4.a; + } +#endif +} diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt index 10c0535d8087..06ed1e80c966 100644 --- a/tests/perf/CMakeLists.txt +++ b/tests/perf/CMakeLists.txt @@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp) ncnn_add_layer_perf(Concat) ncnn_add_layer_perf(Sigmoid) ncnn_add_layer_perf(BatchNorm) +ncnn_add_layer_perf(InstanceNorm) # SDPA perf tests (decode and prefill phases) if(WITH_LAYER_sdpa) diff --git a/tests/perf/perf_instancenorm.cpp b/tests/perf/perf_instancenorm.cpp new file mode 100644 index 000000000000..5b22f3acb387 --- /dev/null +++ b/tests/perf/perf_instancenorm.cpp @@ -0,0 +1,37 @@ +// Copyright 2026 Futz12 +// SPDX-License-Identifier: BSD-3-Clause + +#include "perfutil.h" + +static void perf_instancenorm(int w, int h, int c) +{ + ncnn::ParamDict pd; + pd.set(0, c); + pd.set(1, 1e-5f); + pd.set(2, 1); + + std::vector<PerfMat> weights(2); + weights[0] = PerfMat(c, 1.0f); + weights[1] = PerfMat(c, 0.0f); + + perf_layer("InstanceNorm", pd, weights, PerfMat(w, h, c), "channels=%d", c); +} + +int main() +{ + // StyleGAN / diffusion representative shapes + perf_instancenorm(64, 64, 128); + perf_instancenorm(32, 32, 256); + perf_instancenorm(16, 16, 512); + perf_instancenorm(8, 8, 512); + + // Larger spatial + perf_instancenorm(224, 224, 64); + perf_instancenorm(224, 224, 3); + + // LLM-style degenerate case + perf_instancenorm(4096, 1, 1); + perf_instancenorm(512, 1, 1); + + return 0; +} From 1358f9946f346958b0051acdc4de1f3fc2236679 Mon Sep 17 00:00:00 2001 From: futz12 <1391525377@qq.com> Date: 
Fri, 29 May 2026 16:44:47 +0800 Subject: [PATCH 2/3] vulkan: remove dead #else branches in instancenorm subgroup reduce shaders --- .../shader/instancenorm_reduce_subgroup.comp | 46 +--------------- .../instancenorm_reduce_subgroup_pack4.comp | 53 +------------------ 2 files changed, 2 insertions(+), 97 deletions(-) diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp index 91117d4c121e..c91b0673e9f4 100644 --- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp +++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp @@ -3,13 +3,12 @@ #version 450 -#if ncnn_subgroup_arithmetic #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_arithmetic : enable #if NCNN_fp16_storage #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif -#endif + layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; @@ -40,7 +39,6 @@ void main() sum += v; } -#if ncnn_subgroup_arithmetic afp sg_sum = subgroupAdd(sum); if (subgroupElect()) { @@ -168,28 +166,6 @@ void main() barrier(); mean_val = sdata[0]; -#else - sdata[tid] = float(sum); - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata[tid] = sdata[tid] + sdata[tid + stride]; - } - barrier(); - } - - float mean_val = sdata[0] / area; - if (tid == 0) - { - sdata[0] = mean_val; - mean_data[channel_id] = mean_val; - } - barrier(); - mean_val = sdata[0]; -#endif // Phase 2: compute sqsum -> var afp sqsum = afp(0.f); @@ -201,7 +177,6 @@ void main() sqsum += d * d; } -#if ncnn_subgroup_arithmetic afp sg_sqsum = subgroupAdd(sqsum); if (subgroupElect()) { @@ -326,23 +301,4 @@ void main() var_data[channel_id] = var_val; } -#else - sdata[tid] = float(sqsum); - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata[tid] 
= sdata[tid] + sdata[tid + stride]; - } - barrier(); - } - - float var_val = sdata[0] / area; - if (tid == 0) - { - var_data[channel_id] = var_val; - } -#endif } diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp index f0ad4e3042a5..b6476f65d439 100644 --- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp +++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp @@ -3,13 +3,12 @@ #version 450 -#if ncnn_subgroup_arithmetic #extension GL_KHR_shader_subgroup_basic : enable #extension GL_KHR_shader_subgroup_arithmetic : enable #if NCNN_fp16_storage #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif -#endif + layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; @@ -41,7 +40,6 @@ void main() sum += v; } -#if ncnn_subgroup_arithmetic vec4 sg_sum = subgroupAdd(sum); if (subgroupElect()) { @@ -172,32 +170,6 @@ void main() barrier(); mean_v4 = sdata_v4[0]; -#else - // non-subgroup fallback: use shared memory scalar tree reduce per component - sdata_v4[tid] = sum; - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; - } - barrier(); - } - - vec4 mean_v4 = sdata_v4[0] / area; - if (tid == 0) - { - sdata_v4[0] = mean_v4; - mean_data[actual_channel_base + 0] = mean_v4.r; - mean_data[actual_channel_base + 1] = mean_v4.g; - mean_data[actual_channel_base + 2] = mean_v4.b; - mean_data[actual_channel_base + 3] = mean_v4.a; - } - barrier(); - mean_v4 = sdata_v4[0]; -#endif // Phase 2: compute sqsum -> var vec4 sqsum = vec4(0.f); @@ -209,7 +181,6 @@ void main() sqsum += d * d; } -#if ncnn_subgroup_arithmetic vec4 sg_sqsum = subgroupAdd(sqsum); if (subgroupElect()) { @@ -337,26 +308,4 @@ void main() var_data[actual_channel_base + 3] = var_v4.a; 
} -#else - sdata_v4[tid] = sqsum; - barrier(); - - for (int stride = 128; stride > 0; stride >>= 1) - { - if (tid < stride) - { - sdata_v4[tid] = sdata_v4[tid] + sdata_v4[tid + stride]; - } - barrier(); - } - - vec4 var_v4 = sdata_v4[0] / area; - if (tid == 0) - { - var_data[actual_channel_base + 0] = var_v4.r; - var_data[actual_channel_base + 1] = var_v4.g; - var_data[actual_channel_base + 2] = var_v4.b; - var_data[actual_channel_base + 3] = var_v4.a; - } -#endif } From 1cad3e0e1a77a2dd0761f7fbd5351e5d3d907baa Mon Sep 17 00:00:00 2001 From: futz12 <56149058+futz12@users.noreply.github.com> Date: Fri, 29 May 2026 08:47:17 +0000 Subject: [PATCH 3/3] apply code-format changes --- src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp | 3 --- .../vulkan/shader/instancenorm_reduce_subgroup_pack4.comp | 3 --- 2 files changed, 6 deletions(-) diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp index c91b0673e9f4..6ae8004298ab 100644 --- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp +++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp @@ -9,7 +9,6 @@ #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif - layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; layout(binding = 2) writeonly buffer var_blob { float var_data[]; }; @@ -166,7 +165,6 @@ void main() barrier(); mean_val = sdata[0]; - // Phase 2: compute sqsum -> var afp sqsum = afp(0.f); for (int t = tid; t < p.size; t += 256) @@ -300,5 +298,4 @@ void main() { var_data[channel_id] = var_val; } - } diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp index b6476f65d439..ab16c2e8f1ab 100644 --- a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp +++ 
b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp @@ -9,7 +9,6 @@ #extension GL_EXT_shader_subgroup_extended_types_float16 : require #endif - layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; }; layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; }; layout(binding = 2) writeonly buffer var_blob { float var_data[]; }; @@ -170,7 +169,6 @@ void main() barrier(); mean_v4 = sdata_v4[0]; - // Phase 2: compute sqsum -> var vec4 sqsum = vec4(0.f); for (int t = tid; t < p.size; t += 256) @@ -307,5 +305,4 @@ void main() var_data[actual_channel_base + 2] = var_v4.b; var_data[actual_channel_base + 3] = var_v4.a; } - }