diff --git a/src/layer/vulkan/instancenorm_vulkan.cpp b/src/layer/vulkan/instancenorm_vulkan.cpp
index 17cd106f3fcd..b35a97ad78ec 100644
--- a/src/layer/vulkan/instancenorm_vulkan.cpp
+++ b/src/layer/vulkan/instancenorm_vulkan.cpp
@@ -20,6 +20,9 @@ InstanceNorm_vulkan::InstanceNorm_vulkan()
     pipeline_instancenorm_coeffs = 0;
     pipeline_instancenorm_norm = 0;
 
+    pipeline_instancenorm_reduce_subgroup = 0;
+    pipeline_instancenorm_reduce_subgroup_pack4 = 0;
+
     pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
     pipeline_instancenorm_reduce_sum4_fp32_pack4[0] = 0;
     pipeline_instancenorm_reduce_sum4_fp32_pack4[1] = 0;
@@ -239,6 +242,17 @@ int InstanceNorm_vulkan::create_pipeline(const Option& opt)
         }
     }
 
+    if (vkdev->info.support_subgroup_ops() & VK_SUBGROUP_FEATURE_ARITHMETIC_BIT)
+    {
+        pipeline_instancenorm_reduce_subgroup = new Pipeline(vkdev);
+        pipeline_instancenorm_reduce_subgroup->set_local_size_xyz(256, 1, 1);
+        pipeline_instancenorm_reduce_subgroup->create(LayerShaderType::instancenorm_reduce_subgroup, opt, std::vector<vk_specialization_type>());
+
+        pipeline_instancenorm_reduce_subgroup_pack4 = new Pipeline(vkdev);
+        pipeline_instancenorm_reduce_subgroup_pack4->set_local_size_xyz(256, 1, 1);
+        pipeline_instancenorm_reduce_subgroup_pack4->create(LayerShaderType::instancenorm_reduce_subgroup_pack4, opt, std::vector<vk_specialization_type>());
+    }
+
     return 0;
 }
 
@@ -264,6 +278,11 @@ int InstanceNorm_vulkan::destroy_pipeline(const Option& /*opt*/)
     delete pipeline_instancenorm_norm;
     pipeline_instancenorm_norm = 0;
 
+    delete pipeline_instancenorm_reduce_subgroup;
+    pipeline_instancenorm_reduce_subgroup = 0;
+    delete pipeline_instancenorm_reduce_subgroup_pack4;
+    pipeline_instancenorm_reduce_subgroup_pack4 = 0;
+
     delete pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4;
     pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 = 0;
 
@@ -308,200 +327,224 @@ int InstanceNorm_vulkan::forward_inplace(VkMat& bottom_top_blob, VkCompute& cmd,
     size_t elemsize = bottom_top_blob.elemsize;
     int elempack = bottom_top_blob.elempack;
 
-    // mean
+    // mean and var
     VkMat mean_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
+    VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+    const Pipeline* pipeline_reduce_subgroup = elempack == 4 ? pipeline_instancenorm_reduce_subgroup_pack4 : pipeline_instancenorm_reduce_subgroup;
+    if (pipeline_reduce_subgroup)
+    {
+        std::vector<VkMat> bindings(3);
+        bindings[0] = bottom_top_blob;
+        bindings[1] = mean_workspace;
+        bindings[2] = var_workspace;
+
+        std::vector<vk_constant_type> constants(2);
+        constants[0].i = bottom_top_blob.cstep;
+        constants[1].i = size;
+
+        VkMat dispatcher;
+        dispatcher.w = 1;
+        dispatcher.h = c;
+        dispatcher.c = 1;
+
+        cmd.record_pipeline(pipeline_reduce_subgroup, bindings, constants, dispatcher);
+    }
+    else
     {
-        // reduce sum
-        VkMat sum_workspace;
+        // mean
         {
-            int reduced_w = (size + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = bottom_top_blob.c;
+            // reduce sum
+            VkMat sum_workspace;
+            {
+                int reduced_w = (size + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = bottom_top_blob.c;
+
+                sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = bottom_top_blob;
+                    bindings[1] = sum_workspace;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = size;
+                    constants[1].i = 1;
+                    constants[2].i = bottom_top_blob.c;
+                    constants[3].i = bottom_top_blob.cstep;
+                    constants[4].i = sum_workspace.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace.c;
+                    constants[7].i = sum_workspace.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
+
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
+                }
+            }
 
-            sum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            int pb = 0;
+            while (sum_workspace.w > 4)
             {
-                std::vector<VkMat> bindings(2);
-                bindings[0] = bottom_top_blob;
-                bindings[1] = sum_workspace;
+                int reduced_w = (sum_workspace.w + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = sum_workspace.c;
 
-                std::vector<vk_constant_type> constants(8);
-                constants[0].i = size;
-                constants[1].i = 1;
-                constants[2].i = bottom_top_blob.c;
-                constants[3].i = bottom_top_blob.cstep;
-                constants[4].i = sum_workspace.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace.c;
-                constants[7].i = sum_workspace.cstep;
+                VkMat sum_workspace_reduced;
+                sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4 : pipeline_instancenorm_reduce_sum4_fp16_to_fp32;
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sum_workspace;
+                    bindings[1] = sum_workspace_reduced;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace);
-            }
-        }
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sum_workspace.c;
+                    constants[3].i = sum_workspace.cstep;
+                    constants[4].i = sum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sum_workspace_reduced.c;
+                    constants[7].i = sum_workspace_reduced.cstep;
 
-        int pb = 0;
-        while (sum_workspace.w > 4)
-        {
-            int reduced_w = (sum_workspace.w + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = sum_workspace.c;
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
 
-            VkMat sum_workspace_reduced;
-            sum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
+
+                    pb++;
+                }
+
+                sum_workspace = sum_workspace_reduced;
+            }
 
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sum_workspace;
-                bindings[1] = sum_workspace_reduced;
+                bindings[1] = mean_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(5);
                 constants[0].i = sum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sum_workspace.c;
                 constants[3].i = sum_workspace.cstep;
-                constants[4].i = sum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sum_workspace_reduced.c;
-                constants[7].i = sum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = size;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
             }
-
-            sum_workspace = sum_workspace_reduced;
         }
 
+        // var
         {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sum_workspace;
-            bindings[1] = mean_workspace;
+            // sub mean and square
+            VkMat square_workspace;
+            square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
+            {
+                std::vector<VkMat> bindings(3);
+                bindings[0] = bottom_top_blob;
+                bindings[1] = mean_workspace;
+                bindings[2] = square_workspace;
+
+                std::vector<vk_constant_type> constants(10);
+                constants[0].i = std::min(3, bottom_top_blob.dims);
+                constants[1].i = bottom_top_blob.w;
+                constants[2].i = h;
+                constants[3].i = bottom_top_blob.c;
+                constants[4].i = bottom_top_blob.cstep;
+                constants[5].i = square_workspace.dims;
+                constants[6].i = square_workspace.w;
+                constants[7].i = square_workspace.h;
+                constants[8].i = square_workspace.c;
+                constants[9].i = square_workspace.cstep;
+
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square;
+
+                cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
+            }
 
-            std::vector<vk_constant_type> constants(5);
-            constants[0].i = sum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sum_workspace.c;
-            constants[3].i = sum_workspace.cstep;
-            constants[4].f = size;
+            // reduce square
+            VkMat sqsum_workspace;
+            {
+                int reduced_w = (size + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = square_workspace.c;
+
+                sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = square_workspace;
+                    bindings[1] = sqsum_workspace;
+
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = size;
+                    constants[1].i = 1;
+                    constants[2].i = square_workspace.c;
+                    constants[3].i = square_workspace.cstep;
+                    constants[4].i = sqsum_workspace.w;
+                    constants[5].i = 1;
+                    constants[6].i = sqsum_workspace.c;
+                    constants[7].i = sqsum_workspace.cstep;
+
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0];
+
+                    cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace);
+                }
+            }
 
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
+            int pb = 1;
+            while (sqsum_workspace.w > 4)
+            {
+                int reduced_w = (sqsum_workspace.w + 3) / 4;
+                int reduced_h = 1;
+                int reduced_c = sqsum_workspace.c;
 
-            cmd.record_pipeline(pipeline, bindings, constants, mean_workspace);
-        }
-    }
+                VkMat sqsum_workspace_reduced;
+                sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
-    // var
-    VkMat var_workspace(c, 4u * elempack, elempack, opt.workspace_vkallocator);
-    {
-        // sub mean and square
-        VkMat square_workspace;
-        square_workspace.create(w, h, c, 4u * elempack, elempack, opt.workspace_vkallocator);
-        {
-            std::vector<VkMat> bindings(3);
-            bindings[0] = bottom_top_blob;
-            bindings[1] = mean_workspace;
-            bindings[2] = square_workspace;
-
-            std::vector<vk_constant_type> constants(10);
-            constants[0].i = std::min(3, bottom_top_blob.dims);
-            constants[1].i = bottom_top_blob.w;
-            constants[2].i = h;
-            constants[3].i = bottom_top_blob.c;
-            constants[4].i = bottom_top_blob.cstep;
-            constants[5].i = square_workspace.dims;
-            constants[6].i = square_workspace.w;
-            constants[7].i = square_workspace.h;
-            constants[8].i = square_workspace.c;
-            constants[9].i = square_workspace.cstep;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_sub_mean_square_pack4 : pipeline_instancenorm_sub_mean_square;
-
-            cmd.record_pipeline(pipeline, bindings, constants, square_workspace);
-        }
+                {
+                    std::vector<VkMat> bindings(2);
+                    bindings[0] = sqsum_workspace;
+                    bindings[1] = sqsum_workspace_reduced;
 
-        // reduce square
-        VkMat sqsum_workspace;
-        {
-            int reduced_w = (size + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = square_workspace.c;
+                    std::vector<vk_constant_type> constants(8);
+                    constants[0].i = sqsum_workspace.w;
+                    constants[1].i = 1;
+                    constants[2].i = sqsum_workspace.c;
+                    constants[3].i = sqsum_workspace.cstep;
+                    constants[4].i = sqsum_workspace_reduced.w;
+                    constants[5].i = 1;
+                    constants[6].i = sqsum_workspace_reduced.c;
+                    constants[7].i = sqsum_workspace_reduced.cstep;
 
-            sqsum_workspace.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
+                    const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
 
-            {
-                std::vector<VkMat> bindings(2);
-                bindings[0] = square_workspace;
-                bindings[1] = sqsum_workspace;
+                    cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
 
-                std::vector<vk_constant_type> constants(8);
-                constants[0].i = size;
-                constants[1].i = 1;
-                constants[2].i = square_workspace.c;
-                constants[3].i = square_workspace.cstep;
-                constants[4].i = sqsum_workspace.w;
-                constants[5].i = 1;
-                constants[6].i = sqsum_workspace.c;
-                constants[7].i = sqsum_workspace.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[0] : pipeline_instancenorm_reduce_sum4_fp32[0];
+                    pb++;
+                }
 
-                cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace);
+                sqsum_workspace = sqsum_workspace_reduced;
             }
-        }
-
-        int pb = 1;
-        while (sqsum_workspace.w > 4)
-        {
-            int reduced_w = (sqsum_workspace.w + 3) / 4;
-            int reduced_h = 1;
-            int reduced_c = sqsum_workspace.c;
-
-            VkMat sqsum_workspace_reduced;
-            sqsum_workspace_reduced.create(reduced_w, reduced_h, reduced_c, 4u * elempack, elempack, opt.workspace_vkallocator);
 
             {
                 std::vector<VkMat> bindings(2);
                 bindings[0] = sqsum_workspace;
-                bindings[1] = sqsum_workspace_reduced;
+                bindings[1] = var_workspace;
 
-                std::vector<vk_constant_type> constants(8);
+                std::vector<vk_constant_type> constants(5);
                 constants[0].i = sqsum_workspace.w;
                 constants[1].i = 1;
                 constants[2].i = sqsum_workspace.c;
                 constants[3].i = sqsum_workspace.cstep;
-                constants[4].i = sqsum_workspace_reduced.w;
-                constants[5].i = 1;
-                constants[6].i = sqsum_workspace_reduced.c;
-                constants[7].i = sqsum_workspace_reduced.cstep;
-
-                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_sum4_fp32_pack4[pb % 2] : pipeline_instancenorm_reduce_sum4_fp32[pb % 2];
+                constants[4].f = size;
 
-                cmd.record_pipeline(pipeline, bindings, constants, sqsum_workspace_reduced);
+                const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
 
-                pb++;
+                cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
             }
-
-            sqsum_workspace = sqsum_workspace_reduced;
-        }
-
-        {
-            std::vector<VkMat> bindings(2);
-            bindings[0] = sqsum_workspace;
-            bindings[1] = var_workspace;
-
-            std::vector<vk_constant_type> constants(5);
-            constants[0].i = sqsum_workspace.w;
-            constants[1].i = 1;
-            constants[2].i = sqsum_workspace.c;
-            constants[3].i = sqsum_workspace.cstep;
-            constants[4].f = size;
-
-            const Pipeline* pipeline = elempack == 4 ? pipeline_instancenorm_reduce_mean_pack4 : pipeline_instancenorm_reduce_mean;
-
-            cmd.record_pipeline(pipeline, bindings, constants, var_workspace);
         }
     }
 
diff --git a/src/layer/vulkan/instancenorm_vulkan.h b/src/layer/vulkan/instancenorm_vulkan.h
index e2a27de85129..edec5a918e1c 100644
--- a/src/layer/vulkan/instancenorm_vulkan.h
+++ b/src/layer/vulkan/instancenorm_vulkan.h
@@ -32,6 +32,9 @@ class InstanceNorm_vulkan : public InstanceNorm
     Pipeline* pipeline_instancenorm_coeffs;
     Pipeline* pipeline_instancenorm_norm;
 
+    Pipeline* pipeline_instancenorm_reduce_subgroup;
+    Pipeline* pipeline_instancenorm_reduce_subgroup_pack4;
+
     Pipeline* pipeline_instancenorm_reduce_sum4_fp16_to_fp32_pack4;
     Pipeline* pipeline_instancenorm_reduce_sum4_fp32_pack4[2];
     Pipeline* pipeline_instancenorm_reduce_mean_pack4;
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
new file mode 100644
--- /dev/null
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup.comp
@@ -0,0 +1,124 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+// Fused instance norm statistics: one workgroup per channel computes the mean
+// and the biased variance (mean of squared deviations) over p.size elements.
+// Accumulation is always fp32, so no float16 subgroup extension is required.
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfp bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int cstep;
+    int size;
+} p;
+
+// must match set_local_size_xyz(256, 1, 1) in InstanceNorm_vulkan::create_pipeline
+const int local_size = 256;
+
+// one partial per subgroup, sized for the worst case of subgroup size 1
+shared float sdata[local_size];
+
+// Sum partial over the whole workgroup and return the total to every invocation.
+// Contains barriers, so it must be called from workgroup-uniform control flow.
+float workgroup_sum(float partial)
+{
+    const uint tid = gl_LocalInvocationID.x;
+    const uint num_sg = gl_NumSubgroups;
+
+    // stage 1: reduce inside each subgroup, one partial per subgroup
+    float sg_sum = subgroupAdd(partial);
+    if (subgroupElect())
+    {
+        sdata[gl_SubgroupID] = sg_sum;
+    }
+
+    barrier();
+
+    // stage 2: reduce the partials, both conditions are workgroup-uniform
+    if (num_sg <= gl_SubgroupSize)
+    {
+        // all partials fit in subgroup 0, which is the only reader and writer
+        if (gl_SubgroupID == 0u)
+        {
+            const uint lane = gl_SubgroupInvocationID;
+
+            float v = lane < num_sg ? sdata[lane] : 0.f;
+            float r = subgroupAdd(v);
+
+            if (subgroupElect())
+            {
+                sdata[0] = r;
+            }
+        }
+
+        barrier();
+    }
+    else
+    {
+        // small subgroups: shared memory tree, each step reads [half_n, n)
+        // and writes [0, n - half_n), so no invocation races with another
+        for (uint n = num_sg; n > 1u;)
+        {
+            const uint half_n = (n + 1u) >> 1;
+
+            if (tid < n - half_n)
+            {
+                sdata[tid] += sdata[tid + half_n];
+            }
+
+            barrier();
+
+            n = half_n;
+        }
+    }
+
+    const float total = sdata[0];
+
+    // keep sdata intact until every invocation has read the total, the next
+    // reduction overwrites it
+    barrier();
+
+    return total;
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int channel_id = int(gl_WorkGroupID.y);
+
+    const float area = float(p.size);
+    const int base_offset = channel_id * p.cstep;
+
+    // pass 1: mean
+    float sum = 0.f;
+    for (int t = tid; t < p.size; t += local_size)
+    {
+        sum += float(buffer_ld1(bottom_top_blob_data, base_offset + t));
+    }
+
+    const float mean_val = workgroup_sum(sum) / area;
+
+    // pass 2: variance around the mean from pass 1
+    float sqsum = 0.f;
+    for (int t = tid; t < p.size; t += local_size)
+    {
+        float d = float(buffer_ld1(bottom_top_blob_data, base_offset + t)) - mean_val;
+        sqsum += d * d;
+    }
+
+    const float var_val = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[channel_id] = mean_val;
+        var_data[channel_id] = var_val;
+    }
+}
diff --git a/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
new file mode 100644
--- /dev/null
+++ b/src/layer/vulkan/shader/instancenorm_reduce_subgroup_pack4.comp
@@ -0,0 +1,132 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#version 450
+
+#extension GL_KHR_shader_subgroup_basic : enable
+#extension GL_KHR_shader_subgroup_arithmetic : enable
+
+// Fused instance norm statistics for pack4 data: one workgroup per group of
+// four channels computes their means and biased variances over p.size elements.
+// Accumulation is always fp32, so no float16 subgroup extension is required.
+
+layout(binding = 0) readonly buffer bottom_top_blob { sfpvec4 bottom_top_blob_data[]; };
+layout(binding = 1) writeonly buffer mean_blob { float mean_data[]; };
+layout(binding = 2) writeonly buffer var_blob { float var_data[]; };
+
+layout(push_constant) uniform parameter
+{
+    int cstep;
+    int size;
+} p;
+
+// must match set_local_size_xyz(256, 1, 1) in InstanceNorm_vulkan::create_pipeline
+const int local_size = 256;
+
+// one partial per subgroup, sized for the worst case of subgroup size 1
+shared vec4 sdata_v4[local_size];
+
+// Sum partial over the whole workgroup and return the total to every invocation.
+// Contains barriers, so it must be called from workgroup-uniform control flow.
+vec4 workgroup_sum(vec4 partial)
+{
+    const uint tid = gl_LocalInvocationID.x;
+    const uint num_sg = gl_NumSubgroups;
+
+    // stage 1: reduce inside each subgroup, one partial per subgroup
+    vec4 sg_sum = subgroupAdd(partial);
+    if (subgroupElect())
+    {
+        sdata_v4[gl_SubgroupID] = sg_sum;
+    }
+
+    barrier();
+
+    // stage 2: reduce the partials, both conditions are workgroup-uniform
+    if (num_sg <= gl_SubgroupSize)
+    {
+        // all partials fit in subgroup 0, which is the only reader and writer
+        if (gl_SubgroupID == 0u)
+        {
+            const uint lane = gl_SubgroupInvocationID;
+
+            vec4 v = lane < num_sg ? sdata_v4[lane] : vec4(0.f);
+            vec4 r = subgroupAdd(v);
+
+            if (subgroupElect())
+            {
+                sdata_v4[0] = r;
+            }
+        }
+
+        barrier();
+    }
+    else
+    {
+        // small subgroups: shared memory tree, each step reads [half_n, n)
+        // and writes [0, n - half_n), so no invocation races with another
+        for (uint n = num_sg; n > 1u;)
+        {
+            const uint half_n = (n + 1u) >> 1;
+
+            if (tid < n - half_n)
+            {
+                sdata_v4[tid] += sdata_v4[tid + half_n];
+            }
+
+            barrier();
+
+            n = half_n;
+        }
+    }
+
+    const vec4 total = sdata_v4[0];
+
+    // keep sdata_v4 intact until every invocation has read the total, the next
+    // reduction overwrites it
+    barrier();
+
+    return total;
+}
+
+void main()
+{
+    const int tid = int(gl_LocalInvocationID.x);
+    const int packed_channel_id = int(gl_WorkGroupID.y);
+    const int actual_channel_base = packed_channel_id * 4;
+
+    const float area = float(p.size);
+    const int base_offset = packed_channel_id * p.cstep;
+
+    // pass 1: mean
+    vec4 sum = vec4(0.f);
+    for (int t = tid; t < p.size; t += local_size)
+    {
+        sum += vec4(buffer_ld4(bottom_top_blob_data, base_offset + t));
+    }
+
+    const vec4 mean_v4 = workgroup_sum(sum) / area;
+
+    // pass 2: variance around the mean from pass 1
+    vec4 sqsum = vec4(0.f);
+    for (int t = tid; t < p.size; t += local_size)
+    {
+        vec4 d = vec4(buffer_ld4(bottom_top_blob_data, base_offset + t)) - mean_v4;
+        sqsum += d * d;
+    }
+
+    const vec4 var_v4 = workgroup_sum(sqsum) / area;
+
+    if (tid == 0)
+    {
+        mean_data[actual_channel_base + 0] = mean_v4.r;
+        mean_data[actual_channel_base + 1] = mean_v4.g;
+        mean_data[actual_channel_base + 2] = mean_v4.b;
+        mean_data[actual_channel_base + 3] = mean_v4.a;
+
+        var_data[actual_channel_base + 0] = var_v4.r;
+        var_data[actual_channel_base + 1] = var_v4.g;
+        var_data[actual_channel_base + 2] = var_v4.b;
+        var_data[actual_channel_base + 3] = var_v4.a;
+    }
+}
diff --git a/tests/perf/CMakeLists.txt b/tests/perf/CMakeLists.txt
index 10c0535d8087..06ed1e80c966 100644
--- a/tests/perf/CMakeLists.txt
+++ b/tests/perf/CMakeLists.txt
@@ -38,6 +38,7 @@ ncnn_add_layer_perf(BinaryOp)
 ncnn_add_layer_perf(Concat)
 ncnn_add_layer_perf(Sigmoid)
 ncnn_add_layer_perf(BatchNorm)
+ncnn_add_layer_perf(InstanceNorm)
 
 # SDPA perf tests (decode and prefill phases)
 if(WITH_LAYER_sdpa)
diff --git a/tests/perf/perf_instancenorm.cpp b/tests/perf/perf_instancenorm.cpp
new file mode 100644
index 000000000000..5b22f3acb387
--- /dev/null
+++ b/tests/perf/perf_instancenorm.cpp
@@ -0,0 +1,37 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "perfutil.h"
+
+static void perf_instancenorm(int w, int h, int c)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, c);
+    pd.set(1, 1e-5f);
+    pd.set(2, 1);
+
+    std::vector<ncnn::Mat> weights(2);
+    weights[0] = PerfMat(c, 1.0f);
+    weights[1] = PerfMat(c, 0.0f);
+
+    perf_layer("InstanceNorm", pd, weights, PerfMat(w, h, c), "channels=%d", c);
+}
+
+int main()
+{
+    // StyleGAN / diffusion representative shapes
+    perf_instancenorm(64, 64, 128);
+    perf_instancenorm(32, 32, 256);
+    perf_instancenorm(16, 16, 512);
+    perf_instancenorm(8, 8, 512);
+
+    // Larger spatial
+    perf_instancenorm(224, 224, 64);
+    perf_instancenorm(224, 224, 3);
+
+    // LLM-style degenerate case
+    perf_instancenorm(4096, 1, 1);
+    perf_instancenorm(512, 1, 1);
+
+    return 0;
+}