diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 68a4912f199a..37626773ca53 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -58,6 +58,9 @@ if(NCNN_PIXEL)
     ncnn_add_example(yolov8_cls)
     ncnn_add_example(yolox)
     ncnn_add_example(yolo11)
+    if(NCNN_VULKAN)
+        ncnn_add_example(yolo11_vk_e2e)
+    endif()
     ncnn_add_example(yolo11_seg)
     ncnn_add_example(yolo11_pose)
     ncnn_add_example(yolo11_cls)
diff --git a/examples/yolo11_vk_e2e.cpp b/examples/yolo11_vk_e2e.cpp
new file mode 100644
index 000000000000..f0df71d180e6
--- /dev/null
+++ b/examples/yolo11_vk_e2e.cpp
@@ -0,0 +1,1163 @@
+// Copyright 2026 Futz12
+// SPDX-License-Identifier: BSD-3-Clause
+
+// yolo11 vulkan preprocess + postprocess example
+// demonstrate:
+// 1. custom vulkan operator with inline GLSL compute shader
+// 2. net vulkan zero-copy via VkMat chaining
+// 3. GLSL-based image preprocessing (BGR->RGB + bilinear resize + normalize + letterbox)
+// 4. GLSL-based postprocessing (DFL decode + bbox decode + parallel NMS)
+
+#include "layer.h"
+#include "net.h"
+
+#include "gpu.h"
+#include "pipeline.h"
+#include "command.h"
+#include "allocator.h"
+
+#if defined(USE_NCNN_SIMPLEOCV)
+#include "simpleocv.h"
+#else
+#include <opencv2/core/core.hpp>
+#include <opencv2/highgui/highgui.hpp>
+#include <opencv2/imgproc/imgproc.hpp>
+#endif
+#include <stdint.h>
+#include <stdio.h>
+#include <string.h>
+#include <vector>
+
+// one detection, in original-image pixel coordinates
+struct Object
+{
+    cv::Rect_<float> rect;
+    int label;
+    float prob;
+};
+
+// Compile an inline GLSL compute shader and wrap it in a ready-to-record pipeline.
+// Returns 0 on failure (already logged); nothing is leaked in that case.
+static ncnn::Pipeline* create_compute_pipeline(const ncnn::VulkanDevice* vkdev, const char* name, const char* comp_data, int local_size_x, int local_size_y, const ncnn::Option& opt)
+{
+    std::vector<uint32_t> spirv;
+    int ret = ncnn::compile_spirv_module(comp_data, (int)strlen(comp_data), opt, spirv);
+    if (ret != 0)
+    {
+        NCNN_LOGE("compile %s spirv failed %d", name, ret);
+        return 0;
+    }
+
+    ncnn::Pipeline* pipeline = new ncnn::Pipeline(vkdev);
+    pipeline->set_optimal_local_size_xyz(local_size_x, local_size_y, 1);
+    ret = pipeline->create(spirv.data(), spirv.size() * sizeof(uint32_t), std::vector<ncnn::vk_specialization_type>());
+    if (ret != 0)
+    {
+        NCNN_LOGE("create %s pipeline failed %d", name, ret);
+        delete pipeline;
+        return 0;
+    }
+
+    return pipeline;
+}
+
+// custom vulkan layer: YoloPreprocess
+// performs BGR->RGB + bilinear resize + normalize(1/255) + letterbox pad(114/255) on GPU
+class YoloPreprocess : public ncnn::Layer
+{
+public:
+    YoloPreprocess();
+    virtual ~YoloPreprocess();
+
+    virtual int create_pipeline(const ncnn::Option& opt);
+    virtual int destroy_pipeline(const ncnn::Option& opt);
+
+    virtual int forward(const ncnn::VkMat& bottom_blob, ncnn::VkMat& top_blob, ncnn::VkCompute& cmd, const ncnn::Option& opt) const;
+
+public:
+    // runtime letterbox geometry, filled in by the caller before forward()
+    int target_size;
+    float scale;
+    int pad_left;
+    int pad_top;
+    int dst_w;
+    int dst_h;
+    int src_w;
+    int src_h;
+    int stride; // bytes per source row
+
+private:
+    ncnn::Pipeline* pipeline_preprocess;
+};
+
+DEFINE_LAYER_CREATOR(YoloPreprocess)
+
+YoloPreprocess::YoloPreprocess()
+{
+    support_vulkan = true;
+    one_blob_only = true;
+
+    pipeline_preprocess = 0;
+
+    target_size = 640;
+    scale = 1.f;
+    pad_left = 0;
+    pad_top = 0;
+    dst_w = 0;
+    dst_h = 0;
+    src_w = 0;
+    src_h = 0;
+    stride = 0;
+}
+
+YoloPreprocess::~YoloPreprocess()
+{
+    // safe after destroy_pipeline(): the pointer is reset to 0 there
+    delete pipeline_preprocess;
+}
+
+// GLSL compute shader for yolo preprocess
+// input: interleaved BGR uint8 raw bytes (binding 0) -- read via uint[] + bit-shift
+// output: planar RGB sfp (binding 1) -- sfp auto-adapts to fp32/fp16/bf16 storage
+// uses afp for arithmetic precision, buffer_st1 for storage write
+static const char yolo_preprocess_comp[] = R"(
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    int src_w;
+    int src_h;
+    int dst_w;
+    int dst_h;
+    int pad_left;
+    int pad_top;
+    float scale;
+    int stride; // bytes per row of source image
+    int dst_cstep;
+} p;
+
+layout (binding = 0) readonly buffer src_blob { uint src_data[]; };
+layout (binding = 1) writeonly buffer dst_blob { sfp dst_data[]; };
+
+// read one byte from the uint-packed buffer (little-endian host layout)
+uint read_u8(int byte_idx)
+{
+    int word_idx = byte_idx / 4;
+    int byte_offset = byte_idx % 4;
+    return (src_data[word_idx] >> (byte_offset * 8)) & 0xFFu;
+}
+
+void main()
+{
+    int gx = int(gl_GlobalInvocationID.x);
+    int gy = int(gl_GlobalInvocationID.y);
+    if (gx >= p.dst_w || gy >= p.dst_h)
+        return;
+
+    int dst_idx = gy * p.dst_w + gx;
+
+    afp r, g, b;
+
+    int resize_w = int(float(p.src_w) * p.scale);
+    int resize_h = int(float(p.src_h) * p.scale);
+
+    if (gx < p.pad_left || gx >= p.pad_left + resize_w || gy < p.pad_top || gy >= p.pad_top + resize_h)
+    {
+        r = afp(114.0 / 255.0);
+        g = afp(114.0 / 255.0);
+        b = afp(114.0 / 255.0);
+    }
+    else
+    {
+        float src_x = (float(gx - p.pad_left) + 0.5f) / p.scale - 0.5f;
+        float src_y = (float(gy - p.pad_top) + 0.5f) / p.scale - 0.5f;
+
+        int x0 = int(floor(src_x));
+        int y0 = int(floor(src_y));
+        int x1 = x0 + 1;
+        int y1 = y0 + 1;
+        float fx = src_x - float(x0);
+        float fy = src_y - float(y0);
+
+        x0 = clamp(x0, 0, p.src_w - 1);
+        y0 = clamp(y0, 0, p.src_h - 1);
+        x1 = clamp(x1, 0, p.src_w - 1);
+        y1 = clamp(y1, 0, p.src_h - 1);
+
+        int y0_offset = y0 * p.stride;
+        int y1_offset = y1 * p.stride;
+
+        // B channel
+        float b00 = float(read_u8(y0_offset + x0 * 3 + 0));
+        float b01 = float(read_u8(y0_offset + x1 * 3 + 0));
+        float b10 = float(read_u8(y1_offset + x0 * 3 + 0));
+        float b11 = float(read_u8(y1_offset + x1 * 3 + 0));
+        float bf = mix(mix(b00, b01, fx), mix(b10, b11, fx), fy) / 255.0;
+
+        // G channel
+        float g00 = float(read_u8(y0_offset + x0 * 3 + 1));
+        float g01 = float(read_u8(y0_offset + x1 * 3 + 1));
+        float g10 = float(read_u8(y1_offset + x0 * 3 + 1));
+        float g11 = float(read_u8(y1_offset + x1 * 3 + 1));
+        float gf = mix(mix(g00, g01, fx), mix(g10, g11, fx), fy) / 255.0;
+
+        // R channel
+        float r00 = float(read_u8(y0_offset + x0 * 3 + 2));
+        float r01 = float(read_u8(y0_offset + x1 * 3 + 2));
+        float r10 = float(read_u8(y1_offset + x0 * 3 + 2));
+        float r11 = float(read_u8(y1_offset + x1 * 3 + 2));
+        float rf = mix(mix(r00, r01, fx), mix(r10, r11, fx), fy) / 255.0;
+
+        // BGR -> RGB
+        r = afp(rf);
+        g = afp(gf);
+        b = afp(bf);
+    }
+
+    buffer_st1(dst_data, dst_idx + 0 * p.dst_cstep, r);
+    buffer_st1(dst_data, dst_idx + 1 * p.dst_cstep, g);
+    buffer_st1(dst_data, dst_idx + 2 * p.dst_cstep, b);
+}
+)";
+
+int YoloPreprocess::create_pipeline(const ncnn::Option& opt)
+{
+    // 8x8 workgroup: one invocation per destination pixel
+    pipeline_preprocess = create_compute_pipeline(vkdev, "preprocess", yolo_preprocess_comp, 8, 8, opt);
+    if (!pipeline_preprocess)
+        return -1;
+
+    return 0;
+}
+
+int YoloPreprocess::destroy_pipeline(const ncnn::Option& /*opt*/)
+{
+    delete pipeline_preprocess;
+    pipeline_preprocess = 0;
+
+    return 0;
+}
+
+int YoloPreprocess::forward(const ncnn::VkMat& bottom_blob, ncnn::VkMat& top_blob, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
+{
+    // pack1 planar output; element width follows the storage precision selected in opt
+    int elempack = 1;
+    size_t elemsize = (opt.use_fp16_storage || opt.use_fp16_packed || opt.use_bf16_storage || opt.use_bf16_packed)
+                      ? elempack * 2u
+                      : elempack * 4u;
+
+    top_blob.create(dst_w, dst_h, 3, elemsize, elempack, opt.blob_vkallocator);
+    if (top_blob.empty())
+        return -100;
+
+    std::vector<ncnn::VkMat> bindings(2);
+    bindings[0] = bottom_blob;
+    bindings[1] = top_blob;
+
+    std::vector<ncnn::vk_constant_type> constants(9);
+    constants[0].i = src_w;
+    constants[1].i = src_h;
+    constants[2].i = dst_w;
+    constants[3].i = dst_h;
+    constants[4].i = pad_left;
+    constants[5].i = pad_top;
+    constants[6].f = scale;
+    constants[7].i = stride;
+    constants[8].i = (int)top_blob.cstep; // channel stride in elements, as actually allocated
+
+    ncnn::VkMat dispatcher;
+    dispatcher.w = dst_w;
+    dispatcher.h = dst_h;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline_preprocess, bindings, constants, dispatcher);
+
+    return 0;
+}
+
+// YoloPostprocess: GPU-based post-processing for YOLO11
+// 1. generate proposals from model output (softmax DFL + bbox decode + coordinate transform)
+// 2. parallel NMS (each thread checks all higher-score boxes)
+class YoloPostprocess
+{
+public:
+    YoloPostprocess();
+    ~YoloPostprocess();
+
+    int create_pipeline(ncnn::VulkanDevice* _vkdev, const ncnn::Option& opt);
+    int destroy_pipeline(const ncnn::Option& opt);
+
+    int generate(const ncnn::VkMat& pred, ncnn::VkMat& proposals, ncnn::VkCompute& cmd, const ncnn::Option& opt) const;
+    int nms(const ncnn::VkMat& proposals, ncnn::VkMat& picked, ncnn::VkCompute& cmd, const ncnn::Option& opt) const;
+
+    float prob_threshold;
+    float nms_threshold;
+    int num_class;
+    int num_anchor;
+    int img_w, img_h;
+    int pad_left, pad_top;
+    float scale;
+    int grid0, grid1, grid2;       // grid width per stride level
+    int grid0_h, grid1_h, grid2_h; // grid height per stride level
+
+private:
+    // owns raw pipelines, so copying is disabled (declared, never defined)
+    YoloPostprocess(const YoloPostprocess&);
+    YoloPostprocess& operator=(const YoloPostprocess&);
+
+    ncnn::VulkanDevice* vkdev;
+
+public:
+    ncnn::Pipeline* pipeline_generate_pack1;
+    ncnn::Pipeline* pipeline_generate_pack4;
+    ncnn::Pipeline* pipeline_nms;
+};
+
+// GLSL compute shader for generating YOLO proposals (pack1)
+// input: model pred output 144-dim per anchor (binding 0) -- sfp for ncnn storage compat
+// output: proposals float6 per anchor (x0,y0,x1,y1,score,label) (binding 1)
+static const char yolo_generate_comp_pack1[] = R"(
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    int num_anchor;
+    int num_class;
+    float prob_threshold;
+    int img_w;
+    int img_h;
+    int pad_left;
+    int pad_top;
+    float scale;
+    int grid0;
+    int grid1;
+    int grid2;
+    int stride0;
+    int stride1;
+    int stride2;
+    int grid0_h;
+    int grid1_h;
+    int grid2_h;
+} p;
+
+layout (binding = 0) readonly buffer pred_blob { sfp pred_data[]; };
+layout (binding = 1) writeonly buffer proposals_blob { float proposals_data[]; };
+
+void main()
+{
+    int idx = int(gl_GlobalInvocationID.x);
+    if (idx >= p.num_anchor)
+        return;
+
+    int grid0_sq = p.grid0 * p.grid0_h;
+    int grid01_sq = grid0_sq + p.grid1 * p.grid1_h;
+
+    int stride;
+    int grid_x, grid_y;
+    if (idx < grid0_sq)
+    {
+        stride = p.stride0;
+        grid_x = idx % p.grid0;
+        grid_y = idx / p.grid0;
+    }
+    else if (idx < grid01_sq)
+    {
+        int idx2 = idx - grid0_sq;
+        stride = p.stride1;
+        grid_x = idx2 % p.grid1;
+        grid_y = idx2 / p.grid1;
+    }
+    else
+    {
+        int idx2 = idx - grid01_sq;
+        stride = p.stride2;
+        grid_x = idx2 % p.grid2;
+        grid_y = idx2 / p.grid2;
+    }
+
+    int base = idx * 144;
+
+    // find max class score
+    afp max_score = afp(-1e9);
+    int label = -1;
+    for (int c = 0; c < p.num_class; c++)
+    {
+        afp s = buffer_ld1(pred_data, base + 64 + c);
+        if (s > max_score)
+        {
+            max_score = s;
+            label = c;
+        }
+    }
+    afp score = afp(1.0) / (afp(1.0) + exp(-max_score));
+
+    float x0, y0, x1, y1;
+    if (float(score) < p.prob_threshold)
+    {
+        x0 = y0 = x1 = y1 = 0.0;
+        score = afp(0.0);
+        label = -1;
+    }
+    else
+    {
+        // DFL softmax for l, t, r, b
+        afp l = afp(0.0), t = afp(0.0), r = afp(0.0), b = afp(0.0);
+        for (int k = 0; k < 4; k++)
+        {
+            afp vals[16];
+            afp maxv = afp(-1e9);
+            int offset = base + k * 16;
+            for (int i = 0; i < 16; i++)
+            {
+                vals[i] = buffer_ld1(pred_data, offset + i);
+                maxv = max(maxv, vals[i]);
+            }
+            afp sum = afp(0.0);
+            for (int i = 0; i < 16; i++)
+            {
+                vals[i] = exp(vals[i] - maxv);
+                sum += vals[i];
+            }
+            afp expect = afp(0.0);
+            for (int i = 0; i < 16; i++)
+            {
+                expect += afp(float(i)) * vals[i] / sum;
+            }
+
+            if (k == 0)
+                l = expect * afp(float(stride));
+            else if (k == 1)
+                t = expect * afp(float(stride));
+            else if (k == 2)
+                r = expect * afp(float(stride));
+            else
+                b = expect * afp(float(stride));
+        }
+
+        float pb_cx = (float(grid_x) + 0.5) * float(stride);
+        float pb_cy = (float(grid_y) + 0.5) * float(stride);
+
+        x0 = (pb_cx - float(l) - float(p.pad_left)) / p.scale;
+        y0 = (pb_cy - float(t) - float(p.pad_top)) / p.scale;
+        x1 = (pb_cx + float(r) - float(p.pad_left)) / p.scale;
+        y1 = (pb_cy + float(b) - float(p.pad_top)) / p.scale;
+
+        // clip to original image
+        x0 = clamp(x0, 0.0, float(p.img_w - 1));
+        y0 = clamp(y0, 0.0, float(p.img_h - 1));
+        x1 = clamp(x1, 0.0, float(p.img_w - 1));
+        y1 = clamp(y1, 0.0, float(p.img_h - 1));
+    }
+
+    proposals_data[idx * 6 + 0] = x0;
+    proposals_data[idx * 6 + 1] = y0;
+    proposals_data[idx * 6 + 2] = x1;
+    proposals_data[idx * 6 + 3] = y1;
+    proposals_data[idx * 6 + 4] = float(score);
+    proposals_data[idx * 6 + 5] = float(label);
+}
+)";
+
+// GLSL compute shader for generating YOLO proposals (pack4)
+// input: model pred output 144-dim per anchor (binding 0) -- sfp for ncnn storage compat
+// output: proposals float6 per anchor (x0,y0,x1,y1,score,label) (binding 1)
+static const char yolo_generate_comp_pack4[] = R"(
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    int num_anchor;
+    int num_class;
+    float prob_threshold;
+    int img_w;
+    int img_h;
+    int pad_left;
+    int pad_top;
+    float scale;
+    int grid0;
+    int grid1;
+    int grid2;
+    int stride0;
+    int stride1;
+    int stride2;
+    int grid0_h;
+    int grid1_h;
+    int grid2_h;
+} p;
+
+layout (binding = 0) readonly buffer pred_blob { sfp pred_data[]; };
+layout (binding = 1) writeonly buffer proposals_blob { float proposals_data[]; };
+
+void main()
+{
+    int idx = int(gl_GlobalInvocationID.x);
+    if (idx >= p.num_anchor)
+        return;
+
+    int grid0_sq = p.grid0 * p.grid0_h;
+    int grid01_sq = grid0_sq + p.grid1 * p.grid1_h;
+
+    int stride;
+    int grid_x, grid_y;
+    if (idx < grid0_sq)
+    {
+        stride = p.stride0;
+        grid_x = idx % p.grid0;
+        grid_y = idx / p.grid0;
+    }
+    else if (idx < grid01_sq)
+    {
+        int idx2 = idx - grid0_sq;
+        stride = p.stride1;
+        grid_x = idx2 % p.grid1;
+        grid_y = idx2 / p.grid1;
+    }
+    else
+    {
+        int idx2 = idx - grid01_sq;
+        stride = p.stride2;
+        grid_x = idx2 % p.grid2;
+        grid_y = idx2 / p.grid2;
+    }
+
+    int comp = idx % 4;
+    int packed_idx = idx / 4;
+    int base = packed_idx * 576;
+
+    // find max class score
+    afp max_score = afp(-1e9);
+    int label = -1;
+    for (int c = 0; c < p.num_class; c++)
+    {
+        afp s = buffer_ld1(pred_data, base + (64 + c) * 4 + comp);
+        if (s > max_score)
+        {
+            max_score = s;
+            label = c;
+        }
+    }
+    afp score = afp(1.0) / (afp(1.0) + exp(-max_score));
+
+    float x0, y0, x1, y1;
+    if (float(score) < p.prob_threshold)
+    {
+        x0 = y0 = x1 = y1 = 0.0;
+        score = afp(0.0);
+        label = -1;
+    }
+    else
+    {
+        // DFL softmax for l, t, r, b
+        afp l = afp(0.0), t = afp(0.0), r = afp(0.0), b = afp(0.0);
+        for (int k = 0; k < 4; k++)
+        {
+            afp vals[16];
+            afp maxv = afp(-1e9);
+            int offset = base + k * 64;
+            for (int i = 0; i < 16; i++)
+            {
+                vals[i] = buffer_ld1(pred_data, offset + i * 4 + comp);
+                maxv = max(maxv, vals[i]);
+            }
+            afp sum = afp(0.0);
+            for (int i = 0; i < 16; i++)
+            {
+                vals[i] = exp(vals[i] - maxv);
+                sum += vals[i];
+            }
+            afp expect = afp(0.0);
+            for (int i = 0; i < 16; i++)
+            {
+                expect += afp(float(i)) * vals[i] / sum;
+            }
+
+            if (k == 0)
+                l = expect * afp(float(stride));
+            else if (k == 1)
+                t = expect * afp(float(stride));
+            else if (k == 2)
+                r = expect * afp(float(stride));
+            else
+                b = expect * afp(float(stride));
+        }
+
+        float pb_cx = (float(grid_x) + 0.5) * float(stride);
+        float pb_cy = (float(grid_y) + 0.5) * float(stride);
+
+        x0 = (pb_cx - float(l) - float(p.pad_left)) / p.scale;
+        y0 = (pb_cy - float(t) - float(p.pad_top)) / p.scale;
+        x1 = (pb_cx + float(r) - float(p.pad_left)) / p.scale;
+        y1 = (pb_cy + float(b) - float(p.pad_top)) / p.scale;
+
+        // clip to original image
+        x0 = clamp(x0, 0.0, float(p.img_w - 1));
+        y0 = clamp(y0, 0.0, float(p.img_h - 1));
+        x1 = clamp(x1, 0.0, float(p.img_w - 1));
+        y1 = clamp(y1, 0.0, float(p.img_h - 1));
+    }
+
+    proposals_data[idx * 6 + 0] = x0;
+    proposals_data[idx * 6 + 1] = y0;
+    proposals_data[idx * 6 + 2] = x1;
+    proposals_data[idx * 6 + 3] = y1;
+    proposals_data[idx * 6 + 4] = float(score);
+    proposals_data[idx * 6 + 5] = float(label);
+}
+)";
+
+// GLSL compute shader for parallel NMS
+// input: proposals float6 per anchor (binding 0)
+// output: picked int per anchor (1=keep, 0=suppress) (binding 1)
+// note: a box is dropped by any same-label higher-scoring overlap, whether or not that box is itself kept
+static const char yolo_nms_comp[] = R"(
+#version 450
+
+layout (push_constant) uniform parameter
+{
+    int num_anchor;
+    float nms_threshold;
+} p;
+
+layout (binding = 0) readonly buffer proposals_blob { float proposals_data[]; };
+layout (binding = 1) writeonly buffer picked_blob { int picked_data[]; };
+
+float intersection_area(int i, int j)
+{
+    float x0 = max(proposals_data[i * 6 + 0], proposals_data[j * 6 + 0]);
+    float y0 = max(proposals_data[i * 6 + 1], proposals_data[j * 6 + 1]);
+    float x1 = min(proposals_data[i * 6 + 2], proposals_data[j * 6 + 2]);
+    float y1 = min(proposals_data[i * 6 + 3], proposals_data[j * 6 + 3]);
+    float w = max(x1 - x0, 0.0);
+    float h = max(y1 - y0, 0.0);
+    return w * h;
+}
+
+void main()
+{
+    int i = int(gl_GlobalInvocationID.x);
+    if (i >= p.num_anchor)
+        return;
+
+    float score_i = proposals_data[i * 6 + 4];
+    if (score_i <= 0.0)
+    {
+        picked_data[i] = 0;
+        return;
+    }
+
+    float area_i = (proposals_data[i * 6 + 2] - proposals_data[i * 6 + 0])
+                   * (proposals_data[i * 6 + 3] - proposals_data[i * 6 + 1]);
+
+    int keep = 1;
+    for (int j = 0; j < p.num_anchor; j++)
+    {
+        if (i == j)
+            continue;
+
+        float score_j = proposals_data[j * 6 + 4];
+        if (score_j < score_i)
+            continue;
+        if (score_j == score_i && j >= i)
+            continue;
+
+        int label_i = int(proposals_data[i * 6 + 5]);
+        int label_j = int(proposals_data[j * 6 + 5]);
+        if (label_i != label_j)
+            continue;
+
+        float inter = intersection_area(i, j);
+        float area_j = (proposals_data[j * 6 + 2] - proposals_data[j * 6 + 0])
+                       * (proposals_data[j * 6 + 3] - proposals_data[j * 6 + 1]);
+        float union_area = area_i + area_j - inter;
+
+        if (union_area > 0.0 && inter / union_area > p.nms_threshold)
+        {
+            keep = 0;
+            break;
+        }
+    }
+
+    picked_data[i] = keep;
+}
+)";
+
+YoloPostprocess::YoloPostprocess()
+{
+    vkdev = 0;
+    pipeline_generate_pack1 = 0;
+    pipeline_generate_pack4 = 0;
+    pipeline_nms = 0;
+
+    prob_threshold = 0.25f;
+    nms_threshold = 0.45f;
+    num_class = 80;
+    num_anchor = 8400;
+    img_w = 0;
+    img_h = 0;
+    pad_left = 0;
+    pad_top = 0;
+    scale = 1.f;
+    // grids default to a 640x640 input (strides 8/16/32), consistent with num_anchor = 8400
+    grid0 = grid0_h = 80;
+    grid1 = grid1_h = 40;
+    grid2 = grid2_h = 20;
+}
+
+YoloPostprocess::~YoloPostprocess()
+{
+    // releases whatever create_pipeline() built; a no-op after an explicit destroy_pipeline()
+    destroy_pipeline(ncnn::Option());
+}
+
+int YoloPostprocess::create_pipeline(ncnn::VulkanDevice* _vkdev, const ncnn::Option& opt)
+{
+    vkdev = _vkdev;
+
+    // drop pipelines from a previous call so a repeated create does not leak
+    destroy_pipeline(opt);
+
+    // one invocation per anchor, 256 per workgroup
+    pipeline_generate_pack1 = create_compute_pipeline(vkdev, "generate pack1", yolo_generate_comp_pack1, 256, 1, opt);
+    pipeline_generate_pack4 = create_compute_pipeline(vkdev, "generate pack4", yolo_generate_comp_pack4, 256, 1, opt);
+    pipeline_nms = create_compute_pipeline(vkdev, "nms", yolo_nms_comp, 256, 1, opt);
+    if (!pipeline_generate_pack1 || !pipeline_generate_pack4 || !pipeline_nms)
+    {
+        destroy_pipeline(opt);
+        return -1;
+    }
+
+    return 0;
+}
+
+int YoloPostprocess::destroy_pipeline(const ncnn::Option& /*opt*/)
+{
+    delete pipeline_generate_pack1;
+    pipeline_generate_pack1 = 0;
+    delete pipeline_generate_pack4;
+    pipeline_generate_pack4 = 0;
+    delete pipeline_nms;
+    pipeline_nms = 0;
+    return 0;
+}
+
+int YoloPostprocess::generate(const ncnn::VkMat& pred, ncnn::VkMat& proposals, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
+{
+    // pick the shader variant before allocating anything
+    ncnn::Pipeline* pipeline_generate = 0;
+    if (pred.elempack == 1)
+        pipeline_generate = pipeline_generate_pack1;
+    else if (pred.elempack == 4)
+        pipeline_generate = pipeline_generate_pack4;
+    else
+    {
+        NCNN_LOGE("unsupported pred elempack %d", pred.elempack);
+        return -1;
+    }
+
+    proposals.create(6, num_anchor, 1, 4u, 1, opt.blob_vkallocator);
+    if (proposals.empty())
+        return -100;
+
+    std::vector<ncnn::VkMat> bindings(2);
+    bindings[0] = pred;
+    bindings[1] = proposals;
+
+    std::vector<ncnn::vk_constant_type> constants(17);
+    constants[0].i = num_anchor;
+    constants[1].i = num_class;
+    constants[2].f = prob_threshold;
+    constants[3].i = img_w;
+    constants[4].i = img_h;
+    constants[5].i = pad_left;
+    constants[6].i = pad_top;
+    constants[7].f = scale;
+    constants[8].i = grid0;
+    constants[9].i = grid1;
+    constants[10].i = grid2;
+    constants[11].i = 8;  // stride0
+    constants[12].i = 16; // stride1
+    constants[13].i = 32; // stride2
+    constants[14].i = grid0_h;
+    constants[15].i = grid1_h;
+    constants[16].i = grid2_h;
+
+    ncnn::VkMat dispatcher;
+    dispatcher.w = num_anchor;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline_generate, bindings, constants, dispatcher);
+    return 0;
+}
+
+int YoloPostprocess::nms(const ncnn::VkMat& proposals, ncnn::VkMat& picked, ncnn::VkCompute& cmd, const ncnn::Option& opt) const
+{
+    picked.create(1, num_anchor, 1, 4u, 1, opt.blob_vkallocator);
+    if (picked.empty())
+        return -100;
+
+    std::vector<ncnn::VkMat> bindings(2);
+    bindings[0] = proposals;
+    bindings[1] = picked;
+
+    std::vector<ncnn::vk_constant_type> constants(2);
+    constants[0].i = num_anchor;
+    constants[1].f = nms_threshold;
+
+    ncnn::VkMat dispatcher;
+    dispatcher.w = num_anchor;
+    dispatcher.h = 1;
+    dispatcher.c = 1;
+
+    cmd.record_pipeline(pipeline_nms, bindings, constants, dispatcher);
+    return 0;
+}
+
+// Runs upload -> preprocess -> inference -> postprocess -> download on caller-provided allocators.
+// Every ncnn object created here is destroyed on return, whatever the result, so the caller
+// can always reclaim the allocators afterwards.
+static int run_yolo11_vk(const cv::Mat& bgr, std::vector<Object>& objects, ncnn::VulkanDevice* vkdev, ncnn::VkAllocator* blob_vkallocator, ncnn::VkAllocator* staging_vkallocator)
+{
+    const int target_size = 640;
+    const float prob_threshold = 0.25f;
+    const float nms_threshold = 0.45f;
+
+    int img_w = bgr.cols;
+    int img_h = bgr.rows;
+
+    // ultralytics/cfg/models/v8/yolo11.yaml
+    std::vector<int> strides(3);
+    strides[0] = 8;
+    strides[1] = 16;
+    strides[2] = 32;
+    const int max_stride = 32;
+
+    // letterbox pad to multiple of max_stride
+    int w = img_w;
+    int h = img_h;
+    float scale = 1.f;
+    if (w > h)
+    {
+        scale = (float)target_size / w;
+        w = target_size;
+        h = h * scale;
+    }
+    else
+    {
+        scale = (float)target_size / h;
+        h = target_size;
+        w = w * scale;
+    }
+
+    int wpad = (w + max_stride - 1) / max_stride * max_stride - w;
+    int hpad = (h + max_stride - 1) / max_stride * max_stride - h;
+
+    int dst_w = w + wpad;
+    int dst_h = h + hpad;
+
+    // load the model first and reuse its option for the custom pipelines, so the shaders
+    // are compiled with the same storage/arithmetic settings as the network
+    ncnn::Net yolo11;
+    yolo11.opt.use_vulkan_compute = true;
+    yolo11.opt.blob_vkallocator = blob_vkallocator;
+    yolo11.opt.workspace_vkallocator = blob_vkallocator;
+    yolo11.opt.staging_vkallocator = staging_vkallocator;
+
+    if (yolo11.load_param("yolo11n.ncnn.param") != 0 || yolo11.load_model("yolo11n.ncnn.bin") != 0)
+    {
+        NCNN_LOGE("load yolo11n.ncnn.param / yolo11n.ncnn.bin failed");
+        return -1;
+    }
+
+    const ncnn::Option& opt = yolo11.opt;
+
+    // step 1~2: upload + preprocess in cmd1
+    ncnn::VkCompute cmd1(vkdev);
+
+#if defined(USE_NCNN_SIMPLEOCV)
+    int stride = img_w * 3;
+    const uchar* bgr_data = bgr.data;
+#else
+    cv::Mat bgr_cont = bgr.isContinuous() ? bgr : bgr.clone();
+    int stride = (int)bgr_cont.step[0]; // bytes per row
+    const uchar* bgr_data = bgr_cont.data;
+#endif
+
+    // manually upload uint8 data via staging buffer to avoid ncnn's convert_packing
+    // (convert_packing does not support int8 -> fp16/fp32)
+    ncnn::VkMat staging_vkmat;
+    staging_vkmat.create(stride, img_h, 1, 1u, 1, opt.staging_vkallocator);
+    if (staging_vkmat.empty())
+    {
+        NCNN_LOGE("staging buffer allocation failed");
+        return -100;
+    }
+    memcpy(staging_vkmat.mapped_ptr(), bgr_data, (size_t)stride * img_h);
+    staging_vkmat.allocator->flush(staging_vkmat.data);
+    staging_vkmat.data->access_flags = VK_ACCESS_HOST_WRITE_BIT;
+    staging_vkmat.data->stage_flags = VK_PIPELINE_STAGE_HOST_BIT;
+
+    ncnn::VkMat in_vkmat;
+    cmd1.record_clone(staging_vkmat, in_vkmat, opt);
+
+    YoloPreprocess preprocess_layer;
+    preprocess_layer.vkdev = vkdev;
+    preprocess_layer.target_size = target_size;
+    preprocess_layer.scale = scale;
+    preprocess_layer.pad_left = wpad / 2;
+    preprocess_layer.pad_top = hpad / 2;
+    preprocess_layer.dst_w = dst_w;
+    preprocess_layer.dst_h = dst_h;
+    preprocess_layer.src_w = img_w;
+    preprocess_layer.src_h = img_h;
+    preprocess_layer.stride = stride;
+    int ret = preprocess_layer.create_pipeline(opt);
+    if (ret != 0)
+    {
+        NCNN_LOGE("preprocess create_pipeline failed %d", ret);
+        return -1;
+    }
+
+    ncnn::VkMat pre_vkmat;
+    ret = preprocess_layer.forward(in_vkmat, pre_vkmat, cmd1, opt);
+    if (ret != 0)
+    {
+        NCNN_LOGE("preprocess forward failed %d", ret);
+        return -1;
+    }
+    ret = cmd1.submit_and_wait();
+    if (ret != 0)
+    {
+        NCNN_LOGE("preprocess submit failed %d", ret);
+        return -1;
+    }
+    fprintf(stderr, "pre_vkmat: w=%d h=%d c=%d elemsize=%zu elempack=%d\n",
+            pre_vkmat.w, pre_vkmat.h, pre_vkmat.c, pre_vkmat.elemsize, pre_vkmat.elempack);
+
+    // step 3~5: inference + postprocess + download in cmd2
+    ncnn::Extractor ex = yolo11.create_extractor();
+    ret = ex.input("in0", pre_vkmat);
+    if (ret != 0)
+    {
+        NCNN_LOGE("set input in0 failed %d", ret);
+        return -1;
+    }
+
+    ncnn::VkCompute cmd2(vkdev);
+
+    // model inference
+    ncnn::VkMat out_vkmat;
+    ret = ex.extract("out0", out_vkmat, cmd2);
+    if (ret != 0)
+    {
+        NCNN_LOGE("extract out0 failed %d", ret);
+        return -1;
+    }
+    fprintf(stderr, "out_vkmat from extract: w=%d h=%d c=%d elemsize=%zu elempack=%d\n",
+            out_vkmat.w, out_vkmat.h, out_vkmat.c, out_vkmat.elemsize, out_vkmat.elempack);
+
+    // GPU postprocess (generate proposals + NMS)
+    // shader uses sfp/buffer_ld1 for pred_data, supports both pack1 and pack4
+    YoloPostprocess postprocess;
+    ret = postprocess.create_pipeline(vkdev, opt);
+    if (ret != 0)
+    {
+        NCNN_LOGE("postprocess create_pipeline failed %d", ret);
+        return -1;
+    }
+    postprocess.prob_threshold = prob_threshold;
+    postprocess.nms_threshold = nms_threshold;
+    postprocess.num_class = 80;
+    postprocess.num_anchor = out_vkmat.h * out_vkmat.elempack;
+    postprocess.img_w = img_w;
+    postprocess.img_h = img_h;
+    postprocess.pad_left = wpad / 2;
+    postprocess.pad_top = hpad / 2;
+    postprocess.scale = scale;
+    postprocess.grid0 = dst_w / strides[0];
+    postprocess.grid1 = dst_w / strides[1];
+    postprocess.grid2 = dst_w / strides[2];
+    postprocess.grid0_h = dst_h / strides[0];
+    postprocess.grid1_h = dst_h / strides[1];
+    postprocess.grid2_h = dst_h / strides[2];
+
+    ncnn::VkMat proposals_vkmat;
+    ncnn::VkMat picked_vkmat;
+    ret = postprocess.generate(out_vkmat, proposals_vkmat, cmd2, opt);
+    if (ret != 0)
+    {
+        NCNN_LOGE("postprocess generate failed %d", ret);
+        return -1;
+    }
+    ret = postprocess.nms(proposals_vkmat, picked_vkmat, cmd2, opt);
+    if (ret != 0)
+    {
+        NCNN_LOGE("postprocess nms failed %d", ret);
+        return -1;
+    }
+
+    // download results
+    ncnn::Mat proposals_mat;
+    ncnn::Mat picked_mat;
+    cmd2.record_download(proposals_vkmat, proposals_mat, opt);
+    cmd2.record_download(picked_vkmat, picked_mat, opt);
+    ret = cmd2.submit_and_wait();
+    if (ret != 0 || proposals_mat.empty() || picked_mat.empty())
+    {
+        NCNN_LOGE("postprocess submit/download failed %d", ret);
+        return -1;
+    }
+
+    const float* proposals_data = (const float*)proposals_mat.data;
+    const int* picked_data = (const int*)picked_mat.data;
+    for (int i = 0; i < postprocess.num_anchor; i++)
+    {
+        if (picked_data[i] == 0)
+            continue;
+        float x0 = proposals_data[i * 6 + 0];
+        float y0 = proposals_data[i * 6 + 1];
+        float x1 = proposals_data[i * 6 + 2];
+        float y1 = proposals_data[i * 6 + 3];
+        float score = proposals_data[i * 6 + 4];
+        int label = (int)proposals_data[i * 6 + 5];
+
+        if (label < 0 || score < prob_threshold)
+            continue;
+
+        Object obj;
+        obj.rect = cv::Rect_<float>(x0, y0, x1 - x0, y1 - y0);
+        obj.label = label;
+        obj.prob = score;
+        objects.push_back(obj);
+    }
+
+    postprocess.destroy_pipeline(opt);
+    preprocess_layer.destroy_pipeline(opt);
+
+    // blob_vkallocator->mappable indicates unified memory (iGPU), allowing zero-copy read via mapped_ptr()
+    return 0;
+}
+
+static int detect_yolo11_vk(const cv::Mat& bgr, std::vector<Object>& objects)
+{
+    // ===== Vulkan zero-copy preprocess + inference + postprocess =====
+    ncnn::VulkanDevice* vkdev = ncnn::get_gpu_device(ncnn::get_default_gpu_index());
+    if (!vkdev)
+    {
+        NCNN_LOGE("no vulkan device available");
+        return -1;
+    }
+
+    ncnn::VkAllocator* blob_vkallocator = vkdev->acquire_blob_allocator();
+    ncnn::VkAllocator* staging_vkallocator = vkdev->acquire_staging_allocator();
+
+    // all ncnn objects live inside run_yolo11_vk, so they are gone before the allocators
+    // are reclaimed -- on the failure paths as well
+    int ret = run_yolo11_vk(bgr, objects, vkdev, blob_vkallocator, staging_vkallocator);
+
+    vkdev->reclaim_blob_allocator(blob_vkallocator);
+    vkdev->reclaim_staging_allocator(staging_vkallocator);
+
+    return ret;
+}
+
+static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
+{
+    static const char* class_names[] = {
+        "person", "bicycle", "car", "motorcycle", "airplane", "bus", "train", "truck", "boat", "traffic light",
+        "fire hydrant", "stop sign", "parking meter", "bench", "bird", "cat", "dog", "horse", "sheep", "cow",
+        "elephant", "bear", "zebra", "giraffe", "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
+        "skis", "snowboard", "sports ball", "kite", "baseball bat", "baseball glove", "skateboard", "surfboard",
+        "tennis racket", "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl", "banana", "apple",
+        "sandwich", "orange", "broccoli", "carrot", "hot dog", "pizza", "donut", "cake", "chair", "couch",
+        "potted plant", "bed", "dining table", "toilet", "tv", "laptop", "mouse", "remote", "keyboard", "cell phone",
+        "microwave", "oven", "toaster", "sink", "refrigerator", "book", "clock", "vase", "scissors", "teddy bear",
+        "hair drier", "toothbrush"};
+
+    static cv::Scalar colors[] = {
+        cv::Scalar(244, 67, 54),
+        cv::Scalar(233, 30, 99),
+        cv::Scalar(156, 39, 176),
+        cv::Scalar(103, 58, 183),
+        cv::Scalar(63, 81, 181),
+        cv::Scalar(33, 150, 243),
+        cv::Scalar(3, 169, 244),
+        cv::Scalar(0, 188, 212),
+        cv::Scalar(0, 150, 136),
+        cv::Scalar(76, 175, 80),
+        cv::Scalar(139, 195, 74),
+        cv::Scalar(205, 220, 57),
+        cv::Scalar(255, 235, 59),
+        cv::Scalar(255, 193, 7),
+        cv::Scalar(255, 152, 0),
+        cv::Scalar(255, 87, 34),
+        cv::Scalar(121, 85, 72),
+        cv::Scalar(158, 158, 158),
+        cv::Scalar(96, 125, 139)};
+    const size_t color_count = sizeof(colors) / sizeof(colors[0]);
+
+    cv::Mat image = bgr.clone();
+
+    for (size_t i = 0; i < objects.size(); i++)
+    {
+        const Object& obj = objects[i];
+
+        const cv::Scalar& color = colors[i % color_count];
+
+        fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
+                obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
+
+        cv::rectangle(image, obj.rect, color);
+
+        char text[256];
+        sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
+
+        int baseLine = 0;
+        cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
+
+        int x = obj.rect.x;
+        int y = obj.rect.y - label_size.height - baseLine;
+        if (y < 0)
+            y = 0;
+        if (x + label_size.width > image.cols)
+            x = image.cols - label_size.width;
+
+        cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
+                      cv::Scalar(255, 255, 255), -1);
+
+        cv::putText(image, text, cv::Point(x, y + label_size.height),
+                    cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
+    }
+
+    cv::imshow("image", image);
+    cv::waitKey(0);
+}
+
+int main(int argc, char** argv)
+{
+    if (argc != 2)
+    {
+        fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
+        return -1;
+    }
+
+    const char* imagepath = argv[1];
+
+    cv::Mat m = cv::imread(imagepath, 1);
+    if (m.empty())
+    {
+        fprintf(stderr, "cv::imread %s failed\n", imagepath);
+        return -1;
+    }
+
+    std::vector<Object> objects;
+    if (detect_yolo11_vk(m, objects) != 0)
+    {
+        fprintf(stderr, "detect_yolo11_vk failed\n");
+        return -1;
+    }
+
+    draw_objects(m, objects);
+
+    return 0;
+}