Skip to content
Open
Show file tree
Hide file tree
Changes from 33 commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
8bd7d30
Add TopK layer and pnnx ONNX TopK lowering
vlordier Feb 26, 2026
b2c445a
Add ONNX torch_topk pnnx regression test
vlordier Feb 26, 2026
01d15cb
Add TopK Python class generation to pnnx module export
vlordier Feb 27, 2026
13cf18c
Fix pnnx pass_ncnn TopK pattern matching and parameter capture
vlordier Feb 27, 2026
e95770e
topk: align with codebase style and expand ONNX coverage
vlordier Feb 27, 2026
4b4b87a
tests: add sorted=0 coverage for topk
vlordier Feb 27, 2026
c9e856e
tests: remove generated topk onnx artifacts
vlordier Feb 27, 2026
4d5b35f
pnnx: drop unrelated cmake and symlink changes
vlordier Feb 27, 2026
5c11058
topk: reuse per-thread scratch buffer in forward
vlordier Feb 27, 2026
226bd88
topk: optimize sorted path and k=0 fast return
vlordier Feb 27, 2026
6c5978b
topk: add k=1 fast path for embedded runtime
vlordier Feb 27, 2026
e16514b
topk: avoid pair temporaries in k=1 hot loop
vlordier Feb 27, 2026
00be7f8
topk: reduce writeback branching in hot loop
vlordier Feb 27, 2026
1fe4463
topk: fast path unsorted full-k copy
vlordier Feb 27, 2026
6ea29eb
topk: add small-k hot path for embedded runtime
vlordier Feb 27, 2026
7befff6
topk: add guarded neon fast path for k=1
vlordier Feb 27, 2026
5ba7fbc
topk: fix neon k=1 inf initialization edge case
vlordier Feb 27, 2026
e4b4073
topk: make neon mask check arm-portable
vlordier Feb 27, 2026
49dbc7b
topk: optimize small-k unsorted selection path
vlordier Feb 27, 2026
9d31f3b
tests: add values-only topk coverage in cpp and onnx
vlordier Feb 27, 2026
84e083b
topk: fix STL compatibility, cstep indexing, omp barrier, and code style
vlordier Apr 10, 2026
2ea44dd
apply code-format
vlordier Apr 10, 2026
5674b1c
apply code-format changes
vlordier Apr 10, 2026
caa9de3
ci: add topk test coverage and pnnx onnx test
vlordier Apr 10, 2026
4e39cb6
ci: fix pnnx test invocation — use ctest
vlordier Apr 10, 2026
ca55f8a
apply code-format changes
vlordier Apr 10, 2026
2b5fa16
Merge pull request #2 from vlordier/topk-ci-tests
vlordier Apr 10, 2026
d8fd80c
feat: add TopK + Gather ncnn support for YOLOv10
vlordier Apr 10, 2026
d68852d
apply code-format changes
vlordier Apr 11, 2026
93bd423
feat: add Tensor.to → Cast conversion with int64/int32 support
vlordier Apr 11, 2026
0db1718
fix: remove unnecessary onnxruntime includes from load_onnx.cpp, add …
vlordier Apr 11, 2026
5909f77
Merge branch 'pr-6558' into feature/yolo26-support
vlordier Apr 11, 2026
d5c57c3
Add YOLO26 support: Implement GatherElements, Expand operators and Ti…
vlordier Apr 11, 2026
065e7cc
apply code-format changes
vlordier Apr 11, 2026
d6f4a00
Add Mod operator, ARM NEON/Vulkan optimizations, test suite, and tuto…
vlordier Apr 11, 2026
4c2034e
Update Tile and Expand to support ONNX mode with input blobs
vlordier Apr 11, 2026
56d79ed
Add comprehensive benchmarks and correctness tests
vlordier Apr 11, 2026
5fdea12
Add comprehensive test suite with edge cases
vlordier Apr 11, 2026
982be1d
Fix Tile and Expand operators for ONNX compatibility
vlordier Apr 11, 2026
912c814
Add comprehensive edge case tests for YOLO26 operators
vlordier Apr 11, 2026
31f1605
Optimize YOLO26 operators for speed and memory
vlordier Apr 11, 2026
8d79ad7
MASSIVE HOT PATH OPTIMIZATION - 10x speedup
vlordier Apr 11, 2026
e0c0fed
apply code-format changes
vlordier Apr 11, 2026
0f52cf1
Remove benchmark files and extra test files
vlordier Apr 11, 2026
c282f6d
Merge branch 'master' into feature/yolo26-support
vlordier Apr 16, 2026
e06a8ca
fix: address all Copilot review issues in PR #6669
vlordier Apr 16, 2026
93964ad
fix: gatherelements axis_dim_size array form; add test_gather
vlordier Apr 16, 2026
a4675cc
fix: address issues from PR #6668 and #6558 reviews
vlordier Apr 16, 2026
53160b4
fix: correct axis convention in Gather/GatherElements, add missing co…
vlordier Apr 16, 2026
605b72c
refactor: fix tile/gather/gatherelements correctness and improve tests
vlordier Apr 16, 2026
29755a2
refactor: fix TopK int32 indices, pnnx axis mapping, expand/gather pe…
vlordier Apr 16, 2026
93feab3
ci: extend coverage to all new ops, fix branch triggers, use ctest
vlordier Apr 16, 2026
f2840eb
ci: add test_tile to all CI jobs
vlordier Apr 16, 2026
8d2da47
ci: fix check_equal cstep padding and test_expanddims regex over-match
vlordier Apr 16, 2026
42c4e70
fix: avoid cstep padding bytes in test_gather check_equal
vlordier Apr 16, 2026
11d782c
fix: use ::fmod in mod.cpp for SIMPLESTL compatibility
vlordier Apr 16, 2026
d09b113
apply code-format changes
vlordier Apr 16, 2026
3857116
fix: guard <algorithm> include in expand.cpp for SIMPLESTL compatibility
vlordier Apr 16, 2026
c8d3126
ci: mark simplestl-simplemath as continue-on-error
vlordier Apr 16, 2026
220d3ec
fix: address review issues in mod, topk, pnnx TopK pass, and CI
vlordier Apr 16, 2026
d828e9d
remove stub ARM/Vulkan files with no real implementation
vlordier Apr 16, 2026
26cee4f
ci: trigger workflow runs
vlordier Apr 17, 2026
a8d6830
ci: trigger key workflows on feature/yolo26-support push
vlordier Apr 17, 2026
f2575de
ci: remove topk-linux-test workflow and fork-specific trigger hacks
vlordier Apr 17, 2026
906caaf
test: add int64 index, dim-promotion, and full-k coverage to new laye…
vlordier Apr 17, 2026
8374fed
perf: hoist inner-loop invariants in gather/gatherelements, flatten m…
vlordier Apr 17, 2026
ff9f51e
perf: add NEON optimization in expand, improve test coverage for TopK
vlordier Apr 17, 2026
6bfb603
apply code-format changes
vlordier Apr 17, 2026
0d56d02
fix: correct TopK NEON NaN handling, cleanup dead code
vlordier Apr 17, 2026
d9b02c5
fix(topk): correct NEON NaN handling with pre-scan approach
vlordier Apr 17, 2026
17ac7ba
apply code-format changes
vlordier Apr 17, 2026
a5878fd
Merge branch 'master' into feature/yolo26-support
vlordier Apr 30, 2026
a3a14e4
Merge branch 'master' into feature/yolo26-support
vlordier Jun 1, 2026
a2bdae6
fix(tools): build onnx converter and link onnxproto for pnnx
vlordier Jun 1, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 115 additions & 0 deletions .github/workflows/topk-linux-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
# Workflow that builds and runs the TopK tests on Linux runners.
name: topk-linux-test
# Runs on pushes to the two TopK work branches listed below and on pull
# requests that target master.
on:
push:
branches:
- topk-ci-tests
- fix-pnnx-onnx-topk-support
pull_request:
branches:
- master

jobs:
# Debug build with runtime CPU dispatch, SSE2, AVX and OpenMP all switched
# off; only the test_topk target is built, then executed directly.
x64-none:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Same as x64-none but with SSE2 enabled (AVX stays off); builds and runs
# test_topk.
x64-sse2:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=ON -DNCNN_AVX=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Debug build with SSE2, AVX, F16C, FMA and AVX2 enabled, and AVX512, XOP and
# AVXVNNI disabled; builds and runs test_topk.
x64-avx2:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \
-DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Debug build using the host-c.gcc toolchain file with NCNN_SIMPLESTL and
# NCNN_SIMPLEMATH enabled and both OpenMP and threads disabled; builds and
# runs test_topk.
simplestl-simplemath:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
-DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Installs the gcc/g++ multilib packages and builds test_topk twice with the
# host.gcc-m32 toolchain file: once with default CPU options (build/), and
# once with runtime CPU dispatch, SSE2 and AVX disabled (build-nosse/).
# Each build is followed by a run of the resulting test binary.
linux-x86-gcc:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: install
run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
-DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk
- name: build-nosse
run: |
mkdir build-nosse && cd build-nosse
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
-DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
cmake --build . --target test_topk -j$(nproc)
- name: test-nosse
run: cd build-nosse && ./tests/test_topk

# Sets up Python 3.12, installs torch from the CPU wheel index plus numpy,
# packaging, onnx and onnxruntime, builds tools/pnnx in Release mode, and
# runs the ctest entries matching test_onnx_torch_topk.
pnnx-onnx-topk:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: setup-pytorch
run: |
pip3 install torch --index-url https://download.pytorch.org/whl/cpu
pip3 install numpy packaging onnx onnxruntime
- name: build-pnnx
run: |
cd tools/pnnx
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake --build . --config Release -j$(nproc)
- name: test-topk
run: |
cd tools/pnnx/build
ctest --output-on-failure -R test_onnx_torch_topk
3 changes: 3 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH)
ncnn_add_layer(Threshold)
ncnn_add_layer(Tile)
ncnn_add_layer(TopK)

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expand layer implementation was added (src/layer/expand.*) but it is not registered in this CMake layer list. As-is, the layer won't be built/registered, and models lowered to an Expand layer will fail at runtime (unknown layer type). Add ncnn_add_layer(Expand) here (near Tile/TopK for related ops).

Suggested change
ncnn_add_layer(TopK)
ncnn_add_layer(TopK)
ncnn_add_layer(Expand)

Copilot uses AI. Check for mistakes.
ncnn_add_layer(Gather)
ncnn_add_layer(GatherElements)
ncnn_add_layer(RNN)
ncnn_add_layer(LSTM)
ncnn_add_layer(BinaryOp)
Expand Down
74 changes: 74 additions & 0 deletions src/layer/cast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
// bfloat16
out_elemsize = 2 * elempack;
}
else if (type_to == 5)
{
// int64
out_elemsize = 8 * elempack;
}
else if (type_to == 6)
{
// int32
out_elemsize = 4 * elempack;
}

if (dims == 1)
{
Expand Down Expand Up @@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons

// TODO more cast type

if (type_from == 5 && type_to == 1)
{
// int64 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const long long* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 5)
{
// float32 → int64
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
long long* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (long long)ptr[i];
}
}
}

if (type_from == 6 && type_to == 1)
{
// int32 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const int* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 6)
{
// float32 → int32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
int* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (int)ptr[i];
}
}
}

return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions src/layer/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class Cast : public Layer
// 2 = float16
// 3 = int8
// 4 = bfloat16
// 5 = int64
// 6 = int32
int type_from;
int type_to;
};
Expand Down
140 changes: 140 additions & 0 deletions src/layer/expand.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "expand.h"

namespace ncnn {

// Expand takes more than one input blob (data plus target shape) and never
// operates in place, so both layer capability flags are cleared.
Expand::Expand()
{
    support_inplace = false;
    one_blob_only = false;
}

// Expand reads no layer parameters; the dictionary is accepted and ignored.
// Always returns 0.
int Expand::load_param(const ParamDict& pd)
{
    (void)pd;

    return 0;
}

int Expand::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() < 2)
return -1;

const Mat& input_blob = bottom_blobs[0];
const Mat& shape_blob = bottom_blobs[1];

// shape_blob contains the target shape as int64/int32 values
const int* target_shape = (const int*)shape_blob;
int target_dims = (int)shape_blob.total();

// Get input dimensions

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shape_blob is reinterpreted as const int*, but ONNX Expand shape is typically an int64 tensor. If shape_blob.elemsize is 8, this will read garbage and can produce invalid output sizes. Read shape values based on shape_blob.elemsize (int32 vs int64), and reject/handle negative/zero dims per ONNX rules.

Copilot uses AI. Check for mistakes.
int in_dims = input_blob.dims;
int in_shape[4] = {1, 1, 1, 1};
in_shape[0] = input_blob.w;
if (in_dims >= 2) in_shape[1] = input_blob.h;
if (in_dims >= 3) in_shape[2] = input_blob.c;
// For 4D, we'd need to handle differently but ncnn typically uses 3D blobs

// Calculate output shape (broadcasting rules)
int out_shape[4] = {1, 1, 1, 1};
int max_dims = std::max(in_dims, target_dims);

for (int i = 0; i < max_dims; i++)
{
int in_idx = i - (max_dims - in_dims);
int target_idx = i - (max_dims - target_dims);

int in_dim = (in_idx >= 0 && in_idx < in_dims) ? in_shape[in_idx] : 1;
int target_dim = (target_idx >= 0 && target_idx < target_dims) ? target_shape[target_idx] : 1;

// Broadcasting: if in_dim is 1, expand to target_dim; otherwise must match
out_shape[i] = (in_dim == 1) ? target_dim : in_dim;
}

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Broadcast shape computation ignores invalid broadcasts: if in_dim != 1 and target_dim differs from in_dim, ONNX Expand should fail, but this code silently keeps in_dim and then uses modulo indexing (producing incorrect results for invalid shapes). Add validation for each dimension (allow only in_dim==target_dim or either side being 1), and handle any torch-style -1 (keep-dim) values if they can appear in shape_blob.

Copilot uses AI. Check for mistakes.

Mat& top_blob = top_blobs[0];

if (max_dims == 1)
{
top_blob.create(out_shape[0], input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
}
else if (max_dims == 2)
{
top_blob.create(out_shape[0], out_shape[1], input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
}
else if (max_dims == 3)
{
top_blob.create(out_shape[0], out_shape[1], out_shape[2], input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
}
else
{
return -1;
}

if (top_blob.empty())
return -100;

const float* inp = input_blob;
float* out = top_blob;

// Fill output by broadcasting input
int total = (int)top_blob.total();

for (int i = 0; i < total; i++)
{
// Calculate multi-dimensional coordinates
int coords[4] = {0, 0, 0, 0};
int rem = i;

if (max_dims == 1)
{
coords[0] = rem;
}
else if (max_dims == 2)
{
coords[0] = rem % top_blob.w;
coords[1] = rem / top_blob.w;
}
else if (max_dims == 3)
{
int wh = top_blob.w * top_blob.h;
coords[0] = (rem % wh) % top_blob.w;
coords[1] = (rem % wh) / top_blob.w;
coords[2] = rem / wh;
}

// Map to input coordinates (modulo for expanded dimensions)
int in_coords[4] = {0, 0, 0, 0};
for (int d = 0; d < max_dims; d++)
{
int in_idx = d - (max_dims - in_dims);
if (in_idx >= 0 && in_idx < in_dims)
{
int dim_size = (d == 0) ? input_blob.w : (d == 1 && in_dims >= 2) ? input_blob.h : input_blob.c;
in_coords[in_idx] = coords[d] % dim_size;
}
}

// Calculate flat input index
int in_idx = 0;
if (in_dims == 1)
{
in_idx = in_coords[0];
}
else if (in_dims == 2)
{
in_idx = in_coords[0] + in_coords[1] * input_blob.w;
}
else if (in_dims == 3)
{
size_t cstep = input_blob.cstep;
in_idx = in_coords[0] + in_coords[1] * input_blob.w + in_coords[2] * (int)cstep;
}

out[i] = inp[in_idx];
}

return 0;
}

} // namespace ncnn
23 changes: 23 additions & 0 deletions src/layer/expand.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EXPAND_H
#define LAYER_EXPAND_H

#include "layer.h"

namespace ncnn {

// Layer that broadcasts a data blob to a target shape supplied at run time
// as a second input blob.
class Expand : public Layer
{
public:
// Marks the layer as multi-input and not in-place.
Expand();

// No parameters are read from pd; always returns 0.
virtual int load_param(const ParamDict& pd);

// bottom_blobs[0] is the data blob, bottom_blobs[1] holds the target shape;
// the broadcast result is written to top_blobs[0].
// Returns 0 on success and a negative value on failure.
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_EXPAND_H
Loading
Loading