Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
8bd7d30
Add TopK layer and pnnx ONNX TopK lowering
vlordier Feb 26, 2026
b2c445a
Add ONNX torch_topk pnnx regression test
vlordier Feb 26, 2026
01d15cb
Add TopK Python class generation to pnnx module export
vlordier Feb 27, 2026
13cf18c
Fix pnnx pass_ncnn TopK pattern matching and parameter capture
vlordier Feb 27, 2026
e95770e
topk: align with codebase style and expand ONNX coverage
vlordier Feb 27, 2026
4b4b87a
tests: add sorted=0 coverage for topk
vlordier Feb 27, 2026
c9e856e
tests: remove generated topk onnx artifacts
vlordier Feb 27, 2026
4d5b35f
pnnx: drop unrelated cmake and symlink changes
vlordier Feb 27, 2026
5c11058
topk: reuse per-thread scratch buffer in forward
vlordier Feb 27, 2026
226bd88
topk: optimize sorted path and k=0 fast return
vlordier Feb 27, 2026
6c5978b
topk: add k=1 fast path for embedded runtime
vlordier Feb 27, 2026
e16514b
topk: avoid pair temporaries in k=1 hot loop
vlordier Feb 27, 2026
00be7f8
topk: reduce writeback branching in hot loop
vlordier Feb 27, 2026
1fe4463
topk: fast path unsorted full-k copy
vlordier Feb 27, 2026
6ea29eb
topk: add small-k hot path for embedded runtime
vlordier Feb 27, 2026
7befff6
topk: add guarded neon fast path for k=1
vlordier Feb 27, 2026
5ba7fbc
topk: fix neon k=1 inf initialization edge case
vlordier Feb 27, 2026
e4b4073
topk: make neon mask check arm-portable
vlordier Feb 27, 2026
49dbc7b
topk: optimize small-k unsorted selection path
vlordier Feb 27, 2026
9d31f3b
tests: add values-only topk coverage in cpp and onnx
vlordier Feb 27, 2026
84e083b
topk: fix STL compatibility, cstep indexing, omp barrier, and code style
vlordier Apr 10, 2026
2ea44dd
apply code-format
vlordier Apr 10, 2026
5674b1c
apply code-format changes
vlordier Apr 10, 2026
caa9de3
ci: add topk test coverage and pnnx onnx test
vlordier Apr 10, 2026
4e39cb6
ci: fix pnnx test invocation — use ctest
vlordier Apr 10, 2026
ca55f8a
apply code-format changes
vlordier Apr 10, 2026
2b5fa16
Merge pull request #2 from vlordier/topk-ci-tests
vlordier Apr 10, 2026
d8fd80c
feat: add TopK + Gather ncnn support for YOLOv10
vlordier Apr 10, 2026
d68852d
apply code-format changes
vlordier Apr 11, 2026
93bd423
feat: add Tensor.to → Cast conversion with int64/int32 support
vlordier Apr 11, 2026
168cdea
Merge branch 'master' into fix-pnnx-gather-support
vlordier Apr 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions .github/workflows/topk-linux-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
name: topk-linux-test
on:
push:
branches:
- topk-ci-tests
Comment on lines +4 to +5

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This workflow only runs on pushes to the 'topk-ci-tests' branch, so it will not run for pull_request events or for the repository's normal branches after merge. If it is meant to provide ongoing CI coverage, add appropriate pull_request/push triggers; otherwise consider not adding it to the mainline PR.

Suggested change
branches:
- topk-ci-tests
pull_request:

Copilot uses AI. Check for mistakes.

jobs:
# Baseline x64 job: configures a Debug build with runtime CPU dispatch, SSE2, AVX and
# OpenMP switched off, builds only the test_topk target, then runs the test binary.
x64-none:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Same flow as the other x64 jobs, but with SSE2 enabled and AVX disabled:
# Debug configure, build the test_topk target only, run the test binary.
x64-sse2:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=ON -DNCNN_AVX=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# x64 job with SSE2, AVX, F16C, FMA and AVX2 enabled while AVX512, XOP and AVXVNNI
# stay off; Debug configure, build the test_topk target only, run the test binary.
x64-avx2:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \
-DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Builds test_topk with NCNN_SIMPLESTL and NCNN_SIMPLEMATH turned on, using the
# host-c.gcc toolchain file, with OpenMP and threads disabled, then runs the test.
simplestl-simplemath:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
-DCMAKE_BUILD_TYPE=Debug \
-DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
-DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \
-DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk

# Installs the gcc/g++ multilib packages and builds test_topk twice with the
# host.gcc-m32 toolchain file (presumably a 32-bit x86 build — confirm against the
# toolchain file): once with default options in build/, and once in build-nosse/
# with runtime CPU dispatch, SSE2 and AVX disabled. Each build is followed by a test run.
linux-x86-gcc:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: install
run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
- name: build
run: |
mkdir build && cd build
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
-DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
cmake --build . --target test_topk -j$(nproc)
- name: test
run: cd build && ./tests/test_topk
- name: build-nosse
run: |
mkdir build-nosse && cd build-nosse
cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
-DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
-DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
cmake --build . --target test_topk -j$(nproc)
- name: test-nosse
run: cd build-nosse && ./tests/test_topk

# pnnx job: sets up Python 3.12, installs torch from the CPU wheel index plus
# numpy/packaging/onnx/onnxruntime, builds tools/pnnx in Release mode, and runs
# ctest restricted to tests matching test_onnx_torch_topk.
pnnx-onnx-topk:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
with:
python-version: '3.12'
- name: setup-pytorch
run: |
pip3 install torch --index-url https://download.pytorch.org/whl/cpu
pip3 install numpy packaging onnx onnxruntime
- name: build-pnnx
run: |
cd tools/pnnx
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake --build . --config Release -j$(nproc)
- name: test-topk
run: |
cd tools/pnnx/build
ctest --output-on-failure -R test_onnx_torch_topk
2 changes: 2 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,8 @@ ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH)
ncnn_add_layer(Threshold)
ncnn_add_layer(Tile)
ncnn_add_layer(TopK)
ncnn_add_layer(Gather)
ncnn_add_layer(RNN)
ncnn_add_layer(LSTM)
ncnn_add_layer(BinaryOp)
Expand Down
74 changes: 74 additions & 0 deletions src/layer/cast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
// bfloat16
out_elemsize = 2 * elempack;
}
else if (type_to == 5)
{
// int64
out_elemsize = 8 * elempack;
}
else if (type_to == 6)
{
// int32
out_elemsize = 4 * elempack;
}

if (dims == 1)
{
Expand Down Expand Up @@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons

// TODO more cast type

if (type_from == 5 && type_to == 1)
{
// int64 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const long long* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 5)
{
// float32 → int64
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
long long* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (long long)ptr[i];
}
}
}

if (type_from == 6 && type_to == 1)
{
// int32 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const int* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 6)
{
// float32 → int32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
int* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (int)ptr[i];
}
}
}

return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions src/layer/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class Cast : public Layer
// 2 = float16
// 3 = int8
// 4 = bfloat16
// 5 = int64
// 6 = int32
int type_from;
int type_to;
};
Expand Down
121 changes: 121 additions & 0 deletions src/layer/gather.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "gather.h"

namespace ncnn {

// Set up the layer flags for Gather.
// forward() reads two bottom blobs (data and indices) and allocates its own
// output blob, so neither single-blob mode nor in-place execution is enabled.
Gather::Gather()
{
    support_inplace = false;
    one_blob_only = false;
}

// Load layer parameters from the param dictionary.
//   pd  parameter dictionary; id 0 holds the gather axis (0 when absent)
// Always returns 0.
int Gather::load_param(const ParamDict& pd)
{
    const int axis_param = pd.get(0, 0);
    axis = axis_param;
    return 0;
}

int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() < 2)
return -1;

const Mat& input_blob = bottom_blobs[0];
const Mat& index_blob = bottom_blobs[1];
const int dims = input_blob.dims;

// index_blob is expected to hold 32-bit integer indices: its buffer is
// reinterpreted as const int* below, with no elemsize check or conversion
const int index_size = (int)index_blob.total();

Comment on lines +30 to +33

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gather currently treats index_blob memory as int32 via a raw cast, but upstream indices are commonly int64 (and sometimes int32) depending on exporter. Reading an int64 tensor as int32 will corrupt indices and can segfault/out-of-bounds. Please branch on index_blob.elemsize (4 vs 8) and load indices accordingly (or convert into a temporary int32 buffer).

Copilot uses AI. Check for mistakes.
Comment on lines +30 to +33

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Gather implementation comments say indices are treated as float and cast, but the code later reinterprets index_blob memory as int32. Indices are commonly int64 in exported graphs, so this will corrupt reads. Please load indices according to index_blob.elemsize (4 vs 8) or convert to a temporary int32 buffer.

Copilot uses AI. Check for mistakes.
int positive_axis = axis < 0 ? axis + dims : axis;
if (positive_axis < 0 || positive_axis >= dims)
return -1;

int shape[4] = {1, 1, 1, 1};
shape[0] = input_blob.w;
if (dims >= 2) shape[1] = input_blob.h;
if (dims == 3) shape[2] = input_blob.c;
if (dims == 4) shape[2] = input_blob.c; // w*h*c layout

const int axis_dim_size = shape[positive_axis];

// Output shape matches index_blob shape
const Mat& out_shape = index_blob;

// Allocate output (same dtype as input, shape matches index)
Mat& top_blob = top_blobs[0];
top_blob.create(out_shape.w, out_shape.h, out_shape.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Output allocation always uses Mat::create(w,h,c,...) which forces a 3D output, even when index_blob is 1D/2D (and potentially 4D). This breaks Gather's contract that output shape matches index shape. Allocate top_blob with the same dims as index_blob using the appropriate create overload.

Suggested change
top_blob.create(out_shape.w, out_shape.h, out_shape.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
if (out_shape.dims == 1)
top_blob.create(out_shape.w, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else if (out_shape.dims == 2)
top_blob.create(out_shape.w, out_shape.h, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else if (out_shape.dims == 3)
top_blob.create(out_shape.w, out_shape.h, out_shape.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else if (out_shape.dims == 4)
top_blob.create(out_shape.w, out_shape.h, out_shape.d, out_shape.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else
return -1;

Copilot uses AI. Check for mistakes.
if (top_blob.empty())
return -100;

const float* inp = input_blob;
const int* idx = (const int*)index_blob;
float* out = top_blob;
Comment on lines +55 to +57

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gather treats input and output tensors as float* (and writes via float*), but top_blob is allocated using input_blob.elemsize/elempack which may be fp16/int8/etc. This will produce incorrect results or memory corruption when elemsize != 4. Either restrict Gather to float32 with a runtime check or add proper type handling.

Copilot uses AI. Check for mistakes.

// General case: iterate over all output positions
// Map flat output index to multi-dimensional coords,
// then compute corresponding input position with index substitution
const int total_out = (int)top_blob.total();
for (int i = 0; i < total_out; i++)
{
// Decompose flat index i into coordinates based on top_blob shape
int rem = i;
int coord_out[4] = {0, 0, 0, 0};
if (top_blob.dims == 1)
{
coord_out[0] = rem;
}
else if (top_blob.dims == 2)
{
coord_out[0] = rem % top_blob.w;
coord_out[1] = rem / top_blob.w;
}
else if (top_blob.dims == 3)
{
int hw = top_blob.w * top_blob.h;
coord_out[0] = (rem % hw) % top_blob.w;
coord_out[1] = (rem % hw) / top_blob.w;
coord_out[2] = rem / hw;
}

// Get index value at this output position
int gather_idx = idx[i];
// Handle negative indices
if (gather_idx < 0) gather_idx += axis_dim_size;

// Build input coordinate (same as output, but axis coord replaced)
int coord_in[4] = {coord_out[0], coord_out[1], coord_out[2], coord_out[3]};
coord_in[positive_axis] = gather_idx;

// Clamp to input bounds
if (coord_in[positive_axis] >= axis_dim_size) coord_in[positive_axis] = axis_dim_size - 1;
if (coord_in[positive_axis] < 0) coord_in[positive_axis] = 0;

// Compute flat input index
int flat_in = 0;
if (dims == 1)
{
flat_in = coord_in[0];
}
else if (dims == 2)
{
flat_in = coord_in[0] + coord_in[1] * input_blob.w;
}
else if (dims == 3)
{
// ncnn 3D layout: w * h * c, with cstride padding
size_t cstep = input_blob.cstep;
flat_in = coord_in[0] + coord_in[1] * input_blob.w + coord_in[2] * (int)cstep;
Comment on lines +108 to +112

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only dims 1/2/3 are handled in flat index computation. For dims==4, flat_in remains incorrect and Gather will return wrong results silently. Either implement 4D indexing (w,h,d,c with cstep) or explicitly reject dims > 3 early with a clear error code.

Copilot uses AI. Check for mistakes.
}

out[i] = inp[flat_in];
}

return 0;
}

} // namespace ncnn
27 changes: 27 additions & 0 deletions src/layer/gather.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_GATHER_H
#define LAYER_GATHER_H

#include "layer.h"

namespace ncnn {

// Gather layer: selects elements of the first input blob along `axis`, using the
// integer indices supplied in the second input blob.
class Gather : public Layer
{
public:
Gather();

// Reads the layer parameters from pd (id 0 = axis).
virtual int load_param(const ParamDict& pd);

// Multi-input forward pass; the implementation requires at least two bottom blobs
// (data, indices) and writes its result to top_blobs[0].
virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;

public:
// param_0 = axis (default 0)
// a negative value is wrapped in forward by adding the input blob's dims
int axis;
};

} // namespace ncnn

#endif // LAYER_GATHER_H
Loading