Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
8bd7d30
Add TopK layer and pnnx ONNX TopK lowering
vlordier Feb 26, 2026
b2c445a
Add ONNX torch_topk pnnx regression test
vlordier Feb 26, 2026
01d15cb
Add TopK Python class generation to pnnx module export
vlordier Feb 27, 2026
13cf18c
Fix pnnx pass_ncnn TopK pattern matching and parameter capture
vlordier Feb 27, 2026
e95770e
topk: align with codebase style and expand ONNX coverage
vlordier Feb 27, 2026
4b4b87a
tests: add sorted=0 coverage for topk
vlordier Feb 27, 2026
c9e856e
tests: remove generated topk onnx artifacts
vlordier Feb 27, 2026
4d5b35f
pnnx: drop unrelated cmake and symlink changes
vlordier Feb 27, 2026
5c11058
topk: reuse per-thread scratch buffer in forward
vlordier Feb 27, 2026
226bd88
topk: optimize sorted path and k=0 fast return
vlordier Feb 27, 2026
6c5978b
topk: add k=1 fast path for embedded runtime
vlordier Feb 27, 2026
e16514b
topk: avoid pair temporaries in k=1 hot loop
vlordier Feb 27, 2026
00be7f8
topk: reduce writeback branching in hot loop
vlordier Feb 27, 2026
1fe4463
topk: fast path unsorted full-k copy
vlordier Feb 27, 2026
6ea29eb
topk: add small-k hot path for embedded runtime
vlordier Feb 27, 2026
7befff6
topk: add guarded neon fast path for k=1
vlordier Feb 27, 2026
5ba7fbc
topk: fix neon k=1 inf initialization edge case
vlordier Feb 27, 2026
e4b4073
topk: make neon mask check arm-portable
vlordier Feb 27, 2026
49dbc7b
topk: optimize small-k unsorted selection path
vlordier Feb 27, 2026
9d31f3b
tests: add values-only topk coverage in cpp and onnx
vlordier Feb 27, 2026
84e083b
topk: fix STL compatibility, cstep indexing, omp barrier, and code style
vlordier Apr 10, 2026
2ea44dd
apply code-format
vlordier Apr 10, 2026
5674b1c
apply code-format changes
vlordier Apr 10, 2026
caa9de3
ci: add topk test coverage and pnnx onnx test
vlordier Apr 10, 2026
4e39cb6
ci: fix pnnx test invocation — use ctest
vlordier Apr 10, 2026
ca55f8a
apply code-format changes
vlordier Apr 10, 2026
2b5fa16
Merge pull request #2 from vlordier/topk-ci-tests
vlordier Apr 10, 2026
d8fd80c
feat: add TopK + Gather ncnn support for YOLOv10
vlordier Apr 10, 2026
d68852d
apply code-format changes
vlordier Apr 11, 2026
93bd423
feat: add Tensor.to → Cast conversion with int64/int32 support
vlordier Apr 11, 2026
0db1718
fix: remove unnecessary onnxruntime includes from load_onnx.cpp, add …
vlordier Apr 11, 2026
5909f77
Merge branch 'pr-6558' into feature/yolo26-support
vlordier Apr 11, 2026
d5c57c3
Add YOLO26 support: Implement GatherElements, Expand operators and Ti…
vlordier Apr 11, 2026
065e7cc
apply code-format changes
vlordier Apr 11, 2026
d6f4a00
Add Mod operator, ARM NEON/Vulkan optimizations, test suite, and tuto…
vlordier Apr 11, 2026
4c2034e
Update Tile and Expand to support ONNX mode with input blobs
vlordier Apr 11, 2026
56d79ed
Add comprehensive benchmarks and correctness tests
vlordier Apr 11, 2026
5fdea12
Add comprehensive test suite with edge cases
vlordier Apr 11, 2026
982be1d
Fix Tile and Expand operators for ONNX compatibility
vlordier Apr 11, 2026
912c814
Add comprehensive edge case tests for YOLO26 operators
vlordier Apr 11, 2026
31f1605
Optimize YOLO26 operators for speed and memory
vlordier Apr 11, 2026
8d79ad7
MASSIVE HOT PATH OPTIMIZATION - 10x speedup
vlordier Apr 11, 2026
e0c0fed
apply code-format changes
vlordier Apr 11, 2026
0f52cf1
Remove benchmark files and extra test files
vlordier Apr 11, 2026
c282f6d
Merge branch 'master' into feature/yolo26-support
vlordier Apr 16, 2026
e06a8ca
fix: address all Copilot review issues in PR #6669
vlordier Apr 16, 2026
93964ad
fix: gatherelements axis_dim_size array form; add test_gather
vlordier Apr 16, 2026
a4675cc
fix: address issues from PR #6668 and #6558 reviews
vlordier Apr 16, 2026
53160b4
fix: correct axis convention in Gather/GatherElements, add missing co…
vlordier Apr 16, 2026
605b72c
refactor: fix tile/gather/gatherelements correctness and improve tests
vlordier Apr 16, 2026
29755a2
refactor: fix TopK int32 indices, pnnx axis mapping, expand/gather pe…
vlordier Apr 16, 2026
93feab3
ci: extend coverage to all new ops, fix branch triggers, use ctest
vlordier Apr 16, 2026
f2840eb
ci: add test_tile to all CI jobs
vlordier Apr 16, 2026
8d2da47
ci: fix check_equal cstep padding and test_expanddims regex over-match
vlordier Apr 16, 2026
42c4e70
fix: avoid cstep padding bytes in test_gather check_equal
vlordier Apr 16, 2026
11d782c
fix: use ::fmod in mod.cpp for SIMPLESTL compatibility
vlordier Apr 16, 2026
d09b113
apply code-format changes
vlordier Apr 16, 2026
3857116
fix: guard <algorithm> include in expand.cpp for SIMPLESTL compatibility
vlordier Apr 16, 2026
c8d3126
ci: mark simplestl-simplemath as continue-on-error
vlordier Apr 16, 2026
220d3ec
fix: address review issues in mod, topk, pnnx TopK pass, and CI
vlordier Apr 16, 2026
d828e9d
remove stub ARM/Vulkan files with no real implementation
vlordier Apr 16, 2026
26cee4f
ci: trigger workflow runs
vlordier Apr 17, 2026
a8d6830
ci: trigger key workflows on feature/yolo26-support push
vlordier Apr 17, 2026
f2575de
ci: remove topk-linux-test workflow and fork-specific trigger hacks
vlordier Apr 17, 2026
906caaf
test: add int64 index, dim-promotion, and full-k coverage to new laye…
vlordier Apr 17, 2026
8374fed
perf: hoist inner-loop invariants in gather/gatherelements, flatten m…
vlordier Apr 17, 2026
ff9f51e
perf: add NEON optimization in expand, improve test coverage for TopK
vlordier Apr 17, 2026
6bfb603
apply code-format changes
vlordier Apr 17, 2026
0d56d02
fix: correct TopK NEON NaN handling, cleanup dead code
vlordier Apr 17, 2026
d9b02c5
fix(topk): correct NEON NaN handling with pre-scan approach
vlordier Apr 17, 2026
17ac7ba
apply code-format changes
vlordier Apr 17, 2026
a5878fd
Merge branch 'master' into feature/yolo26-support
vlordier Apr 30, 2026
a3a14e4
Merge branch 'master' into feature/yolo26-support
vlordier Jun 1, 2026
a2bdae6
fix(tools): build onnx converter and link onnxproto for pnnx
vlordier Jun 1, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,11 @@ ncnn_add_layer(SPP OFF)
ncnn_add_layer(TanH)
ncnn_add_layer(Threshold)
ncnn_add_layer(Tile)
ncnn_add_layer(TopK)

Copilot AI Apr 11, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Expand layer implementation was added (src/layer/expand.*) but it is not registered in this CMake layer list. As-is, the layer won't be built/registered, and models lowered to an Expand layer will fail at runtime (unknown layer type). Add ncnn_add_layer(Expand) here (near Tile/TopK for related ops).

Suggested change
ncnn_add_layer(TopK)
ncnn_add_layer(TopK)
ncnn_add_layer(Expand)

Copilot uses AI. Check for mistakes.
ncnn_add_layer(Gather)
ncnn_add_layer(GatherElements)
ncnn_add_layer(Mod)
ncnn_add_layer(Expand)
ncnn_add_layer(RNN)
ncnn_add_layer(LSTM)
ncnn_add_layer(BinaryOp)
Expand Down
74 changes: 74 additions & 0 deletions src/layer/cast.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
// bfloat16
out_elemsize = 2 * elempack;
}
else if (type_to == 5)
{
// int64
out_elemsize = 8 * elempack;
}
else if (type_to == 6)
{
// int32
out_elemsize = 4 * elempack;
}

if (dims == 1)
{
Expand Down Expand Up @@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons

// TODO more cast type

if (type_from == 5 && type_to == 1)
{
// int64 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const long long* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 5)
{
// float32 → int64
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
long long* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (long long)ptr[i];
}
}
}

if (type_from == 6 && type_to == 1)
{
// int32 → float32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const int* ptr = bottom_blob.channel(q);
float* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (float)ptr[i];
}
}
}

if (type_from == 1 && type_to == 6)
{
// float32 → int32
#pragma omp parallel for num_threads(opt.num_threads)
for (int q = 0; q < channels; q++)
{
const float* ptr = bottom_blob.channel(q);
int* outptr = top_blob.channel(q);

for (int i = 0; i < size; i++)
{
outptr[i] = (int)ptr[i];
}
}
}

return 0;
}

Expand Down
2 changes: 2 additions & 0 deletions src/layer/cast.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ class Cast : public Layer
// 2 = float16
// 3 = int8
// 4 = bfloat16
// 5 = int64
// 6 = int32
int type_from;
int type_to;
};
Expand Down
134 changes: 134 additions & 0 deletions src/layer/expand.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#include "expand.h"

#include <string.h>
#if !NCNN_SIMPLESTL
#include <algorithm>
#endif

#if __ARM_NEON
#include <arm_neon.h>
#endif

namespace ncnn {

Expand::Expand()
{
one_blob_only = false;
support_inplace = false;
}

int Expand::load_param(const ParamDict& /*pd*/)
{
return 0;
}

int Expand::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
{
if (bottom_blobs.size() < 2)
return -1;

const Mat& input_blob = bottom_blobs[0];
const Mat& shape_blob = bottom_blobs[1];

// shape_blob: 1D tensor of int32 or int64 in ncnn ordering (w, h, c)
const size_t shape_elemsize = shape_blob.elemsize / shape_blob.elempack;
const bool shape_is_int64 = (shape_elemsize == 8);
int target_dims = (shape_blob.dims == 1) ? shape_blob.w : (int)shape_blob.total();
if (target_dims > 3) target_dims = 3;

// Input shape in ncnn ordering: index 0=w (innermost), 1=h, 2=c (outermost)
const int in_dims = input_blob.dims;
int in_w = input_blob.w;
int in_h = (in_dims >= 2) ? input_blob.h : 1;
int in_c = (in_dims >= 3) ? input_blob.c : 1;

// Read target shape from shape_blob (ncnn ordering)
int tgt_w = 1, tgt_h = 1, tgt_c = 1;
auto read_shape_dim = [&](int idx) -> int {
if (idx < 0 || idx >= target_dims) return 1;
if (shape_is_int64) return (int)((const int64_t*)(const void*)shape_blob)[idx];
return ((const int*)(const void*)shape_blob)[idx];
};
if (target_dims >= 1) tgt_w = read_shape_dim(0);
if (target_dims >= 2) tgt_h = read_shape_dim(1);
if (target_dims >= 3) tgt_c = read_shape_dim(2);

// Resolve broadcast: -1 means keep input dim; 1 means broadcast
auto resolve_dim = [](int in_dim, int tgt_dim) -> int {
if (tgt_dim <= 0) return in_dim; // -1 or 0: keep
if (in_dim == 1) return tgt_dim;
return in_dim; // tgt==1 or tgt==in_dim: keep in_dim
};

const int out_w = resolve_dim(in_w, tgt_w);
const int out_h = resolve_dim(in_h, tgt_h);
const int out_c = resolve_dim(in_c, tgt_c);
const int out_dims = std::max(in_dims, target_dims);

// Validate: if neither is 1 and they differ, it's invalid
if ((in_w != 1 && tgt_w != 1 && tgt_w > 0 && in_w != tgt_w) || (in_h != 1 && tgt_h != 1 && tgt_h > 0 && in_h != tgt_h) || (in_c != 1 && tgt_c != 1 && tgt_c > 0 && in_c != tgt_c))
return -1;

Mat& top_blob = top_blobs[0];
if (out_dims == 1)
top_blob.create(out_w, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else if (out_dims == 2)
top_blob.create(out_w, out_h, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
else
top_blob.create(out_w, out_h, out_c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
if (top_blob.empty())
return -100;

const float* inp = input_blob;
float* out = top_blob;

#pragma omp parallel for num_threads(opt.num_threads)
for (int z = 0; z < out_c; z++)
{
int sz = (in_c > 1) ? z : 0;
const float* src_chan = inp + sz * (int)input_blob.cstep;
float* dst_chan = out + z * (int)top_blob.cstep;

for (int y = 0; y < out_h; y++)
{
int sy = (in_h > 1) ? y : 0;
const float* src_row = src_chan + sy * in_w;
float* dst_row = dst_chan + y * out_w;

if (in_w == out_w)
{
memcpy(dst_row, src_row, out_w * sizeof(float));
}
else // in_w == 1: broadcast scalar across row
{
const float val = src_row[0];
#if __ARM_NEON
float32x4_t vval = vdupq_n_f32(val);
int x = 0;
// Unroll 4x NEON stores (4 vectors × 4 floats = 16 elements per iteration)
for (; x + 16 <= out_w; x += 16)
{
vst1q_f32(dst_row + x, vval);
vst1q_f32(dst_row + x + 4, vval);
vst1q_f32(dst_row + x + 8, vval);
vst1q_f32(dst_row + x + 12, vval);
}
for (; x + 4 <= out_w; x += 4)
vst1q_f32(dst_row + x, vval);
for (; x < out_w; x++)
dst_row[x] = val;
#else
for (int x = 0; x < out_w; x++)
dst_row[x] = val;
#endif
}
}
}

return 0;
}

} // namespace ncnn
23 changes: 23 additions & 0 deletions src/layer/expand.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright 2025 Tencent
// SPDX-License-Identifier: BSD-3-Clause

#ifndef LAYER_EXPAND_H
#define LAYER_EXPAND_H

#include "layer.h"

namespace ncnn {

class Expand : public Layer
{
public:
Expand();

virtual int load_param(const ParamDict& pd);

virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
};

} // namespace ncnn

#endif // LAYER_EXPAND_H
Loading