Tencent · vlordier · Feb 26, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml
@@ -0,0 +1,132 @@
+# Builds and runs the TopK layer unit test under several x86 build
+# configurations, then runs the pnnx ONNX TopK conversion test.
+name: topk-linux-test
+on:
+  push:
+    branches:
+    - topk-ci-tests
+    - fix-pnnx-onnx-topk-support
+  pull_request:
+    branches:
+    - master
+
+# Cancel superseded runs on the same ref so stale pushes do not hold runners.
+concurrency:
+  group: topk-linux-test-${{ github.ref }}
+  cancel-in-progress: true
+
+# Every job only checks out and builds; read access to the repo is enough.
+permissions:
+  contents: read
+
+jobs:
+  # Scalar baseline: runtime CPU dispatch, SSE2 and AVX all disabled.
+  x64-none:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+
+  # SSE2 enabled, AVX disabled, no runtime CPU dispatch.
+  x64-sse2:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+
+  # SSE2/AVX/F16C/FMA/AVX2 enabled; AVX512, XOP and AVXVNNI disabled.
+  x64-avx2:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \
+            -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+
+  # Builds with NCNN_SIMPLESTL and NCNN_SIMPLEMATH using the host-c.gcc toolchain.
+  simplestl-simplemath:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
+            -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \
+            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+
+  # Uses the host.gcc-m32 toolchain: default options first, then a no-SSE2 build.
+  linux-x86-gcc:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: install
+      run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
+    - name: build
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test
+      run: cd build && ./tests/test_topk
+    - name: build-nosse
+      run: |
+        mkdir build-nosse && cd build-nosse
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test-nosse
+      run: cd build-nosse && ./tests/test_topk
+
+  # Builds pnnx against CPU-only PyTorch and runs the ONNX TopK ctest case.
+  pnnx-onnx-topk:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: setup-pytorch
+      run: |
+        pip3 install torch --index-url https://download.pytorch.org/whl/cpu
+        pip3 install numpy packaging onnx onnxruntime
+    - name: build-pnnx
+      run: |
+        cd tools/pnnx
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake --build . --config Release -j$(nproc)
+    - name: test-topk
+      run: |
+        cd tools/pnnx/build
+        ctest --output-on-failure -R test_onnx_torch_topk
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -101,6 +101,10 @@ ncnn_add_layer(SPP OFF)
 ncnn_add_layer(TanH)
 ncnn_add_layer(Threshold)
 ncnn_add_layer(Tile)
+ncnn_add_layer(TopK)
+ncnn_add_layer(Expand)
+ncnn_add_layer(Gather)
+ncnn_add_layer(GatherElements)
 ncnn_add_layer(RNN)
 ncnn_add_layer(LSTM)
 ncnn_add_layer(BinaryOp)

diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp
@@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
         // bfloat16
         out_elemsize = 2 * elempack;
     }
+    else if (type_to == 5)
+    {
+        // int64
+        out_elemsize = 8 * elempack;
+    }
+    else if (type_to == 6)
+    {
+        // int32
+        out_elemsize = 4 * elempack;
+    }
 
     if (dims == 1)
     {
@@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
 
     // TODO more cast type
 
+    if (type_from == 5 && type_to == 1)
+    {
+        // int64 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const long long* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 5)
+    {
+        // float32 → int64
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            long long* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (long long)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 6 && type_to == 1)
+    {
+        // int32 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const int* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 6)
+    {
+        // float32 → int32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            int* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (int)ptr[i];
+            }
+        }
+    }
+
     return 0;
 }
 

diff --git a/src/layer/cast.h b/src/layer/cast.h
@@ -24,6 +24,8 @@ class Cast : public Layer
     // 2 = float16
     // 3 = int8
     // 4 = bfloat16
+    // 5 = int64
+    // 6 = int32
     int type_from;
     int type_to;
 };

diff --git a/src/layer/expand.cpp b/src/layer/expand.cpp
@@ -0,0 +1,161 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "expand.h"
+
+#include <string.h>
+
+namespace ncnn {
+
+// Expand consumes two blobs (data + target shape), so it is registered as a
+// multi-input, non-inplace layer.
+Expand::Expand()
+{
+    one_blob_only = false;
+    support_inplace = false;
+}
+
+// There are no static parameters to load: the target shape arrives as a blob
+// at forward() time, so pd is intentionally left unread.
+int Expand::load_param(const ParamDict& /*pd*/)
+{
+    return 0;
+}
+
+// Reads entry `index` of the target-shape blob into `value`.
+// The blob may hold 32-bit or 64-bit integers; elemsize tells them apart.
+// Returns false for any other element width.
+static bool read_target_dim(const Mat& shape_blob, int index, long long& value)
+{
+    if (shape_blob.elemsize == 8u)
+    {
+        value = ((const long long*)shape_blob.data)[index];
+        return true;
+    }
+
+    if (shape_blob.elemsize == 4u)
+    {
+        value = ((const int*)shape_blob.data)[index];
+        return true;
+    }
+
+    return false;
+}
+
+// Broadcasts bottom_blobs[0] to the shape listed in bottom_blobs[1] and
+// stores the result in top_blobs[0].
+//
+// Shape entries are matched to blob axes in w, h, c order (entry 0 <-> w);
+// an axis that one operand lacks counts as extent 1.
+// NOTE(review): confirm the producer of the shape blob emits w-first order.
+//
+// Per axis, an input extent of 1 is expanded to the target extent. Any other
+// input extent is kept and the target entry for that axis is not consulted.
+//
+// Returns 0 on success; -1 for malformed or unsupported input (fewer than two
+// bottom blobs, no top blob, an empty blob, packed data, rank above 3, or a
+// needed shape entry that is not a positive int); -100 if allocation fails.
+int Expand::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    if (bottom_blobs.size() < 2 || top_blobs.empty())
+        return -1;
+
+    const Mat& input_blob = bottom_blobs[0];
+    const Mat& shape_blob = bottom_blobs[1];
+
+    if (input_blob.empty() || shape_blob.empty())
+        return -1;
+
+    // A packed channel axis cannot be replicated element-wise.
+    if (input_blob.elempack != 1)
+        return -1;
+
+    // Check both ranks before any fixed-size array is indexed.
+    const int in_dims = input_blob.dims;
+    const size_t target_count = shape_blob.total();
+    if (in_dims < 1 || in_dims > 3 || target_count < 1 || target_count > 3)
+        return -1;
+
+    const int target_dims = (int)target_count;
+    const int out_dims = in_dims > target_dims ? in_dims : target_dims;
+
+    // Input extents in w, h, c order, padded with 1 for absent axes.
+    const int in_w = input_blob.w;
+    const int in_h = in_dims >= 2 ? input_blob.h : 1;
+    const int in_c = in_dims >= 3 ? input_blob.c : 1;
+    const int in_shape[3] = {in_w, in_h, in_c};
+
+    int out_shape[3] = {in_w, in_h, in_c};
+    for (int i = 0; i < target_dims; i++)
+    {
+        if (in_shape[i] != 1)
+            continue;
+
+        long long target_dim = 0;
+        if (!read_target_dim(shape_blob, i, target_dim))
+            return -1;
+
+        // Mat::create() takes int extents, so reject anything outside (0, INT_MAX].
+        if (target_dim < 1 || target_dim > 0x7fffffff)
+            return -1;
+
+        out_shape[i] = (int)target_dim;
+    }
+
+    const int out_w = out_shape[0];
+    const int out_h = out_shape[1];
+    const int out_c = out_shape[2];
+
+    Mat& top_blob = top_blobs[0];
+    const size_t elemsize = input_blob.elemsize;
+
+    if (out_dims == 1)
+        top_blob.create(out_w, elemsize, input_blob.elempack, opt.blob_allocator);
+    else if (out_dims == 2)
+        top_blob.create(out_w, out_h, elemsize, input_blob.elempack, opt.blob_allocator);
+    else
+        top_blob.create(out_w, out_h, out_c, elemsize, input_blob.elempack, opt.blob_allocator);
+
+    if (top_blob.empty())
+        return -100;
+
+    // Copy raw bytes so any element width works, not just 4-byte floats.
+    // Rows are contiguous within a channel; channels are cstep elements apart,
+    // so both blobs are addressed per channel rather than by a flat index.
+    const size_t in_row_bytes = (size_t)in_w * elemsize;
+    const size_t out_row_bytes = (size_t)out_w * elemsize;
+    const size_t in_channel_bytes = input_blob.cstep * elemsize;
+    const size_t out_channel_bytes = top_blob.cstep * elemsize;
+    const unsigned char* in_base = (const unsigned char*)input_blob.data;
+    unsigned char* out_base = (unsigned char*)top_blob.data;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < out_c; q++)
+    {
+        // An axis of extent 1 in the input always maps back to index 0.
+        const unsigned char* in_channel = in_base + (size_t)(in_c == 1 ? 0 : q) * in_channel_bytes;
+        unsigned char* out_channel = out_base + (size_t)q * out_channel_bytes;
+
+        for (int y = 0; y < out_h; y++)
+        {
+            const unsigned char* in_row = in_channel + (size_t)(in_h == 1 ? 0 : y) * in_row_bytes;
+            unsigned char* out_row = out_channel + (size_t)y * out_row_bytes;
+
+            if (in_w == out_w)
+            {
+                memcpy(out_row, in_row, out_row_bytes);
+                continue;
+            }
+
+            // in_w == 1 here: repeat the single element across the row.
+            for (int x = 0; x < out_w; x++)
+            {
+                memcpy(out_row + (size_t)x * elemsize, in_row, elemsize);
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/expand.h b/src/layer/expand.h
@@ -0,0 +1,30 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_EXPAND_H
+#define LAYER_EXPAND_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Two-input layer: broadcasts its first bottom blob to the shape carried by
+// its second bottom blob.
+class Expand : public Layer
+{
+public:
+    // Marks the layer as multi-blob and not in-place.
+    Expand();
+
+    // No parameters are read from pd; always returns 0.
+    virtual int load_param(const ParamDict& pd);
+
+    // bottom_blobs[0] is the data blob, bottom_blobs[1] the target shape;
+    // the expanded result is written to top_blobs[0]. Returns 0 on success
+    // and a negative value on failure.
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_EXPAND_H