Tencent · vlordier · Feb 26, 2026 · Feb 26, 2026 · Feb 27, 2026 · Feb 27, 2026
diff --git a/.github/workflows/topk-linux-test.yml b/.github/workflows/topk-linux-test.yml
@@ -0,0 +1,111 @@
+name: topk-linux-test
+on:  # workflow triggers. NOTE(review): the branch-filter lines below show up as both added and removed and pull_request is added twice - confirm the intended final trigger block
+  push:
+    branches:
+    - topk-ci-tests
-    branches:
-    - topk-ci-tests
+  pull_request:
-    branches:
-    - topk-ci-tests
+  pull_request:
+
+jobs:
+  x64-none:  # Debug build with NCNN_SSE2, NCNN_AVX, NCNN_RUNTIME_CPU and NCNN_OPENMP all OFF
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # configures with tests enabled, then builds only the test_topk target
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the test_topk binary from the build directory
+      run: cd build && ./tests/test_topk
+
+  x64-sse2:  # same as x64-none but with NCNN_SSE2=ON (NCNN_AVX stays OFF)
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # configures with tests enabled, then builds only the test_topk target
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the test_topk binary from the build directory
+      run: cd build && ./tests/test_topk
+
+  x64-avx2:  # SSE2, AVX, F16C, FMA and AVX2 switched ON; AVX512, XOP and AVXVNNI switched OFF
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # configures with tests enabled, then builds only the test_topk target
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Debug -DNCNN_RUNTIME_CPU=OFF \
+            -DNCNN_SSE2=ON -DNCNN_AVX=ON -DNCNN_F16C=ON -DNCNN_FMA=ON -DNCNN_AVX2=ON \
+            -DNCNN_AVX512=OFF -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF \
+            -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the test_topk binary from the build directory
+      run: cd build && ./tests/test_topk
+
+  simplestl-simplemath:  # NCNN_SIMPLESTL and NCNN_SIMPLEMATH ON, OpenMP and threads OFF, using the host-c.gcc toolchain file
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: build  # configures with tests enabled, then builds only the test_topk target
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host-c.gcc.toolchain.cmake \
+            -DCMAKE_BUILD_TYPE=Debug \
+            -DNCNN_SIMPLESTL=ON -DNCNN_SIMPLEMATH=ON \
+            -DNCNN_OPENMP=OFF -DNCNN_THREADS=OFF \
+            -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs the test_topk binary from the build directory
+      run: cd build && ./tests/test_topk
+
+  linux-x86-gcc:  # two builds with the host.gcc-m32 toolchain file (presumably 32-bit x86 - confirm): default options, then SSE2/AVX OFF
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - name: install  # multilib gcc/g++ packages needed before the toolchain file can be used
+      run: sudo apt-get update && sudo apt-get install -y gcc-multilib g++-multilib
+    - name: build  # first configuration: toolchain defaults, only test_topk is built
+      run: |
+        mkdir build && cd build
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test  # runs test_topk from the first build directory
+      run: cd build && ./tests/test_topk
+    - name: build-nosse  # second configuration in its own directory with NCNN_RUNTIME_CPU, NCNN_SSE2 and NCNN_AVX OFF
+      run: |
+        mkdir build-nosse && cd build-nosse
+        cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/host.gcc-m32.toolchain.cmake \
+            -DNCNN_RUNTIME_CPU=OFF -DNCNN_SSE2=OFF -DNCNN_AVX=OFF \
+            -DNCNN_BUILD_TESTS=ON -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF ..
+        cmake --build . --target test_topk -j$(nproc)
+    - name: test-nosse  # runs test_topk from the build-nosse directory
+      run: cd build-nosse && ./tests/test_topk
+
+  pnnx-onnx-topk:  # builds tools/pnnx in Release and runs the ctest cases matching test_onnx_torch_topk
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v4
+    - uses: actions/setup-python@v5
+      with:
+        python-version: '3.12'
+    - name: setup-pytorch  # torch from the CPU wheel index, plus numpy, packaging, onnx and onnxruntime
+      run: |
+        pip3 install torch --index-url https://download.pytorch.org/whl/cpu
+        pip3 install numpy packaging onnx onnxruntime
+    - name: build-pnnx  # out-of-tree build under tools/pnnx/build
+      run: |
+        cd tools/pnnx
+        mkdir build && cd build
+        cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake --build . --config Release -j$(nproc)
+    - name: test-topk  # -R restricts ctest to tests whose name matches the given pattern
+      run: |
+        cd tools/pnnx/build
+        ctest --output-on-failure -R test_onnx_torch_topk
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -101,6 +101,8 @@ ncnn_add_layer(SPP OFF)
 ncnn_add_layer(TanH)
 ncnn_add_layer(Threshold)
 ncnn_add_layer(Tile)
+ncnn_add_layer(TopK)
+ncnn_add_layer(Gather)
 ncnn_add_layer(RNN)
 ncnn_add_layer(LSTM)
 ncnn_add_layer(BinaryOp)

diff --git a/src/layer/cast.cpp b/src/layer/cast.cpp
@@ -74,6 +74,16 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
         // bfloat16
         out_elemsize = 2 * elempack;
     }
+    else if (type_to == 5)
+    {
+        // int64
+        out_elemsize = 8 * elempack;
+    }
+    else if (type_to == 6)
+    {
+        // int32
+        out_elemsize = 4 * elempack;
+    }
 
     if (dims == 1)
     {
@@ -173,6 +183,70 @@ int Cast::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) cons
 
     // TODO more cast type
 
+    if (type_from == 5 && type_to == 1)
+    {
+        // int64 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const long long* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 5)
+    {
+        // float32 → int64
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            long long* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (long long)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 6 && type_to == 1)
+    {
+        // int32 → float32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const int* ptr = bottom_blob.channel(q);
+            float* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (float)ptr[i];
+            }
+        }
+    }
+
+    if (type_from == 1 && type_to == 6)
+    {
+        // float32 → int32
+        #pragma omp parallel for num_threads(opt.num_threads)
+        for (int q = 0; q < channels; q++)
+        {
+            const float* ptr = bottom_blob.channel(q);
+            int* outptr = top_blob.channel(q);
+
+            for (int i = 0; i < size; i++)
+            {
+                outptr[i] = (int)ptr[i];
+            }
+        }
+    }
+
     return 0;
 }
 

diff --git a/src/layer/cast.h b/src/layer/cast.h
@@ -24,6 +24,8 @@ class Cast : public Layer
     // 2 = float16
     // 3 = int8
     // 4 = bfloat16
+    // 5 = int64
+    // 6 = int32
     int type_from;
     int type_to;
 };

diff --git a/src/layer/gather.cpp b/src/layer/gather.cpp
@@ -0,0 +1,121 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#include "gather.h"
+
+namespace ncnn {
+
+// Configures the layer flags: Gather works on a list of bottom blobs and
+// writes its result into a separate top blob.
+Gather::Gather()
+{
+    // the output is allocated by forward(), so it never aliases an input
+    support_inplace = false;
+    // forward() is the multi-blob overload (data blob + index blob)
+    one_blob_only = false;
+}
+
+// Reads the layer parameters. Param id 0 is the gather axis and falls back to
+// 0 when the param dict does not provide it. Always returns 0.
+int Gather::load_param(const ParamDict& pd)
+{
+    const int axis_param = pd.get(0, 0);
+    axis = axis_param;
+    return 0;
+}
+
+// Element-wise gather along `axis`.
+//
+// bottom_blobs[0] is the data blob (elements are read as float32) and
+// bottom_blobs[1] is the index blob. top_blobs[0] gets the shape of the index
+// blob and the elemsize/elempack of the data blob. For every output position
+// the coordinate on `axis` is replaced by the index stored at that position;
+// the remaining coordinates are taken over unchanged. Coordinates that the
+// data blob does not have (index blob of higher rank) are ignored.
+//
+// Axes are numbered from the innermost dimension: 0 = w, 1 = h, then d
+// (4-D blobs only), then c. A negative `axis` counts back from the data
+// blob's rank.
+//
+// Indices are read as int32, or as int64 when the index blob has 8-byte
+// elements. A negative index is shifted once by the axis extent; whatever is
+// still out of range afterwards is clamped to the nearest valid position.
+//
+// Returns 0 on success, -1 on invalid input (fewer than two bottom blobs, no
+// top blob, unsupported rank, axis out of range, or an index blob that is
+// larger than the data blob on a non-gather axis) and -100 when the output
+// blob cannot be allocated.
+int Gather::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
+{
+    if (bottom_blobs.size() < 2 || top_blobs.empty())
+        return -1;
+
+    const Mat& input_blob = bottom_blobs[0];
+    const Mat& index_blob = bottom_blobs[1];
+    const int dims = input_blob.dims;
+    const int index_dims = index_blob.dims;
+
+    if (dims < 1 || dims > 4 || index_dims < 1 || index_dims > 4)
+        return -1;
+
+    const int positive_axis = axis < 0 ? axis + dims : axis;
+    if (positive_axis < 0 || positive_axis >= dims)
+        return -1;
+
+    // Extents listed in axis order (w, h, [d,] c) for the data blob ...
+    int in_extent[4] = {input_blob.w, 1, 1, 1};
+    if (dims >= 2) in_extent[1] = input_blob.h;
+    if (dims == 3) in_extent[2] = input_blob.c;
+    if (dims == 4)
+    {
+        in_extent[2] = input_blob.d;
+        in_extent[3] = input_blob.c;
+    }
+
+    // ... and for the index blob, which also defines the output shape
+    int out_extent[4] = {index_blob.w, 1, 1, 1};
+    if (index_dims >= 2) out_extent[1] = index_blob.h;
+    if (index_dims == 3) out_extent[2] = index_blob.c;
+    if (index_dims == 4)
+    {
+        out_extent[2] = index_blob.d;
+        out_extent[3] = index_blob.c;
+    }
+
+    // Every non-gather coordinate is copied from the output position, so it
+    // has to exist in the data blob; otherwise the read would be out of bounds
+    for (int k = 0; k < dims; k++)
+    {
+        if (k != positive_axis && out_extent[k] > in_extent[k])
+            return -1;
+    }
+
+    const int axis_dim_size = in_extent[positive_axis];
+
+    // Allocate the output once: shape of the index blob, element type of the data blob
+    Mat& top_blob = top_blobs[0];
+    if (index_dims == 1)
+        top_blob.create(index_blob.w, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    else if (index_dims == 2)
+        top_blob.create(index_blob.w, index_blob.h, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    else if (index_dims == 3)
+        top_blob.create(index_blob.w, index_blob.h, index_blob.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    else
+        top_blob.create(index_blob.w, index_blob.h, index_blob.d, index_blob.c, input_blob.elemsize, input_blob.elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    const int out_w = out_extent[0];
+    const int out_h = out_extent[1];
+    const int out_d = index_dims == 4 ? out_extent[2] : 1;
+    const int out_c = index_dims == 4 ? out_extent[3] : out_extent[2];
+
+    const size_t in_w = input_blob.w;
+    const size_t in_h = dims >= 2 ? input_blob.h : 1;
+    const size_t in_cstep = input_blob.cstep;
+
+    // NOTE(review): the data is addressed as unpacked float32 - confirm that
+    // callers never feed packed or non-4-byte data blobs
+    const float* inp = input_blob;
+    const bool index_is_int64 = index_blob.elemsize == 8u;
+
+    // Walk the blobs channel by channel so that the per-channel stride
+    // (cstep) of the index and output blobs is respected
+    for (int q = 0; q < out_c; q++)
+    {
+        const int* idx32 = index_blob.channel(q);
+        const long long* idx64 = index_blob.channel(q);
+        float* outptr = top_blob.channel(q);
+
+        for (int z = 0; z < out_d; z++)
+        {
+            for (int y = 0; y < out_h; y++)
+            {
+                for (int x = 0; x < out_w; x++)
+                {
+                    const int offset = (z * out_h + y) * out_w + x;
+
+                    long long gather_idx = index_is_int64 ? idx64[offset] : (long long)idx32[offset];
+                    // Handle negative indices, then clamp to the valid range
+                    if (gather_idx < 0) gather_idx += axis_dim_size;
+                    if (gather_idx >= axis_dim_size) gather_idx = axis_dim_size - 1;
+                    if (gather_idx < 0) gather_idx = 0;
+
+                    // Output position in axis order, with the gather axis substituted
+                    int coord[4] = {x, y, 0, 0};
+                    if (index_dims == 4)
+                    {
+                        coord[2] = z;
+                        coord[3] = q;
+                    }
+                    else
+                    {
+                        coord[2] = q;
+                    }
+                    coord[positive_axis] = (int)gather_idx;
+
+                    // Translate axis-order coordinates into the data blob's memory layout
+                    size_t in_q = 0;
+                    size_t in_z = 0;
+                    size_t in_y = 0;
+                    if (dims >= 2) in_y = coord[1];
+                    if (dims == 3) in_q = coord[2];
+                    if (dims == 4)
+                    {
+                        in_z = coord[2];
+                        in_q = coord[3];
+                    }
+
+                    outptr[offset] = inp[in_q * in_cstep + (in_z * in_h + in_y) * in_w + coord[0]];
+                }
+            }
+        }
+    }
+
+    return 0;
+}
+
+} // namespace ncnn
diff --git a/src/layer/gather.h b/src/layer/gather.h
@@ -0,0 +1,27 @@
+// Copyright 2025 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+#ifndef LAYER_GATHER_H
+#define LAYER_GATHER_H
+
+#include "layer.h"
+
+namespace ncnn {
+
+// Layer that selects elements of a data blob through an index blob along one axis.
+class Gather : public Layer
+{
+public:
+    Gather();
+
+    // Loads `axis` from param id 0.
+    virtual int load_param(const ParamDict& pd);
+
+    // bottom_blobs[0] = data, bottom_blobs[1] = indices; the result is written to top_blobs[0].
+    virtual int forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const;
+
+    // param_0 = axis to gather along (default 0); a negative value is offset by the data blob's rank
+    int axis;
+};
+
+} // namespace ncnn
+
+#endif // LAYER_GATHER_H