diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h
new file mode 100644
index 000000000000..7f40c0d82d4b
--- /dev/null
+++ b/src/layer/riscv/convolution_packed_int8.h
@@ -0,0 +1,575 @@
+// Copyright 2026 Tencent
+// SPDX-License-Identifier: BSD-3-Clause
+
+static void convolution_transform_kernel_packed_int8_rvv(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h)
+{
+    const int maxk = kernel_w * kernel_h;
+
+    // src = kw-kh-inch-outch
+    // dst = pb-pa-kw-kh-inch/vlm1-outch/vlm1
+
+    // clang-format off
+    // *INDENT-OFF*
+#if __riscv_vector
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+    const size_t vlm4 = __riscv_vsetvlmax_e32m4();
+
+    const int packn = (int)vlm1;
+    const int pack4n = (int)vlm4;
+
+    if (outch >= pack4n)
+    {
+        if (inch >= pack4n)
+            kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch / pack4n + (outch % pack4n) / packn + outch % packn, (size_t)(pack4n * pack4n), pack4n * pack4n);
+        else
+            kernel_tm.create(maxk, inch, outch / pack4n + (outch % pack4n) / packn + outch % packn, (size_t)pack4n, pack4n);
+    }
+    else if (outch >= packn)
+    {
+        if (inch >= pack4n)
+            kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch / packn + outch % packn, (size_t)(pack4n * packn), pack4n * packn);
+        else
+            kernel_tm.create(maxk, inch, outch / packn + outch % packn, (size_t)packn, packn);
+    }
+    else
+#endif // __riscv_vector
+    {
+#if __riscv_vector
+        if (inch >= pack4n)
+            kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch, (size_t)pack4n, pack4n);
+        else
+#endif // __riscv_vector
+            kernel_tm.create(maxk, inch, outch, (size_t)1u, 1);
+    }
+    // *INDENT-ON*
+    // clang-format on
+
+    int q = 0;
+#if __riscv_vector
+    for (; q + pack4n - 1 < outch; q += pack4n)
+    {
+        const signed char* kptr = (const signed char*)kernel + q * inch * maxk;
+        signed char* g00 = kernel_tm.channel(q / pack4n);
+
+        int p = 0;
+        for (; p + pack4n - 1 < inch; p += pack4n)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                for (size_t i = 0; i < pack4n; i++)
+                {
+                    const signed char* src = kptr + (p + i) * maxk + k;
+                    vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm4);
+                    __riscv_vse8_v_i8m1(g00, row, vlm4);
+                    g00 += pack4n;
+                }
+            }
+        }
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                const signed char* src = kptr + p * maxk + k;
+                vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm4);
+                __riscv_vse8_v_i8m1(g00, row, vlm4);
+                g00 += pack4n;
+            }
+        }
+    }
+    for (; q + packn - 1 < outch; q += packn)
+    {
+        const signed char* kptr = (const signed char*)kernel + q * inch * maxk;
+        signed char* g00 = kernel_tm.channel(q / pack4n + (q % pack4n) / packn);
+        int p = 0;
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                const signed char* src = kptr + p * maxk + k;
+                vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm1);
+                __riscv_vse8_v_i8m1(g00, row, vlm1);
+                g00 += packn;
+            }
+        }
+    }
+#endif // __riscv_vector
+    for (; q < outch; q++)
+    {
+        const signed char* kptr = (const signed char*)kernel + q * inch * maxk;
+#if __riscv_vector
+        signed char* g00 = kernel_tm.channel(q / pack4n + (q % pack4n) / packn + q % packn);
+#else
+        signed char* g00 = kernel_tm.channel(q);
+#endif
+
+        int p = 0;
+#if __riscv_vector
+        for (; p + pack4n - 1 < inch; p += pack4n)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                const signed char* k0 = kptr + k;
+
+                for (size_t i = 0; i < pack4n; i++)
+                {
+                    g00[0] = k0[0];
+                    k0 += maxk;
+                    g00 += 1;
+                }
+            }
+            kptr += maxk * pack4n;
+        }
+#endif // __riscv_vector
+        for (; p < inch; p++)
+        {
+            for (int k = 0; k < maxk; k++)
+            {
+                g00[0] = kptr[0];
+                g00++;
+                kptr++;
+            }
+        }
+    }
+    return;
+}
+
+static void convolution_packed_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt)
+{
+    const int w = bottom_blob.w;
+    const int elempack = bottom_blob.elempack;
+    const int inch = bottom_blob.c * elempack;
+
+    const int outw = top_blob.w;
+    const int outh = top_blob.h;
+    const int out_elempack = top_blob.elempack;
+    const int outch = top_blob.c * out_elempack;
+#if __riscv_vector
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+    const size_t vlm4 = __riscv_vsetvlmax_e32m4();
+
+    const int pack4n = (int)vlm4;
+    const int packn = (int)vlm1;
+    const size_t N = (elempack == pack4n) ? 1 : bottom_blob.cstep * elempack;
+    const size_t M = top_blob.cstep * out_elempack;
+#endif
+    // kernel offsets
+    const int maxk = kernel_w * kernel_h;
+    std::vector<int> _space_ofs(maxk);
+    int* space_ofs = &_space_ofs[0];
+    {
+        int p1 = 0;
+        int p2 = 0;
+        int gap = w * dilation_h - kernel_w * dilation_w;
+        for (int i = 0; i < kernel_h; i++)
+        {
+            for (int j = 0; j < kernel_w; j++)
+            {
+                space_ofs[p1] = p2 * elempack;
+                p1++;
+                p2 += dilation_w;
+            }
+            p2 += gap;
+        }
+    }
+
+    int nn_outch = 0;
+    int remain_outch_start = 0;
+#if __riscv_vector
+    nn_outch = (outch - remain_outch_start) / pack4n;
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_outch; pp++)
+    {
+        const int p = remain_outch_start + pp * pack4n;
+        int* outptr = top_blob.channel(p / out_elempack);
+
+        int ij = 0;
+        for (; ij + 1 < outw * outh; ij += 2)
+        {
+            const int i0 = ij / outw;
+            const int j0 = ij % outw;
+            const int i1 = (ij + 1) / outw;
+            const int j1 = (ij + 1) % outw;
+
+            vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vlm4);
+            vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vlm4);
+            const signed char* kptr = weight_data_tm.channel(p / pack4n);
+
+            int q = 0;
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
+                const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    const signed char* r1s = r1 + space_ofs[k];
+                    for (int l = 0; l < pack4n; l++)
+                    {
+                        vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                        vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vlm4);
+                        vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[l * N], vlm4);
+                        _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4);
+                        _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4);
+
+                        kptr += pack4n;
+                    }
+                }
+            }
+
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w;
+                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    const signed char* r1s = r1 + space_ofs[k];
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                    vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[0], vlm4);
+                    vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[0], vlm4);
+                    _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4);
+                    _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4);
+
+                    kptr += pack4n;
+                }
+            }
+
+            if (out_elempack == packn)
+            {
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum0, 0), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum0, 1), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum0, 2), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum0, 3), vlm1);
+                outptr += packn;
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum1, 0), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum1, 1), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum1, 2), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum1, 3), vlm1);
+                outptr += packn;
+            }
+
+            if (out_elempack == 1)
+            {
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum0, vlm4);
+                outptr += 1;
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum1, vlm4);
+                outptr += 1;
+            }
+        }
+
+        for (; ij < outw * outh; ij++)
+        {
+            const int i = ij / outw;
+            const int j = ij % outw;
+
+            vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vlm4);
+            const signed char* kptr = weight_data_tm.channel(p / pack4n);
+
+            int q = 0;
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    for (int l = 0; l < pack4n; l++)
+                    {
+                        vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                        vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vlm4);
+                        _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4);
+
+                        kptr += pack4n;
+                    }
+                }
+            }
+
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                    vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[0], vlm4);
+                    _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4);
+
+                    kptr += pack4n;
+                }
+            }
+
+            if (out_elempack == packn)
+            {
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum, 0), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum, 1), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum, 2), vlm1);
+                __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum, 3), vlm1);
+                outptr += packn;
+            }
+
+            if (out_elempack == 1)
+            {
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum, vlm4);
+                outptr += 1;
+            }
+        }
+    }
+    remain_outch_start += nn_outch * pack4n;
+
+    nn_outch = (outch - remain_outch_start) / packn;
+    const size_t vl = __riscv_vsetvl_e8m1(packn);
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int pp = 0; pp < nn_outch; pp++)
+    {
+        const int p = remain_outch_start + pp * packn;
+        int* outptr = top_blob.channel(p / out_elempack);
+
+        int ij = 0;
+        for (; ij + 1 < outw * outh; ij += 2)
+        {
+            const int i0 = ij / outw;
+            const int j0 = ij % outw;
+            const int i1 = (ij + 1) / outw;
+            const int j1 = (ij + 1) % outw;
+
+            vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vl);
+            vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vl);
+            const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn);
+
+            int q = 0;
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
+                const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;
+                for (int l = 0; l < pack4n; l++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        const signed char* r0s = r0 + space_ofs[k];
+                        const signed char* r1s = r1 + space_ofs[k];
+                        vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl);
+                        vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vl);
+                        vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[l * N], vl);
+                        _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vl);
+                        _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vl);
+
+                        kptr += vl;
+                    }
+                }
+            }
+
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
+                const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    const signed char* r1s = r1 + space_ofs[k];
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl);
+                    vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[0], vl);
+                    vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[0], vl);
+                    _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vl);
+                    _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vl);
+
+                    kptr += vl;
+                }
+            }
+
+            if (out_elempack == packn)
+            {
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum0, 0), vl);
+                outptr += packn;
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum1, 0), vl);
+                outptr += packn;
+            }
+
+            if (out_elempack == 1)
+            {
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum0, vl);
+                outptr += 1;
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum1, vl);
+                outptr += 1;
+            }
+        }
+
+        for (; ij < outw * outh; ij++)
+        {
+            const int i = ij / outw;
+            const int j = ij % outw;
+
+            vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vl);
+            const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn);
+
+            int q = 0;
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;
+                for (int l = 0; l < pack4n; l++)
+                {
+                    for (int k = 0; k < maxk; k++)
+                    {
+                        const signed char* r0s = r0 + space_ofs[k];
+                        vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl);
+                        vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vl);
+                        _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vl);
+
+                        kptr += vl;
+                    }
+                }
+            }
+
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl);
+                    vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[0], vl);
+                    _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vl);
+
+                    kptr += vl;
+                }
+            }
+
+            if (out_elempack == packn)
+            {
+                __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum, 0), vl);
+                outptr += packn;
+            }
+
+            if (out_elempack == 1)
+            {
+                __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum, vl);
+                outptr += 1;
+            }
+        }
+    }
+
+    remain_outch_start += nn_outch * packn;
+#endif // __riscv_vector
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int p = remain_outch_start; p < outch; p++)
+    {
+        int* outptr = top_blob.channel(p);
+
+        int ij = 0;
+        for (; ij + 1 < outw * outh; ij += 2)
+        {
+            const int i0 = ij / outw;
+            const int j0 = ij % outw;
+            const int i1 = (ij + 1) / outw;
+            const int j1 = (ij + 1) % outw;
+
+            int sum0 = 0;
+            int sum1 = 0;
+#if __riscv_vector
+            const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn + p % packn);
+#else
+            const signed char* kptr = weight_data_tm.channel(p);
+#endif
+            int q = 0;
+#if __riscv_vector
+            vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vlm4);
+            vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vlm4);
+
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
+                const signed char* r1 = bottom_blob.channel(q / elempack).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    const signed char* r1s = r1 + space_ofs[k];
+                    vint8m1_t _r0 = __riscv_vlse8_v_i8m1(r0s, N, vlm4);
+                    vint8m1_t _r1 = __riscv_vlse8_v_i8m1(r1s, N, vlm4);
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                    vint16m2_t _s0 = __riscv_vwmul_vv_i16m2(_w, _r0, vlm4);
+                    vint16m2_t _s1 = __riscv_vwmul_vv_i16m2(_w, _r1, vlm4);
+                    _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4);
+                    _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4);
+
+                    kptr += pack4n;
+                }
+            }
+
+            vint32m1_t _sum00 = __riscv_vmv_v_x_i32m1(0, vlm1);
+            _sum00 = __riscv_vredsum_vs_i32m4_i32m1(_sum0, _sum00, vlm4);
+            sum0 += __riscv_vmv_x_s_i32m1_i32(_sum00);
+
+            vint32m1_t _sum11 = __riscv_vmv_v_x_i32m1(0, vlm1);
+            _sum11 = __riscv_vredsum_vs_i32m4_i32m1(_sum1, _sum11, vlm4);
+            sum1 += __riscv_vmv_x_s_i32m1_i32(_sum11);
+#endif
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i0 * stride_h) + j0 * stride_w * elempack;
+                const signed char* r1 = bottom_blob.channel(q).row<const signed char>(i1 * stride_h) + j1 * stride_w * elempack;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    const signed char* r1s = r1 + space_ofs[k];
+                    sum0 += r0s[0] * kptr[0];
+                    sum1 += r1s[0] * kptr[0];
+                    kptr++;
+                }
+            }
+            outptr[0] = sum0;
+            outptr[1] = sum1;
+            outptr += 2;
+        }
+
+        for (; ij < outw * outh; ij++)
+        {
+            const int i = ij / outw;
+            const int j = ij % outw;
+
+            int sum = 0;
+#if __riscv_vector
+            const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn + p % packn);
+#else
+            const signed char* kptr = weight_data_tm.channel(p);
+#endif
+            int q = 0;
+#if __riscv_vector
+            vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vlm4);
+
+            for (; q + pack4n - 1 < inch; q += pack4n)
+            {
+                const signed char* r0 = bottom_blob.channel(q / elempack).row<const signed char>(i * stride_h) + j * stride_w * elempack;
+
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    vint8m1_t _r = __riscv_vlse8_v_i8m1(r0s, N, vlm4);
+                    vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4);
+                    vint16m2_t _s = __riscv_vwmul_vv_i16m2(_w, _r, vlm4);
+                    _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4);
+                    kptr += pack4n;
+                }
+            }
+
+            vint32m1_t _sum0 = __riscv_vmv_v_x_i32m1(0, vlm1);
+            _sum0 = __riscv_vredsum_vs_i32m4_i32m1(_sum, _sum0, vlm4);
+            sum += __riscv_vmv_x_s_i32m1_i32(_sum0);
+#endif
+            for (; q < inch; q++)
+            {
+                const signed char* r0 = bottom_blob.channel(q).row<const signed char>(i * stride_h) + j * stride_w;
+                for (int k = 0; k < maxk; k++)
+                {
+                    const signed char* r0s = r0 + space_ofs[k];
+                    sum += r0s[0] * kptr[0];
+                    kptr++;
+                }
+            }
+            outptr[0] = sum;
+            outptr += 1;
+        }
+    }
+
+    return;
+}
\ No newline at end of file
diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp
index a45e8bc6223a..aacbad129ede 100644
--- a/src/layer/riscv/convolution_riscv.cpp
+++ b/src/layer/riscv/convolution_riscv.cpp
@@ -21,6 +21,10 @@ namespace ncnn {
 #include "convolution_3x3_winograd.h"
 #include "convolution_packed.h"
 
+#if NCNN_INT8
+#include "convolution_packed_int8.h"
+#endif // NCNN_INT8
+
 #if __riscv_vector
 #include "convolution_3x3_pack1ton.h"
 #include "convolution_7x7_pack1ton.h"
@@ -54,8 +58,7 @@ int Convolution_riscv::create_pipeline(const Option& opt)
 #if NCNN_INT8
     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
     {
-        // TODO implement int8
-        return 0;
+        return create_pipeline_int8_rvv(opt);
     }
 #endif
 
@@ -201,31 +204,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti
 #if NCNN_INT8
     if (opt.use_int8_inference && int8_scale_term)
     {
-        Mat bottom_blob_unpacked = bottom_blob;
-        if (bottom_blob.elempack != 1)
-        {
-            Option opt_pack1 = opt;
-            opt_pack1.blob_allocator = opt.workspace_allocator;
-
-            convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
-            if (bottom_blob_unpacked.empty())
-                return -100;
-        }
-
-        Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked;
-        if (bottom_blob_unpacked.elembits() == 16)
-        {
-            Option opt_pack1 = opt;
-            opt_pack1.blob_allocator = opt.workspace_allocator;
-
-            cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1);
-            if (bottom_blob_unpacked_fp32.empty())
-                return -100;
-        }
-
-        Option opt_unpacked = opt;
-        opt_unpacked.use_packing_layout = false;
-        return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked);
+        return forward_int8_rvv(bottom_blob, top_blob, opt);
     }
 #endif
 
@@ -545,4 +524,123 @@ int Convolution_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector
     return 0;
 }
 
+#if NCNN_INT8
+int Convolution_riscv::create_pipeline_int8_rvv(const Option& opt)
+{
+    const int maxk = kernel_w * kernel_h;
+    const int num_input = weight_data_size / maxk / num_output;
+
+    // TODO: implement kernel transform for winograd, sgemm, etc
+    convolution_transform_kernel_packed_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h);
+
+    scale_in_data.create(num_output);
+    for (int p = 0; p < num_output; p++)
+    {
+        // requantize and relu
+        float scale_in;
+        if (weight_data_int8_scales[p] == 0)
+            scale_in = 0;
+        else
+            scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);
+
+        scale_in_data[p] = scale_in;
+    }
+
+    if (opt.lightmode)
+        weight_data.release();
+
+    return 0;
+}
+
+int Convolution_riscv::forward_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+#if __riscv_vector
+    const int packn = csrr_vlenb() / 4;
+    const int packn_s8 = csrr_vlenb();
+#endif // __riscv_vector
+
+    int elembits = bottom_blob.elembits();
+
+    Mat bottom_blob_int8 = bottom_blob;
+    if (elembits != 8)
+    {
+        Option opt_q = opt;
+        opt_q.blob_allocator = opt.workspace_allocator;
+        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
+        if (bottom_blob_int8.empty())
+            return -100;
+    }
+
+    Mat bottom_blob_bordered;
+    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
+    if (bottom_blob_bordered.empty())
+        return -100;
+
+    int w = bottom_blob_bordered.w;
+    int h = bottom_blob_bordered.h;
+    int channels = bottom_blob.c;
+    size_t elemsize = bottom_blob.elemsize;
+    int elempack = bottom_blob_bordered.elempack;
+
+    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
+    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
+
+    int outw = (w - kernel_extent_w) / stride_w + 1;
+    int outh = (h - kernel_extent_h) / stride_h + 1;
+
+    bool use_int8_requantize = int8_scale_term > 100;
+    int out_elempack = 1;
+    int out_elempack_int32 = 1;
+#if __riscv_vector
+    if (opt.use_packing_layout)
+    {
+        if (use_int8_requantize)
+        {
+            out_elempack = num_output % packn_s8 == 0 ? packn_s8 : 1;
+        }
+        else
+        {
+            out_elempack = num_output % packn == 0 ? packn : 1;
+        }
+        out_elempack_int32 = num_output % packn == 0 ? packn : 1;
+    }
+#endif // __riscv_vector
+    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
+#if NCNN_ZFH
+    if (support_fp16_storage && opt.use_fp16_storage)
+    {
+        out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack;
+    }
+#endif // NCNN_ZFH
+
+    Mat top_blob_int32;
+    top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator);
+    if (top_blob_int32.empty())
+        return -100;
+
+    // TODO: Implement winograd, sgemm, etc
+    convolution_packed_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
+    bottom_blob_bordered.release();
+
+    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+    if (use_int8_requantize)
+    {
+        requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
+    }
+    else
+    {
+        dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);
+
+        if (activation)
+        {
+            activation->forward_inplace(top_blob, opt);
+        }
+    }
+    return 0;
+}
+#endif // NCNN_INT8
+
 } // namespace ncnn
diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h
index 2d8215f86159..1664d63df0e6 100644
--- a/src/layer/riscv/convolution_riscv.h
+++ b/src/layer/riscv/convolution_riscv.h
@@ -26,6 +26,10 @@ class Convolution_riscv : public Convolution
     int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
     int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
 #endif
+#if NCNN_INT8
+    int create_pipeline_int8_rvv(const Option& opt);
+    int forward_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const;
+#endif
 
 public:
     Layer* activation;
@@ -40,6 +44,10 @@ class Convolution_riscv : public Convolution
 
     // fp16
     Mat bias_data_fp16;
+
+#if NCNN_INT8
+    Mat scale_in_data;
+#endif
 };
 
 } // namespace ncnn
diff --git a/src/layer/riscv/dequantize_riscv_zfh.cpp b/src/layer/riscv/dequantize_riscv_zfh.cpp
index 29c3de64b4e8..dd6cdaaeed95 100644
--- a/src/layer/riscv/dequantize_riscv_zfh.cpp
+++ b/src/layer/riscv/dequantize_riscv_zfh.cpp
@@ -12,6 +12,210 @@
 
 namespace ncnn {
 #if NCNN_ZFH
+#if __riscv_vector
+static void dequantize_packnton_f16_fp16s(const int* ptr0, const int* ptr1, __fp16* f16ptr, const Mat& scale_data, const Mat& bias_data, int elemcount)
+{
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+    const size_t vlm2 = __riscv_vsetvlmax_e32m2();
+    const size_t vlm4 = __riscv_vsetvlmax_e32m4();
+    const size_t vlm8 = __riscv_vsetvlmax_e32m8();
+
+    float scale = scale_data[0];
+    vfloat32m2_t _scale0 = __riscv_vfmv_v_f_f32m2(scale, vlm2);
+    if (scale_data.w > 1)
+    {
+        _scale0 = __riscv_vle32_v_f32m2(scale_data, vlm2);
+    }
+    vfloat32m4_t _scale1 = __riscv_vcreate_v_f32m2_f32m4(_scale0, _scale0);
+    vfloat32m8_t _scale2 = __riscv_vcreate_v_f32m4_f32m8(_scale1, _scale1);
+
+    int i = 0;
+    if (bias_data.w == 0)
+    {
+        for (; i + 3 < elemcount; i += 4)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 2, vlm1);
+            vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 2, vlm1);
+            vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 3, vlm1);
+            vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 3, vlm1);
+            vint32m8_t _v = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vlm8);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale2, vlm8);
+            __riscv_vse16_v_f16m4(f16ptr, __riscv_vfncvt_f_f_w_f16m4(_vf, vlm8), vlm8);
+
+            ptr0 += vlm1 * 4;
+            ptr1 += vlm1 * 4;
+            f16ptr += vlm8;
+        }
+
+        for (; i + 1 < elemcount; i += 2)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m4_t _v = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3);
+            vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_v, vlm4);
+            _vf = __riscv_vfmul_vv_f32m4(_vf, _scale1, vlm4);
+            __riscv_vse16_v_f16m2(f16ptr, __riscv_vfncvt_f_f_w_f16m2(_vf, vlm4), vlm4);
+
+            ptr0 += vlm1 * 2;
+            ptr1 += vlm1 * 2;
+            f16ptr += vlm4;
+        }
+
+        for (; i < elemcount; i++)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m2_t _v = __riscv_vcreate_v_i32m1_i32m2(_v0, _v1);
+            vfloat32m2_t _vf = __riscv_vfcvt_f_x_v_f32m2(_v, vlm2);
+            _vf = __riscv_vfmul_vv_f32m2(_vf, _scale0, vlm2);
+            __riscv_vse16_v_f16m1(f16ptr, __riscv_vfncvt_f_f_w_f16m1(_vf, vlm2), vlm2);
+
+            ptr0 += vlm1;
+            ptr1 += vlm1;
+            f16ptr += vlm2;
+        }
+    }
+    else
+    {
+        float bias = bias_data[0];
+        vfloat32m2_t _bias0 = __riscv_vfmv_v_f_f32m2(bias, vlm2);
+        if (bias_data.w > 1)
+        {
+            _bias0 = __riscv_vle32_v_f32m2(bias_data, vlm2);
+        }
+        vfloat32m4_t _bias1 = __riscv_vcreate_v_f32m2_f32m4(_bias0, _bias0);
+        vfloat32m8_t _bias2 = __riscv_vcreate_v_f32m4_f32m8(_bias1, _bias1);
+
+        for (; i + 3 < elemcount; i += 4)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 2, vlm1);
+            vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 2, vlm1);
+            vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 3, vlm1);
+            vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 3, vlm1);
+            vint32m8_t _v = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vlm8);
+            _vf = __riscv_vfmacc_vv_f32m8(_bias2, _vf, _scale2, vlm8);
+            __riscv_vse16_v_f16m4(f16ptr, __riscv_vfncvt_f_f_w_f16m4(_vf, vlm8), vlm8);
+
+            ptr0 += vlm1 * 4;
+            ptr1 += vlm1 * 4;
+            f16ptr += vlm8;
+        }
+
+        for (; i + 1 < elemcount; i += 2)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m4_t _v = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3);
+            vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_v, vlm4);
+            _vf = __riscv_vfmacc_vv_f32m4(_bias1, _vf, _scale1, vlm4);
+            __riscv_vse16_v_f16m2(f16ptr, __riscv_vfncvt_f_f_w_f16m2(_vf, vlm4), vlm4);
+
+            ptr0 += vlm1 * 2;
+            ptr1 += vlm1 * 2;
+            f16ptr += vlm4;
+        }
+
+        for (; i < elemcount; i++)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m2_t _v = __riscv_vcreate_v_i32m1_i32m2(_v0, _v1);
+            vfloat32m2_t _vf = __riscv_vfcvt_f_x_v_f32m2(_v, vlm2);
+            _vf = __riscv_vfmacc_vv_f32m2(_bias0, _vf, _scale0, vlm2);
+            __riscv_vse16_v_f16m1(f16ptr, __riscv_vfncvt_f_f_w_f16m1(_vf, vlm2), vlm2);
+
+            ptr0 += vlm1;
+            ptr1 += vlm1;
+            f16ptr += vlm2;
+        }
+    }
+}
+
+static void dequantize_packnto1_fp16s(const int* intptr, __fp16* f16ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int stride)
+{
+    const size_t vlm8 = __riscv_vsetvlmax_e32m8();
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+
+    float scale = scale_data[0];
+    vfloat32m8_t _scale = __riscv_vfmv_v_f_f32m8(scale, vlm8);
+    if (scale_data.w > 1)
+    {
+        vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_data, vlm1);
+        _scale = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s);
+    }
+
+    __fp16 tmp[vlm8];
+    int n = elemcount * vlm1;
+    if (bias_data.w == 0)
+    {
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(n);
+            vint32m8_t _v = __riscv_vle32_v_i32m8(intptr, vl);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vl);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale, vl);
+            __riscv_vse16_v_f16m4(tmp, __riscv_vfncvt_f_f_w_f16m4(_vf, vl), vl);
+            for (size_t j = 0; j < (vl / vlm1); j++)
+            {
+                for (int i = 0; i < vlm1; i++)
+                {
+                    f16ptr[i * stride] = tmp[j * vlm1 + i];
+                }
+                f16ptr++;
+            }
+
+            intptr += vl;
+            n -= vl;
+        }
+    }
+    else
+    {
+        float bias = bias_data[0];
+        vfloat32m8_t _bias = __riscv_vfmv_v_f_f32m8(bias, vlm8);
+        if (bias_data.w > 1)
+        {
+            vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1);
+            _bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b);
+        }
+
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(n);
+            vint32m8_t _v = __riscv_vle32_v_i32m8(intptr, vl);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vl);
+            _vf = __riscv_vfmacc_vv_f32m8(_bias, _vf, _scale, vl);
+            __riscv_vse16_v_f16m4(tmp, __riscv_vfncvt_f_f_w_f16m4(_vf, vl), vl);
+
+            for (size_t j = 0; j < (vl / vlm1); j++)
+            {
+                for (int i = 0; i < vlm1; i++)
+                {
+                    f16ptr[i * stride] = tmp[j * vlm1 + i];
+                }
+                f16ptr++;
+            }
+
+            intptr += vl;
+            n -= vl;
+        }
+    }
+}
+#endif // __riscv_vector
+
 static void dequantize_fp16s(const int* intptr, __fp16* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack)
 {
     const int size = elemcount * elempack;
@@ -93,11 +297,24 @@ int Dequantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
     const int d = bottom_blob.d;
     const int channels = bottom_blob.c;
     const int elempack = bottom_blob.elempack;
-    const size_t out_elemsize = elempack * 2u;
 
+#if __riscv_vector
+    const int packn = csrr_vlenb() / 4;
+    const int packn_f16 = csrr_vlenb() / 2;
+#endif // __riscv_vector
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % packn_f16 == 0 ? packn_f16 : 1;
+        }
+#endif
+        const int outw = w * elempack / out_elempack;
+        size_t out_elemsize = out_elempack * 2u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -118,40 +335,127 @@ int Dequantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % packn_f16 == 0 ? packn_f16 : 1;
+        }
+#endif
+        const int outh = h * elempack / out_elempack;
+        size_t out_elemsize = out_elempack * 2u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
+#if __riscv_vector
+        if (elempack == packn && out_elempack == packn_f16)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const int* ptr0 = bottom_blob.row<const int>(i * 2);
+                const int* ptr1 = bottom_blob.row<const int>(i * 2 + 1);
+                __fp16* f16ptr = top_blob.row<__fp16>(i);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data;
+
+                dequantize_packnton_f16_fp16s(ptr0, ptr1, f16ptr, scale_data_i, bias_data_i, w);
+            }
+        }
+
+        if (elempack == packn && out_elempack == 1)
         {
-            const int* intptr = bottom_blob.row<const int>(i);
-            __fp16* ptr = top_blob.row<__fp16>(i);
-            const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
-            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                __fp16* f16ptr = top_blob.row<__fp16>(i * packn);
+
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
 
-            dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
+                dequantize_packnto1_fp16s(intptr, f16ptr, scale_data_i, bias_data_i, w, w);
+            }
+        }
+#endif // __riscv_vector
+        if (elempack == 1 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                __fp16* ptr = top_blob.row<__fp16>(i);
+                const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+
+                dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack);
+            }
         }
     }
 
     if (dims == 3 || dims == 4)
     {
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % packn_f16 == 0 ? packn_f16 : 1;
+        }
+#endif
+        const int outc = channels * elempack / out_elempack;
+        size_t out_elemsize = out_elempack * 2u;
+
         if (dims == 3)
-            top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         else
-            top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
+            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
+#if __riscv_vector
+        if (elempack == packn && out_elempack == packn_f16)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < outc; q++)
+            {
+                const int* ptr0 = bottom_blob.channel(q * 2);
+                const int* ptr1 = bottom_blob.channel(q * 2 + 1);
+                __fp16* f16ptr = top_blob.channel(q);
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data;
+
+                dequantize_packnton_f16_fp16s(ptr0, ptr1, f16ptr, scale_data_q, bias_data_q, w * h * d);
+            }
+        }
+
+        if (elempack == packn && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                __fp16* f16ptr = top_blob.channel(q * packn);
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+
+                dequantize_packnto1_fp16s(intptr, f16ptr, scale_data_q, bias_data_q, w * h * d, top_blob.cstep);
+            }
+        }
+#endif // __riscv_vector
+        if (elempack == 1 && out_elempack == 1)
         {
-            const int* intptr = bottom_blob.channel(q);
-            __fp16* ptr = top_blob.channel(q);
-            const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
-            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                __fp16* ptr = top_blob.channel(q);
+                const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
 
-            dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h * d, elempack);
+                dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h * d, elempack);
+            }
         }
     }
 
diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp
index eaee285864c8..aaed417f9db7 100644
--- a/src/layer/riscv/requantize_riscv.cpp
+++ b/src/layer/riscv/requantize_riscv.cpp
@@ -19,6 +19,217 @@ Requantize_riscv::Requantize_riscv()
 #endif // __riscv_vector
 }
 
+#if __riscv_vector
+static void requantize_packnton_s8(const int* ptr0, const int* ptr1, const int* ptr2, const int* ptr3, signed char* s8ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount)
+{
+    const size_t vlm8 = __riscv_vsetvlmax_e32m8();
+    const size_t vlm4 = __riscv_vsetvlmax_e32m4();
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+
+    float scale_in = scale_in_data[0];
+    float scale_out = scale_out_data[0];
+
+    vfloat32m4_t _scale_in0 = __riscv_vfmv_v_f_f32m4(scale_in, vlm4);
+    if (scale_in_data.w > 1)
+    {
+        _scale_in0 = __riscv_vle32_v_f32m4(scale_in_data, vlm4);
+    }
+
+    vfloat32m4_t _scale_out0 = __riscv_vfmv_v_f_f32m4(scale_out, vlm4);
+    if (scale_out_data.w > 1)
+    {
+        _scale_out0 = __riscv_vle32_v_f32m4(scale_out_data, vlm4);
+    }
+
+    vfloat32m8_t _scale_in = __riscv_vcreate_v_f32m4_f32m8(_scale_in0, _scale_in0);
+    vfloat32m8_t _scale_out = __riscv_vcreate_v_f32m4_f32m8(_scale_out0, _scale_out0);
+
+    if (bias_data.w == 0)
+    {
+        int i = 0;
+        for (; i + 1 < elemcount; i += 2)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1);
+            vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr2 + vlm1, vlm1);
+            vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr3 + vlm1, vlm1);
+            vint32m8_t _vi = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_vi, vlm8);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_in, vlm8);
+            _vf = activation_ps(_vf, activation_type, activation_params, vlm8);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vlm8);
+            __riscv_vse8_v_i8m2(s8ptr, float2int8(_vf, vlm8), vlm8);
+
+            ptr0 += vlm1 * 2;
+            ptr1 += vlm1 * 2;
+            ptr2 += vlm1 * 2;
+            ptr3 += vlm1 * 2;
+            s8ptr += vlm8;
+        }
+
+        for (; i < elemcount; i++)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1);
+            vint32m4_t _vi = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3);
+            vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_vi, vlm4);
+            _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_in0, vlm4);
+            _vf = activation_ps(_vf, activation_type, activation_params, vlm4);
+            _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_out0, vlm4);
+            __riscv_vse8_v_i8m1(s8ptr, float2int8(_vf, vlm4), vlm4);
+
+            ptr0 += vlm1;
+            ptr1 += vlm1;
+            ptr2 += vlm1;
+            ptr3 += vlm1;
+            s8ptr += vlm4;
+        }
+    }
+    else
+    {
+        float bias = bias_data[0];
+        vfloat32m4_t _bias0 = __riscv_vfmv_v_f_f32m4(bias, vlm4);
+        if (bias_data.w > 1)
+        {
+            _bias0 = __riscv_vle32_v_f32m4(bias_data, vlm4);
+        }
+        vfloat32m8_t _bias = __riscv_vcreate_v_f32m4_f32m8(_bias0, _bias0);
+
+        int i = 0;
+        for (; i + 1 < elemcount; i += 2)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1);
+            vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1);
+            vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1);
+            vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr2 + vlm1, vlm1);
+            vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr3 + vlm1, vlm1);
+            vint32m8_t _vi = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_vi, vlm8);
+            _vf = __riscv_vfmadd_vv_f32m8(_vf, _scale_in, _bias, vlm8);
+            _vf = activation_ps(_vf, activation_type, activation_params, vlm8);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vlm8);
+            __riscv_vse8_v_i8m2(s8ptr, float2int8(_vf, vlm8), vlm8);
+
+            ptr0 += vlm1 * 2;
+            ptr1 += vlm1 * 2;
+            ptr2 += vlm1 * 2;
+            ptr3 += vlm1 * 2;
+            s8ptr += vlm8;
+        }
+
+        for (; i < elemcount; i++)
+        {
+            vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1);
+            vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1);
+            vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1);
+            vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1);
+            vint32m4_t _vi = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3);
+            vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_vi, vlm4);
+            _vf = __riscv_vfmadd_vv_f32m4(_vf, _scale_in0, _bias0, vlm4);
+            _vf = activation_ps(_vf, activation_type, activation_params, vlm4);
+            _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_out0, vlm4);
+            __riscv_vse8_v_i8m1(s8ptr, float2int8(_vf, vlm4), vlm4);
+
+            ptr0 += vlm1;
+            ptr1 += vlm1;
+            ptr2 += vlm1;
+            ptr3 += vlm1;
+            s8ptr += vlm4;
+        }
+    }
+}
+
+static void requantize_packnto1(const int* ptr, signed char* s8ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int stride)
+{
+    const size_t vlm8 = __riscv_vsetvlmax_e32m8();
+    const size_t vlm1 = __riscv_vsetvlmax_e32m1();
+
+    float scale_in = scale_in_data[0];
+    float scale_out = scale_out_data[0];
+
+    vfloat32m8_t _scale_in = __riscv_vfmv_v_f_f32m8(scale_in, vlm8);
+    if (scale_in_data.w > 1)
+    {
+        vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_in_data, vlm1);
+        _scale_in = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s);
+    }
+
+    vfloat32m8_t _scale_out = __riscv_vfmv_v_f_f32m8(scale_out, vlm8);
+    if (scale_out_data.w > 1)
+    {
+        vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_out_data, vlm1);
+        _scale_out = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s);
+    }
+
+    signed char tmp[vlm8];
+    int n = elemcount * vlm1;
+
+    if (bias_data.w == 0)
+    {
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(n);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(ptr, vl), vl);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_in, vl);
+            _vf = activation_ps(_vf, activation_type, activation_params, vl);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vl);
+            __riscv_vse8_v_i8m2(tmp, float2int8(_vf, vl), vl);
+            for (size_t j = 0; j < (vl / vlm1); j++)
+            {
+                for (int i = 0; i < vlm1; i++)
+                {
+                    s8ptr[i * stride] = tmp[j * vlm1 + i];
+                }
+                s8ptr++;
+            }
+
+            ptr += vl;
+            n -= vl;
+        }
+    }
+    else
+    {
+        float bias = bias_data[0];
+        vfloat32m8_t _bias = __riscv_vfmv_v_f_f32m8(bias, vlm8);
+        if (bias_data.w > 1)
+        {
+            vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1);
+            _bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b);
+        }
+
+        while (n > 0)
+        {
+            size_t vl = __riscv_vsetvl_e32m8(n);
+            vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(ptr, vl), vl);
+            _vf = __riscv_vfmadd_vv_f32m8(_vf, _scale_in, _bias, vl);
+            _vf = activation_ps(_vf, activation_type, activation_params, vl);
+            _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vl);
+            __riscv_vse8_v_i8m2(tmp, float2int8(_vf, vl), vl);
+            for (size_t j = 0; j < (vl / vlm1); j++)
+            {
+                for (int i = 0; i < vlm1; i++)
+                {
+                    s8ptr[i * stride] = tmp[j * vlm1 + i];
+                }
+                s8ptr++;
+            }
+
+            ptr += vl;
+            n -= vl;
+        }
+    }
+}
+#endif // __riscv_vector
+
 static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack)
 {
     const int bias_data_size = bias_data.w;
@@ -358,9 +569,24 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
     const int elempack = bottom_blob.elempack;
     const size_t out_elemsize = elempack * 1u;
 
+#if __riscv_vector
+    const int packn = csrr_vlenb() / 4;
+    const int packn_s8 = csrr_vlenb();
+#endif
+
     if (dims == 1)
     {
-        top_blob.create(w, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = w * elempack % packn_s8 == 0 ? packn_s8 : 1;
+        }
+#endif
+        const int outw = w * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
@@ -382,44 +608,140 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio
 
     if (dims == 2)
     {
-        top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator);
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = h * elempack % packn_s8 == 0 ? packn_s8 : 1;
+        }
+#endif // __riscv_vector
+        const int outh = h * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
+        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
-
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int i = 0; i < h; i++)
+#if __riscv_vector
+        if (elempack == packn && out_elempack == packn_s8)
         {
-            const int* intptr = bottom_blob.row<const int>(i);
-            signed char* ptr = top_blob.row<signed char>(i);
-
-            const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
-            const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
-            const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < outh; i++)
+            {
+                const int* ptr0 = bottom_blob.row<int>(i * 4);
+                const int* ptr1 = bottom_blob.row<int>(i * 4 + 1);
+                const int* ptr2 = bottom_blob.row<int>(i * 4 + 2);
+                const int* ptr3 = bottom_blob.row<int>(i * 4 + 3);
+                signed char* s8ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data;
+                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data;
+
+                requantize_packnton_s8(ptr0, ptr1, ptr2, ptr3, s8ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w);
+            }
+        }
 
-            requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
+        if (elempack == packn && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const int* ptr = bottom_blob.row<int>(i);
+                signed char* s8ptr = top_blob.row<signed char>(i * packn);
+
+                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+
+                requantize_packnto1(ptr, s8ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, w);
+            }
+        }
+#endif // __riscv_vector
+        if (elempack == 1 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int i = 0; i < h; i++)
+            {
+                const int* intptr = bottom_blob.row<const int>(i);
+                signed char* ptr = top_blob.row<signed char>(i);
+
+                const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data;
+                const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data;
+                const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data;
+
+                requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack);
+            }
         }
     }
 
     if (dims == 3 || dims == 4)
     {
+        int out_elempack = 1;
+#if __riscv_vector
+        if (opt.use_packing_layout)
+        {
+            out_elempack = channels * elempack % packn_s8 == 0 ? packn_s8 : 1;
+        }
+#endif
+        const int outc = channels * elempack / out_elempack;
+        const size_t out_elemsize = out_elempack * 1u;
+
         if (dims == 3)
-            top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator);
+            top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
         else
-            top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator);
+            top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator);
         if (top_blob.empty())
             return -100;
 
-        #pragma omp parallel for num_threads(opt.num_threads)
-        for (int q = 0; q < channels; q++)
+#if __riscv_vector
+        if (elempack == packn && out_elempack == packn_s8)
         {
-            const int* intptr = bottom_blob.channel(q);
-            signed char* ptr = top_blob.channel(q);
-
-            const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
-            const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
-            const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+            for (int q = 0; q < outc; q++)
+            {
+                const int* ptr0 = bottom_blob.channel(q * 4);
+                const int* ptr1 = bottom_blob.channel(q * 4 + 1);
+                const int* ptr2 = bottom_blob.channel(q * 4 + 2);
+                const int* ptr3 = bottom_blob.channel(q * 4 + 3);
+                signed char* s8ptr = top_blob.channel(q);
+
+                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data;
+                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data;
+
+                requantize_packnton_s8(ptr0, ptr1, ptr2, ptr3, s8ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d);
+            }
+        }
 
-            requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, elempack);
+        if (elempack == packn && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const int* ptr = bottom_blob.channel(q);
+                signed char* s8ptr = top_blob.channel(q * packn);
+
+                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+
+                requantize_packnto1(ptr, s8ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, top_blob.cstep);
+            }
+        }
+#endif // __riscv_vector
+        if (elempack == 1 && out_elempack == 1)
+        {
+            #pragma omp parallel for num_threads(opt.num_threads)
+            for (int q = 0; q < channels; q++)
+            {
+                const int* intptr = bottom_blob.channel(q);
+                signed char* ptr = top_blob.channel(q);
+
+                const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data;
+                const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data;
+                const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data;
+
+                requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, elempack);
+            }
         }
     }
 
diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp
index cf579e2c968b..c1d558ec3f80 100644
--- a/tests/test_requantize.cpp
+++ b/tests/test_requantize.cpp
@@ -294,12 +294,17 @@ static int test_requantize_3()
 static int test_requantize_4()
 {
     return 0
+#ifndef __riscv
+           || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 1, 1, 24)
+           || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 24, 24, 0)
+#else
+           || test_requantize(RandomIntMat(5, 3, 2, 24), 1, 1, 24)
+           || test_requantize(RandomIntMat(5, 3, 2, 24), 24, 24, 0)
+#endif
            || test_requantize_pack1(RandomIntMat(5, 3, 2, 12), 1, 1, 12)
            || test_requantize_pack1(RandomIntMat(5, 3, 2, 12), 12, 12, 0)
            || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 1, 13, 13)
-           || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 13, 1, 0)
-           || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 1, 1, 24)
-           || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 24, 24, 0);
+           || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 13, 1, 0);
 }
 
 int main()