diff --git a/src/layer/riscv/convolution_packed_int8.h b/src/layer/riscv/convolution_packed_int8.h new file mode 100644 index 000000000000..7f40c0d82d4b --- /dev/null +++ b/src/layer/riscv/convolution_packed_int8.h @@ -0,0 +1,575 @@ +// Copyright 2026 Tencent +// SPDX-License-Identifier: BSD-3-Clause + +static void convolution_transform_kernel_packed_int8_rvv(const Mat& kernel, Mat& kernel_tm, int inch, int outch, int kernel_w, int kernel_h) +{ + const int maxk = kernel_w * kernel_h; + + // src = kw-kh-inch-outch + // dst = pb-pa-kw-kh-inch/vlm1-outch/vlm1 + + // clang-format off + // *INDENT-OFF* +#if __riscv_vector + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + const size_t vlm4 = __riscv_vsetvlmax_e32m4(); + + const int packn = (int)vlm1; + const int pack4n = (int)vlm4; + + if (outch >= pack4n) + { + if (inch >= pack4n) + kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch / pack4n + (outch % pack4n) / packn + outch % packn, (size_t)(pack4n * pack4n), pack4n * pack4n); + else + kernel_tm.create(maxk, inch, outch / pack4n + (outch % pack4n) / packn + outch % packn, (size_t)pack4n, pack4n); + } + else if (outch >= packn) + { + if (inch >= pack4n) + kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch / packn + outch % packn, (size_t)(pack4n * packn), pack4n * packn); + else + kernel_tm.create(maxk, inch, outch / packn + outch % packn, (size_t)packn, packn); + } + else +#endif // __riscv_vector + { +#if __riscv_vector + if (inch >= pack4n) + kernel_tm.create(maxk, inch / pack4n + inch % pack4n, outch, (size_t)pack4n, pack4n); + else +#endif // __riscv_vector + kernel_tm.create(maxk, inch, outch, (size_t)1u, 1); + } + // *INDENT-ON* + // clang-format on + + int q = 0; +#if __riscv_vector + for (; q + pack4n - 1 < outch; q += pack4n) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + signed char* g00 = kernel_tm.channel(q / pack4n); + + int p = 0; + for (; p + pack4n - 1 < inch; p += pack4n) + { + for (int k = 0; k < maxk; k++) + { + for (size_t i = 0; i < pack4n; i++) + { + const signed char* src = kptr + (p + i) * maxk + k; + vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm4); + __riscv_vse8_v_i8m1(g00, row, vlm4); + g00 += pack4n; + } + } + } + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* src = kptr + p * maxk + k; + vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm4); + __riscv_vse8_v_i8m1(g00, row, vlm4); + g00 += pack4n; + } + } + } + for (; q + packn - 1 < outch; q += packn) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; + signed char* g00 = kernel_tm.channel(q / pack4n + (q % pack4n) / packn); + int p = 0; + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* src = kptr + p * maxk + k; + vint8m1_t row = __riscv_vlse8_v_i8m1(src, inch * maxk, vlm1); + __riscv_vse8_v_i8m1(g00, row, vlm1); + g00 += packn; + } + } + } +#endif // __riscv_vector + for (; q < outch; q++) + { + const signed char* kptr = (const signed char*)kernel + q * inch * maxk; +#if __riscv_vector + signed char* g00 = kernel_tm.channel(q / pack4n + (q % pack4n) / packn + q % packn); +#else + signed char* g00 = kernel_tm.channel(q); +#endif + + int p = 0; +#if __riscv_vector + for (; p + pack4n - 1 < inch; p += pack4n) + { + for (int k = 0; k < maxk; k++) + { + const signed char* k0 = kptr + k; + + for (size_t i = 0; i < pack4n; i++) + { + g00[0] = k0[0]; + k0 += maxk; + g00 += 1; + } + } + kptr += maxk * pack4n; + } +#endif // __riscv_vector + for (; p < inch; p++) + { + for (int k = 0; k < maxk; k++) + { + g00[0] = kptr[0]; + g00++; + kptr++; + } + } + } + return; +} + +static void convolution_packed_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_tm, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) +{ + const int w = bottom_blob.w; + const int elempack = bottom_blob.elempack; + const int inch = bottom_blob.c * elempack; + + const int outw = top_blob.w; + const int outh = top_blob.h; + const int out_elempack = top_blob.elempack; + const int outch = top_blob.c * out_elempack; +#if __riscv_vector + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + const size_t vlm4 = __riscv_vsetvlmax_e32m4(); + + const int pack4n = (int)vlm4; + const int packn = (int)vlm1; + const size_t N = (elempack == pack4n) ? 1 : bottom_blob.cstep * elempack; + const size_t M = top_blob.cstep * out_elempack; +#endif + // kernel offsets + const int maxk = kernel_w * kernel_h; + std::vector _space_ofs(maxk); + int* space_ofs = &_space_ofs[0]; + { + int p1 = 0; + int p2 = 0; + int gap = w * dilation_h - kernel_w * dilation_w; + for (int i = 0; i < kernel_h; i++) + { + for (int j = 0; j < kernel_w; j++) + { + space_ofs[p1] = p2 * elempack; + p1++; + p2 += dilation_w; + } + p2 += gap; + } + } + + int nn_outch = 0; + int remain_outch_start = 0; +#if __riscv_vector + nn_outch = (outch - remain_outch_start) / pack4n; + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * pack4n; + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int j0 = ij % outw; + const int i1 = (ij + 1) / outw; + const int j1 = (ij + 1) % outw; + + vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vlm4); + vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vlm4); + const signed char* kptr = weight_data_tm.channel(p / pack4n); + + int q = 0; + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + for (int l = 0; l < pack4n; l++) + { + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vlm4); + vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[l * N], vlm4); + _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4); + _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4); + + kptr += pack4n; + } + } + } + + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[0], vlm4); + vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[0], vlm4); + _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4); + _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4); + + kptr += pack4n; + } + } + + if (out_elempack == packn) + { + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum0, 0), vlm1); + __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum0, 1), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum0, 2), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum0, 3), vlm1); + outptr += packn; + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum1, 0), vlm1); + __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum1, 1), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum1, 2), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum1, 3), vlm1); + outptr += packn; + } + + if (out_elempack == 1) + { + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum0, vlm4); + outptr += 1; + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum1, vlm4); + outptr += 1; + } + } + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vlm4); + const signed char* kptr = weight_data_tm.channel(p / pack4n); + + int q = 0; + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + for (int l = 0; l < pack4n; l++) + { + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vlm4); + _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4); + + kptr += pack4n; + } + } + } + + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[0], vlm4); + _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4); + + kptr += pack4n; + } + } + + if (out_elempack == packn) + { + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum, 0), vlm1); + __riscv_vse32_v_i32m1(outptr + M, __riscv_vget_v_i32m4_i32m1(_sum, 1), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 2, __riscv_vget_v_i32m4_i32m1(_sum, 2), vlm1); + __riscv_vse32_v_i32m1(outptr + M * 3, __riscv_vget_v_i32m4_i32m1(_sum, 3), vlm1); + outptr += packn; + } + + if (out_elempack == 1) + { + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum, vlm4); + outptr += 1; + } + } + } + remain_outch_start += nn_outch * pack4n; + + nn_outch = (outch - remain_outch_start) / packn; + const size_t vl = __riscv_vsetvl_e8m1(packn); + #pragma omp parallel for num_threads(opt.num_threads) + for (int pp = 0; pp < nn_outch; pp++) + { + const int p = remain_outch_start + pp * packn; + int* outptr = top_blob.channel(p / out_elempack); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int j0 = ij % outw; + const int i1 = (ij + 1) / outw; + const int j1 = (ij + 1) % outw; + + vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vl); + vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vl); + const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn); + + int q = 0; + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + for (int l = 0; l < pack4n; l++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl); + vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vl); + vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[l * N], vl); + _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vl); + + kptr += vl; + } + } + } + + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl); + vint16m2_t _s0 = __riscv_vwmul_vx_i16m2(_w, r0s[0], vl); + vint16m2_t _s1 = __riscv_vwmul_vx_i16m2(_w, r1s[0], vl); + _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vl); + _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vl); + + kptr += vl; + } + } + + if (out_elempack == packn) + { + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum0, 0), vl); + outptr += packn; + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum1, 0), vl); + outptr += packn; + } + + if (out_elempack == 1) + { + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum0, vl); + outptr += 1; + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum1, vl); + outptr += 1; + } + } + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vl); + const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn); + + int q = 0; + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + for (int l = 0; l < pack4n; l++) + { + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl); + vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[l * N], vl); + _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vl); + + kptr += vl; + } + } + } + + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vl); + vint16m2_t _s = __riscv_vwmul_vx_i16m2(_w, r0s[0], vl); + _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vl); + + kptr += vl; + } + } + + if (out_elempack == packn) + { + __riscv_vse32_v_i32m1(outptr, __riscv_vget_v_i32m4_i32m1(_sum, 0), vl); + outptr += packn; + } + + if (out_elempack == 1) + { + __riscv_vsse32_v_i32m4(outptr, M * sizeof(int), _sum, vl); + outptr += 1; + } + } + } + + remain_outch_start += nn_outch * packn; +#endif // __riscv_vector + #pragma omp parallel for num_threads(opt.num_threads) + for (int p = remain_outch_start; p < outch; p++) + { + int* outptr = top_blob.channel(p); + + int ij = 0; + for (; ij + 1 < outw * outh; ij += 2) + { + const int i0 = ij / outw; + const int j0 = ij % outw; + const int i1 = (ij + 1) / outw; + const int j1 = (ij + 1) % outw; + + int sum0 = 0; + int sum1 = 0; +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn + p % packn); +#else + const signed char* kptr = weight_data_tm.channel(p); +#endif + int q = 0; +#if __riscv_vector + vint32m4_t _sum0 = __riscv_vmv_v_x_i32m4(0, vlm4); + vint32m4_t _sum1 = __riscv_vmv_v_x_i32m4(0, vlm4); + + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q / elempack).row(i1 * stride_h) + j1 * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + vint8m1_t _r0 = __riscv_vlse8_v_i8m1(r0s, N, vlm4); + vint8m1_t _r1 = __riscv_vlse8_v_i8m1(r1s, N, vlm4); + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s0 = __riscv_vwmul_vv_i16m2(_w, _r0, vlm4); + vint16m2_t _s1 = __riscv_vwmul_vv_i16m2(_w, _r1, vlm4); + _sum0 = __riscv_vwadd_wv_i32m4(_sum0, _s0, vlm4); + _sum1 = __riscv_vwadd_wv_i32m4(_sum1, _s1, vlm4); + + kptr += pack4n; + } + } + + vint32m1_t _sum00 = __riscv_vmv_v_x_i32m1(0, vlm1); + _sum00 = __riscv_vredsum_vs_i32m4_i32m1(_sum0, _sum00, vlm4); + sum0 += __riscv_vmv_x_s_i32m1_i32(_sum00); + + vint32m1_t _sum11 = __riscv_vmv_v_x_i32m1(0, vlm1); + _sum11 = __riscv_vredsum_vs_i32m4_i32m1(_sum1, _sum11, vlm4); + sum1 += __riscv_vmv_x_s_i32m1_i32(_sum11); +#endif + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i0 * stride_h) + j0 * stride_w * elempack; + const signed char* r1 = bottom_blob.channel(q).row(i1 * stride_h) + j1 * stride_w * elempack; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + const signed char* r1s = r1 + space_ofs[k]; + sum0 += r0s[0] * kptr[0]; + sum1 += r1s[0] * kptr[0]; + kptr++; + } + } + outptr[0] = sum0; + outptr[1] = sum1; + outptr += 2; + } + + for (; ij < outw * outh; ij++) + { + const int i = ij / outw; + const int j = ij % outw; + + int sum = 0; +#if __riscv_vector + const signed char* kptr = weight_data_tm.channel(p / pack4n + (p % pack4n) / packn + p % packn); +#else + const signed char* kptr = weight_data_tm.channel(p); +#endif + int q = 0; +#if __riscv_vector + vint32m4_t _sum = __riscv_vmv_v_x_i32m4(0, vlm4); + + for (; q + pack4n - 1 < inch; q += pack4n) + { + const signed char* r0 = bottom_blob.channel(q / elempack).row(i * stride_h) + j * stride_w * elempack; + + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + vint8m1_t _r = __riscv_vlse8_v_i8m1(r0s, N, vlm4); + vint8m1_t _w = __riscv_vle8_v_i8m1(kptr, vlm4); + vint16m2_t _s = __riscv_vwmul_vv_i16m2(_w, _r, vlm4); + _sum = __riscv_vwadd_wv_i32m4(_sum, _s, vlm4); + kptr += pack4n; + } + } + + vint32m1_t _sum0 = __riscv_vmv_v_x_i32m1(0, vlm1); + _sum0 = __riscv_vredsum_vs_i32m4_i32m1(_sum, _sum0, vlm4); + sum += __riscv_vmv_x_s_i32m1_i32(_sum0); +#endif + for (; q < inch; q++) + { + const signed char* r0 = bottom_blob.channel(q).row(i * stride_h) + j * stride_w; + for (int k = 0; k < maxk; k++) + { + const signed char* r0s = r0 + space_ofs[k]; + sum += r0s[0] * kptr[0]; + kptr++; + } + } + outptr[0] = sum; + outptr += 1; + } + } + + return; +} \ No newline at end of file diff --git a/src/layer/riscv/convolution_riscv.cpp b/src/layer/riscv/convolution_riscv.cpp index a45e8bc6223a..aacbad129ede 100644 --- a/src/layer/riscv/convolution_riscv.cpp +++ b/src/layer/riscv/convolution_riscv.cpp @@ -21,6 +21,10 @@ namespace ncnn { #include "convolution_3x3_winograd.h" #include "convolution_packed.h" +#if NCNN_INT8 +#include "convolution_packed_int8.h" +#endif // NCNN_INT8 + #if __riscv_vector #include "convolution_3x3_pack1ton.h" #include "convolution_7x7_pack1ton.h" @@ -54,8 +58,7 @@ int Convolution_riscv::create_pipeline(const Option& opt) #if NCNN_INT8 if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u) { - // TODO implement int8 - return 0; + return create_pipeline_int8_rvv(opt); } #endif @@ -201,31 +204,7 @@ int Convolution_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opti #if NCNN_INT8 if (opt.use_int8_inference && int8_scale_term) { - Mat bottom_blob_unpacked = bottom_blob; - if (bottom_blob.elempack != 1) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1); - if (bottom_blob_unpacked.empty()) - return -100; - } - - Mat bottom_blob_unpacked_fp32 = bottom_blob_unpacked; - if (bottom_blob_unpacked.elembits() == 16) - { - Option opt_pack1 = opt; - opt_pack1.blob_allocator = opt.workspace_allocator; - - cast_float16_to_float32(bottom_blob_unpacked, bottom_blob_unpacked_fp32, opt_pack1); - if (bottom_blob_unpacked_fp32.empty()) - return -100; - } - - Option opt_unpacked = opt; - opt_unpacked.use_packing_layout = false; - return Convolution::forward_int8(bottom_blob_unpacked_fp32, top_blob, opt_unpacked); + return forward_int8_rvv(bottom_blob, top_blob, opt); } #endif @@ -545,4 +524,123 @@ int Convolution_riscv::forward(const std::vector& bottom_blobs, std::vector return 0; } +#if NCNN_INT8 +int Convolution_riscv::create_pipeline_int8_rvv(const Option& opt) +{ + const int maxk = kernel_w * kernel_h; + const int num_input = weight_data_size / maxk / num_output; + + // TODO: implement kernel transform for winograd, sgemm, etc + convolution_transform_kernel_packed_int8_rvv(weight_data, weight_data_tm, num_input, num_output, kernel_w, kernel_h); + + scale_in_data.create(num_output); + for (int p = 0; p < num_output; p++) + { + // requantize and relu + float scale_in; + if (weight_data_int8_scales[p] == 0) + scale_in = 0; + else + scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]); + + scale_in_data[p] = scale_in; + } + + if (opt.lightmode) + weight_data.release(); + + return 0; +} + +int Convolution_riscv::forward_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const +{ +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const int packn_s8 = csrr_vlenb(); +#endif // __riscv_vector + + int elembits = bottom_blob.elembits(); + + Mat bottom_blob_int8 = bottom_blob; + if (elembits != 8) + { + Option opt_q = opt; + opt_q.blob_allocator = opt.workspace_allocator; + quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q); + if (bottom_blob_int8.empty()) + return -100; + } + + Mat bottom_blob_bordered; + make_padding(bottom_blob_int8, bottom_blob_bordered, opt); + if (bottom_blob_bordered.empty()) + return -100; + + int w = bottom_blob_bordered.w; + int h = bottom_blob_bordered.h; + int channels = bottom_blob.c; + size_t elemsize = bottom_blob.elemsize; + int elempack = bottom_blob_bordered.elempack; + + const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1; + const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1; + + int outw = (w - kernel_extent_w) / stride_w + 1; + int outh = (h - kernel_extent_h) / stride_h + 1; + + bool use_int8_requantize = int8_scale_term > 100; + int out_elempack = 1; + int out_elempack_int32 = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + if (use_int8_requantize) + { + out_elempack = num_output % packn_s8 == 0 ? packn_s8 : 1; + } + else + { + out_elempack = num_output % packn == 0 ? packn : 1; + } + out_elempack_int32 = num_output % packn == 0 ? packn : 1; + } +#endif // __riscv_vector + size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack; +#if NCNN_ZFH + if (support_fp16_storage && opt.use_fp16_storage) + { + out_elemsize = use_int8_requantize ? 1u * out_elempack : 2u * out_elempack; + } +#endif // NCNN_ZFH + + Mat top_blob_int32; + top_blob_int32.create(outw, outh, num_output / out_elempack_int32, (size_t)(4u * out_elempack_int32), out_elempack_int32, opt.workspace_allocator); + if (top_blob_int32.empty()) + return -100; + + // TODO: Implement winograd, sgemm, etc + convolution_packed_int8_rvv(bottom_blob_bordered, top_blob_int32, weight_data_tm, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt); + bottom_blob_bordered.release(); + + top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator); + if (top_blob.empty()) + return -100; + + if (use_int8_requantize) + { + requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt); + } + else + { + dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt); + + if (activation) + { + activation->forward_inplace(top_blob, opt); + } + } + return 0; +} +#endif // NCNN_INT8 + } // namespace ncnn diff --git a/src/layer/riscv/convolution_riscv.h b/src/layer/riscv/convolution_riscv.h index 2d8215f86159..1664d63df0e6 100644 --- a/src/layer/riscv/convolution_riscv.h +++ b/src/layer/riscv/convolution_riscv.h @@ -26,6 +26,10 @@ class Convolution_riscv : public Convolution int forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; int forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; #endif +#if NCNN_INT8 + int create_pipeline_int8_rvv(const Option& opt); + int forward_int8_rvv(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const; +#endif public: Layer* activation; @@ -40,6 +44,10 @@ class Convolution_riscv : public Convolution // fp16 Mat bias_data_fp16; + +#if NCNN_INT8 + Mat scale_in_data; +#endif }; } // namespace ncnn diff --git a/src/layer/riscv/dequantize_riscv_zfh.cpp b/src/layer/riscv/dequantize_riscv_zfh.cpp index 29c3de64b4e8..dd6cdaaeed95 100644 --- a/src/layer/riscv/dequantize_riscv_zfh.cpp +++ b/src/layer/riscv/dequantize_riscv_zfh.cpp @@ -12,6 +12,210 @@ namespace ncnn { #if NCNN_ZFH +#if __riscv_vector +static void dequantize_packnton_f16_fp16s(const int* ptr0, const int* ptr1, __fp16* f16ptr, const Mat& scale_data, const Mat& bias_data, int elemcount) +{ + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + const size_t vlm2 = __riscv_vsetvlmax_e32m2(); + const size_t vlm4 = __riscv_vsetvlmax_e32m4(); + const size_t vlm8 = __riscv_vsetvlmax_e32m8(); + + float scale = scale_data[0]; + vfloat32m2_t _scale0 = __riscv_vfmv_v_f_f32m2(scale, vlm2); + if (scale_data.w > 1) + { + _scale0 = __riscv_vle32_v_f32m2(scale_data, vlm2); + } + vfloat32m4_t _scale1 = __riscv_vcreate_v_f32m2_f32m4(_scale0, _scale0); + vfloat32m8_t _scale2 = __riscv_vcreate_v_f32m4_f32m8(_scale1, _scale1); + + int i = 0; + if (bias_data.w == 0) + { + for (; i + 3 < elemcount; i += 4) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 2, vlm1); + vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 2, vlm1); + vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 3, vlm1); + vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 3, vlm1); + vint32m8_t _v = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vlm8); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale2, vlm8); + __riscv_vse16_v_f16m4(f16ptr, __riscv_vfncvt_f_f_w_f16m4(_vf, vlm8), vlm8); + + ptr0 += vlm1 * 4; + ptr1 += vlm1 * 4; + f16ptr += vlm8; + } + + for (; i + 1 < elemcount; i += 2) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m4_t _v = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3); + vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_v, vlm4); + _vf = __riscv_vfmul_vv_f32m4(_vf, _scale1, vlm4); + __riscv_vse16_v_f16m2(f16ptr, __riscv_vfncvt_f_f_w_f16m2(_vf, vlm4), vlm4); + + ptr0 += vlm1 * 2; + ptr1 += vlm1 * 2; + f16ptr += vlm4; + } + + for (; i < elemcount; i++) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m2_t _v = __riscv_vcreate_v_i32m1_i32m2(_v0, _v1); + vfloat32m2_t _vf = __riscv_vfcvt_f_x_v_f32m2(_v, vlm2); + _vf = __riscv_vfmul_vv_f32m2(_vf, _scale0, vlm2); + __riscv_vse16_v_f16m1(f16ptr, __riscv_vfncvt_f_f_w_f16m1(_vf, vlm2), vlm2); + + ptr0 += vlm1; + ptr1 += vlm1; + f16ptr += vlm2; + } + } + else + { + float bias = bias_data[0]; + vfloat32m2_t _bias0 = __riscv_vfmv_v_f_f32m2(bias, vlm2); + if (bias_data.w > 1) + { + _bias0 = __riscv_vle32_v_f32m2(bias_data, vlm2); + } + vfloat32m4_t _bias1 = __riscv_vcreate_v_f32m2_f32m4(_bias0, _bias0); + vfloat32m8_t _bias2 = __riscv_vcreate_v_f32m4_f32m8(_bias1, _bias1); + + for (; i + 3 < elemcount; i += 4) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 2, vlm1); + vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 2, vlm1); + vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr0 + vlm1 * 3, vlm1); + vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr1 + vlm1 * 3, vlm1); + vint32m8_t _v = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vlm8); + _vf = __riscv_vfmacc_vv_f32m8(_bias2, _vf, _scale2, vlm8); + __riscv_vse16_v_f16m4(f16ptr, __riscv_vfncvt_f_f_w_f16m4(_vf, vlm8), vlm8); + + ptr0 += vlm1 * 4; + ptr1 += vlm1 * 4; + f16ptr += vlm8; + } + + for (; i + 1 < elemcount; i += 2) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m4_t _v = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3); + vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_v, vlm4); + _vf = __riscv_vfmacc_vv_f32m4(_bias1, _vf, _scale1, vlm4); + __riscv_vse16_v_f16m2(f16ptr, __riscv_vfncvt_f_f_w_f16m2(_vf, vlm4), vlm4); + + ptr0 += vlm1 * 2; + ptr1 += vlm1 * 2; + f16ptr += vlm4; + } + + for (; i < elemcount; i++) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m2_t _v = __riscv_vcreate_v_i32m1_i32m2(_v0, _v1); + vfloat32m2_t _vf = __riscv_vfcvt_f_x_v_f32m2(_v, vlm2); + _vf = __riscv_vfmacc_vv_f32m2(_bias0, _vf, _scale0, vlm2); + __riscv_vse16_v_f16m1(f16ptr, __riscv_vfncvt_f_f_w_f16m1(_vf, vlm2), vlm2); + + ptr0 += vlm1; + ptr1 += vlm1; + f16ptr += vlm2; + } + } +} + +static void dequantize_packnto1_fp16s(const int* intptr, __fp16* f16ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int stride) +{ + const size_t vlm8 = __riscv_vsetvlmax_e32m8(); + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + + float scale = scale_data[0]; + vfloat32m8_t _scale = __riscv_vfmv_v_f_f32m8(scale, vlm8); + if (scale_data.w > 1) + { + vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_data, vlm1); + _scale = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s); + } + + __fp16 tmp[vlm8]; + int n = elemcount * vlm1; + if (bias_data.w == 0) + { + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vint32m8_t _v = __riscv_vle32_v_i32m8(intptr, vl); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vl); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale, vl); + __riscv_vse16_v_f16m4(tmp, __riscv_vfncvt_f_f_w_f16m4(_vf, vl), vl); + for (size_t j = 0; j < (vl / vlm1); j++) + { + for (int i = 0; i < vlm1; i++) + { + f16ptr[i * stride] = tmp[j * vlm1 + i]; + } + f16ptr++; + } + + intptr += vl; + n -= vl; + } + } + else + { + float bias = bias_data[0]; + vfloat32m8_t _bias = __riscv_vfmv_v_f_f32m8(bias, vlm8); + if (bias_data.w > 1) + { + vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1); + _bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b); + } + + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vint32m8_t _v = __riscv_vle32_v_i32m8(intptr, vl); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_v, vl); + _vf = __riscv_vfmacc_vv_f32m8(_bias, _vf, _scale, vl); + __riscv_vse16_v_f16m4(tmp, __riscv_vfncvt_f_f_w_f16m4(_vf, vl), vl); + + for (size_t j = 0; j < (vl / vlm1); j++) + { + for (int i = 0; i < vlm1; i++) + { + f16ptr[i * stride] = tmp[j * vlm1 + i]; + } + f16ptr++; + } + + intptr += vl; + n -= vl; + } + } +} +#endif // __riscv_vector + static void dequantize_fp16s(const int* intptr, __fp16* ptr, const Mat& scale_data, const Mat& bias_data, int elemcount, int elempack) { const int size = elemcount * elempack; @@ -93,11 +297,24 @@ int Dequantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const const int d = bottom_blob.d; const int channels = bottom_blob.c; const int elempack = bottom_blob.elempack; - const size_t out_elemsize = elempack * 2u; +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const int packn_f16 = csrr_vlenb() / 2; +#endif // __riscv_vector if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = w * elempack % packn_f16 == 0 ? packn_f16 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + size_t out_elemsize = out_elempack * 2u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -118,40 +335,127 @@ int Dequantize_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h * elempack % packn_f16 == 0 ? packn_f16 : 1; + } +#endif + const int outh = h * elempack / out_elempack; + size_t out_elemsize = out_elempack * 2u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_vector + if (elempack == packn && out_elempack == packn_f16) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* ptr0 = bottom_blob.row(i * 2); + const int* ptr1 = bottom_blob.row(i * 2 + 1); + __fp16* f16ptr = top_blob.row<__fp16>(i); - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * out_elempack, out_elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data; + + dequantize_packnton_f16_fp16s(ptr0, ptr1, f16ptr, scale_data_i, bias_data_i, w); + } + } + + if (elempack == packn && out_elempack == 1) { - const int* intptr = bottom_blob.row(i); - __fp16* ptr = top_blob.row<__fp16>(i); - const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; - const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + __fp16* f16ptr = top_blob.row<__fp16>(i * packn); + + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack); + dequantize_packnto1_fp16s(intptr, f16ptr, scale_data_i, bias_data_i, w, w); + } + } +#endif // __riscv_vector + if (elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + __fp16* ptr = top_blob.row<__fp16>(i); + const Mat scale_data_i = scale_data_size > 1 ? scale_data.range(i * elempack, elempack) : scale_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + + dequantize_fp16s(intptr, ptr, scale_data_i, bias_data_i, w, elempack); + } } } if (dims == 3 || dims == 4) { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % packn_f16 == 0 ? packn_f16 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + size_t out_elemsize = out_elempack * 2u; + if (dims == 3) - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); else - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; +#if __riscv_vector + if (elempack == packn && out_elempack == packn_f16) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < outc; q++) + { + const int* ptr0 = bottom_blob.channel(q * 2); + const int* ptr1 = bottom_blob.channel(q * 2 + 1); + __fp16* f16ptr = top_blob.channel(q); - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * out_elempack, out_elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data; + + dequantize_packnton_f16_fp16s(ptr0, ptr1, f16ptr, scale_data_q, bias_data_q, w * h * d); + } + } + + if (elempack == packn && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + __fp16* f16ptr = top_blob.channel(q * packn); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + + dequantize_packnto1_fp16s(intptr, f16ptr, scale_data_q, bias_data_q, w * h * d, top_blob.cstep); + } + } +#endif // __riscv_vector + if (elempack == 1 && out_elempack == 1) { - const int* intptr = bottom_blob.channel(q); - __fp16* ptr = top_blob.channel(q); - const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; - const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + __fp16* ptr = top_blob.channel(q); + const Mat scale_data_q = scale_data_size > 1 ? scale_data.range(q * elempack, elempack) : scale_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h * d, elempack); + dequantize_fp16s(intptr, ptr, scale_data_q, bias_data_q, w * h * d, elempack); + } } } diff --git a/src/layer/riscv/requantize_riscv.cpp b/src/layer/riscv/requantize_riscv.cpp index eaee285864c8..aaed417f9db7 100644 --- a/src/layer/riscv/requantize_riscv.cpp +++ b/src/layer/riscv/requantize_riscv.cpp @@ -19,6 +19,217 @@ Requantize_riscv::Requantize_riscv() #endif // __riscv_vector } +#if __riscv_vector +static void requantize_packnton_s8(const int* ptr0, const int* ptr1, const int* ptr2, const int* ptr3, signed char* s8ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount) +{ + const size_t vlm8 = __riscv_vsetvlmax_e32m8(); + const size_t vlm4 = __riscv_vsetvlmax_e32m4(); + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + + float scale_in = scale_in_data[0]; + float scale_out = scale_out_data[0]; + + vfloat32m4_t _scale_in0 = __riscv_vfmv_v_f_f32m4(scale_in, vlm4); + if (scale_in_data.w > 1) + { + _scale_in0 = __riscv_vle32_v_f32m4(scale_in_data, vlm4); + } + + vfloat32m4_t _scale_out0 = __riscv_vfmv_v_f_f32m4(scale_out, vlm4); + if (scale_out_data.w > 1) + { + _scale_out0 = __riscv_vle32_v_f32m4(scale_out_data, vlm4); + } + + vfloat32m8_t _scale_in = __riscv_vcreate_v_f32m4_f32m8(_scale_in0, _scale_in0); + vfloat32m8_t _scale_out = __riscv_vcreate_v_f32m4_f32m8(_scale_out0, _scale_out0); + + if (bias_data.w == 0) + { + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1); + vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr2 + vlm1, vlm1); + vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr3 + vlm1, vlm1); + vint32m8_t _vi = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_vi, vlm8); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_in, vlm8); + _vf = activation_ps(_vf, activation_type, activation_params, vlm8); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vlm8); + __riscv_vse8_v_i8m2(s8ptr, float2int8(_vf, vlm8), vlm8); + + ptr0 += vlm1 * 2; + ptr1 += vlm1 * 2; + ptr2 += vlm1 * 2; + ptr3 += vlm1 * 2; + s8ptr += vlm8; + } + + for (; i < elemcount; i++) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1); + vint32m4_t _vi = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3); + vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_vi, vlm4); + _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_in0, vlm4); + _vf = activation_ps(_vf, activation_type, activation_params, vlm4); + _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_out0, vlm4); + __riscv_vse8_v_i8m1(s8ptr, float2int8(_vf, vlm4), vlm4); + + ptr0 += vlm1; + ptr1 += vlm1; + ptr2 += vlm1; + ptr3 += vlm1; + s8ptr += vlm4; + } + } + else + { + float bias = bias_data[0]; + vfloat32m4_t _bias0 = __riscv_vfmv_v_f_f32m4(bias, vlm4); + if (bias_data.w > 1) + { + _bias0 = __riscv_vle32_v_f32m4(bias_data, vlm4); + } + vfloat32m8_t _bias = __riscv_vcreate_v_f32m4_f32m8(_bias0, _bias0); + + int i = 0; + for (; i + 1 < elemcount; i += 2) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1); + vint32m1_t _v4 = __riscv_vle32_v_i32m1(ptr0 + vlm1, vlm1); + vint32m1_t _v5 = __riscv_vle32_v_i32m1(ptr1 + vlm1, vlm1); + vint32m1_t _v6 = __riscv_vle32_v_i32m1(ptr2 + vlm1, vlm1); + vint32m1_t _v7 = __riscv_vle32_v_i32m1(ptr3 + vlm1, vlm1); + vint32m8_t _vi = __riscv_vcreate_v_i32m1_i32m8(_v0, _v1, _v2, _v3, _v4, _v5, _v6, _v7); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(_vi, vlm8); + _vf = __riscv_vfmadd_vv_f32m8(_vf, _scale_in, _bias, vlm8); + _vf = activation_ps(_vf, activation_type, activation_params, vlm8); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vlm8); + __riscv_vse8_v_i8m2(s8ptr, float2int8(_vf, vlm8), vlm8); + + ptr0 += vlm1 * 2; + ptr1 += vlm1 * 2; + ptr2 += vlm1 * 2; + ptr3 += vlm1 * 2; + s8ptr += vlm8; + } + + for (; i < elemcount; i++) + { + vint32m1_t _v0 = __riscv_vle32_v_i32m1(ptr0, vlm1); + vint32m1_t _v1 = __riscv_vle32_v_i32m1(ptr1, vlm1); + vint32m1_t _v2 = __riscv_vle32_v_i32m1(ptr2, vlm1); + vint32m1_t _v3 = __riscv_vle32_v_i32m1(ptr3, vlm1); + vint32m4_t _vi = __riscv_vcreate_v_i32m1_i32m4(_v0, _v1, _v2, _v3); + vfloat32m4_t _vf = __riscv_vfcvt_f_x_v_f32m4(_vi, vlm4); + _vf = __riscv_vfmadd_vv_f32m4(_vf, _scale_in0, _bias0, vlm4); + _vf = activation_ps(_vf, activation_type, activation_params, vlm4); + _vf = __riscv_vfmul_vv_f32m4(_vf, _scale_out0, vlm4); + __riscv_vse8_v_i8m1(s8ptr, float2int8(_vf, vlm4), vlm4); + + ptr0 += vlm1; + ptr1 += vlm1; + ptr2 += vlm1; + ptr3 += vlm1; + s8ptr += vlm4; + } + } +} + +static void requantize_packnto1(const int* ptr, signed char* s8ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int activation_type, const Mat& activation_params, int elemcount, int stride) +{ + const size_t vlm8 = __riscv_vsetvlmax_e32m8(); + const size_t vlm1 = __riscv_vsetvlmax_e32m1(); + + float scale_in = scale_in_data[0]; + float scale_out = scale_out_data[0]; + + vfloat32m8_t _scale_in = __riscv_vfmv_v_f_f32m8(scale_in, vlm8); + if (scale_in_data.w > 1) + { + vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_in_data, vlm1); + _scale_in = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s); + } + + vfloat32m8_t _scale_out = __riscv_vfmv_v_f_f32m8(scale_out, vlm8); + if (scale_out_data.w > 1) + { + vfloat32m1_t _s = __riscv_vle32_v_f32m1(scale_out_data, vlm1); + _scale_out = __riscv_vcreate_v_f32m1_f32m8(_s, _s, _s, _s, _s, _s, _s, _s); + } + + signed char tmp[vlm8]; + int n = elemcount * vlm1; + + if (bias_data.w == 0) + { + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(ptr, vl), vl); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_in, vl); + _vf = activation_ps(_vf, activation_type, activation_params, vl); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vl); + __riscv_vse8_v_i8m2(tmp, float2int8(_vf, vl), vl); + for (size_t j = 0; j < (vl / vlm1); j++) + { + for (int i = 0; i < vlm1; i++) + { + s8ptr[i * stride] = tmp[j * vlm1 + i]; + } + s8ptr++; + } + + ptr += vl; + n -= vl; + } + } + else + { + float bias = bias_data[0]; + vfloat32m8_t _bias = __riscv_vfmv_v_f_f32m8(bias, vlm8); + if (bias_data.w > 1) + { + vfloat32m1_t _b = __riscv_vle32_v_f32m1(bias_data, vlm1); + _bias = __riscv_vcreate_v_f32m1_f32m8(_b, _b, _b, _b, _b, _b, _b, _b); + } + + while (n > 0) + { + size_t vl = __riscv_vsetvl_e32m8(n); + vfloat32m8_t _vf = __riscv_vfcvt_f_x_v_f32m8(__riscv_vle32_v_i32m8(ptr, vl), vl); + _vf = __riscv_vfmadd_vv_f32m8(_vf, _scale_in, _bias, vl); + _vf = activation_ps(_vf, activation_type, activation_params, vl); + _vf = __riscv_vfmul_vv_f32m8(_vf, _scale_out, vl); + __riscv_vse8_v_i8m2(tmp, float2int8(_vf, vl), vl); + for (size_t j = 0; j < (vl / vlm1); j++) + { + for (int i = 0; i < vlm1; i++) + { + s8ptr[i * stride] = tmp[j * vlm1 + i]; + } + s8ptr++; + } + + ptr += vl; + n -= vl; + } + } +} +#endif // __riscv_vector + static void requantize_relu(const int* intptr, signed char* ptr, const Mat& scale_in_data, const Mat& bias_data, const Mat& scale_out_data, int elemcount, int elempack) { const int bias_data_size = bias_data.w; @@ -358,9 +569,24 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio const int elempack = bottom_blob.elempack; const size_t out_elemsize = elempack * 1u; +#if __riscv_vector + const int packn = csrr_vlenb() / 4; + const int packn_s8 = csrr_vlenb(); +#endif + if (dims == 1) { - top_blob.create(w, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = w * elempack % packn_s8 == 0 ? packn_s8 : 1; + } +#endif + const int outw = w * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(outw, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; @@ -382,44 +608,140 @@ int Requantize_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Optio if (dims == 2) { - top_blob.create(w, h, out_elemsize, elempack, opt.blob_allocator); + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = h * elempack % packn_s8 == 0 ? packn_s8 : 1; + } +#endif // __riscv_vector + const int outh = h * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + + top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - - #pragma omp parallel for num_threads(opt.num_threads) - for (int i = 0; i < h; i++) +#if __riscv_vector + if (elempack == packn && out_elempack == packn_s8) { - const int* intptr = bottom_blob.row(i); - signed char* ptr = top_blob.row(i); - - const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; - const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; - const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < outh; i++) + { + const int* ptr0 = bottom_blob.row(i * 4); + const int* ptr1 = bottom_blob.row(i * 4 + 1); + const int* ptr2 = bottom_blob.row(i * 4 + 2); + const int* ptr3 = bottom_blob.row(i * 4 + 3); + signed char* s8ptr = top_blob.row(i); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * out_elempack, out_elempack) : scale_out_data; + + requantize_packnton_s8(ptr0, ptr1, ptr2, ptr3, s8ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w); + } + } - requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + if (elempack == packn && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* ptr = bottom_blob.row(i); + signed char* s8ptr = top_blob.row(i * packn); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + + requantize_packnto1(ptr, s8ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, w); + } + } +#endif // __riscv_vector + if (elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int i = 0; i < h; i++) + { + const int* intptr = bottom_blob.row(i); + signed char* ptr = top_blob.row(i); + + const Mat scale_in_data_i = scale_in_data_size > 1 ? scale_in_data.range(i * elempack, elempack) : scale_in_data; + const Mat bias_data_i = bias_data_size > 1 ? bias_data.range(i * elempack, elempack) : bias_data; + const Mat scale_out_data_i = scale_out_data_size > 1 ? scale_out_data.range(i * elempack, elempack) : scale_out_data; + + requantize(intptr, ptr, scale_in_data_i, bias_data_i, scale_out_data_i, activation_type, activation_params, w, elempack); + } } } if (dims == 3 || dims == 4) { + int out_elempack = 1; +#if __riscv_vector + if (opt.use_packing_layout) + { + out_elempack = channels * elempack % packn_s8 == 0 ? packn_s8 : 1; + } +#endif + const int outc = channels * elempack / out_elempack; + const size_t out_elemsize = out_elempack * 1u; + if (dims == 3) - top_blob.create(w, h, channels, out_elemsize, elempack, opt.blob_allocator); + top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator); else - top_blob.create(w, h, d, channels, out_elemsize, elempack, opt.blob_allocator); + top_blob.create(w, h, d, outc, out_elemsize, out_elempack, opt.blob_allocator); if (top_blob.empty()) return -100; - #pragma omp parallel for num_threads(opt.num_threads) - for (int q = 0; q < channels; q++) +#if __riscv_vector + if (elempack == packn && out_elempack == packn_s8) { - const int* intptr = bottom_blob.channel(q); - signed char* ptr = top_blob.channel(q); - - const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; - const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; - const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + for (int q = 0; q < outc; q++) + { + const int* ptr0 = bottom_blob.channel(q * 4); + const int* ptr1 = bottom_blob.channel(q * 4 + 1); + const int* ptr2 = bottom_blob.channel(q * 4 + 2); + const int* ptr3 = bottom_blob.channel(q * 4 + 3); + signed char* s8ptr = top_blob.channel(q); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * out_elempack, out_elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * out_elempack, out_elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * out_elempack, out_elempack) : scale_out_data; + + requantize_packnton_s8(ptr0, ptr1, ptr2, ptr3, s8ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d); + } + } - requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, elempack); + if (elempack == packn && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* ptr = bottom_blob.channel(q); + signed char* s8ptr = top_blob.channel(q * packn); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + + requantize_packnto1(ptr, s8ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, top_blob.cstep); + } + } +#endif // __riscv_vector + if (elempack == 1 && out_elempack == 1) + { + #pragma omp parallel for num_threads(opt.num_threads) + for (int q = 0; q < channels; q++) + { + const int* intptr = bottom_blob.channel(q); + signed char* ptr = top_blob.channel(q); + + const Mat scale_in_data_q = scale_in_data_size > 1 ? scale_in_data.range(q * elempack, elempack) : scale_in_data; + const Mat bias_data_q = bias_data_size > 1 ? bias_data.range(q * elempack, elempack) : bias_data; + const Mat scale_out_data_q = scale_out_data_size > 1 ? scale_out_data.range(q * elempack, elempack) : scale_out_data; + + requantize(intptr, ptr, scale_in_data_q, bias_data_q, scale_out_data_q, activation_type, activation_params, w * h * d, elempack); + } } } diff --git a/tests/test_requantize.cpp b/tests/test_requantize.cpp index cf579e2c968b..c1d558ec3f80 100644 --- a/tests/test_requantize.cpp +++ b/tests/test_requantize.cpp @@ -294,12 +294,17 @@ static int test_requantize_3() static int test_requantize_4() { return 0 +#ifndef __riscv + || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 1, 1, 24) + || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 24, 24, 0) +#else + || test_requantize(RandomIntMat(5, 3, 2, 24), 1, 1, 24) + || test_requantize(RandomIntMat(5, 3, 2, 24), 24, 24, 0) +#endif || test_requantize_pack1(RandomIntMat(5, 3, 2, 12), 1, 1, 12) || test_requantize_pack1(RandomIntMat(5, 3, 2, 12), 12, 12, 0) || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 1, 13, 13) - || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 13, 1, 0) - || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 1, 1, 24) - || test_requantize_pack8(RandomIntMat(5, 3, 2, 24), 24, 24, 0); + || test_requantize_pack1(RandomIntMat(3, 5, 3, 13), 13, 1, 0); } int main()