lavc/h264dsp: unroll R-V V weight16

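As VLSE128.V does not exist, several 16-byte rows cannot be fetched with a single strided load; unrolling the loop to process two rows per iteration is the only other way we have to deal with latency.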

T-Head C908:
h264_weight16_8_c:                                     989.4 ( 1.00x)
h264_weight16_8_rvv_i32:                               193.2 ( 5.12x)

SpacemiT X60:
h264_weight16_8_c:                                     874.1 ( 1.00x)
h264_weight16_8_rvv_i32:                               196.9 ( 4.44x)
This commit is contained in:
Rémi Denis-Courmont 2024-09-01 16:31:13 +03:00
parent 4936bb2508
commit 459a1512f1
2 changed files with 16 additions and 7 deletions

View file

@ -97,7 +97,7 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
const bool zvl128b = ff_rv_vlen_least(128);
if (bit_depth == 8) {
if (zvl128b)
if (zvl128b && (flags & AV_CPU_FLAG_RVB))
dsp->weight_h264_pixels_tab[0] =
ff_h264_weight_funcs_8_rvv[0].weight;
if (flags & AV_CPU_FLAG_RVV_I64)

View file

@ -29,20 +29,29 @@
#include "libavutil/riscv/asm.S"
.variant_cc ff_h264_weight_pixels_simple_8_rvv
func ff_h264_weight_pixels_simple_8_rvv, zve32x
func ff_h264_weight_pixels_simple_8_rvv, zve32x, b
csrwi vxrm, 0
sll a5, a5, a3
1:
vsetvli zero, t6, e16, m2, ta, ma
vle8.v v8, (a0)
addi a2, a2, -1
add t0, a0, a1
vle8.v v8, (a0)
addi a2, a2, -2
vle8.v v9, (t0)
vzext.vf2 v24, v8
vzext.vf2 v26, v9
vmul.vx v16, v24, a4
vmul.vx v18, v26, a4
vsadd.vx v16, v16, a5
vmax.vx v16, v16, zero
vsetvli zero, zero, e8, m1, ta, ma
vsadd.vx v18, v18, a5
vmax.vx v16, v16, zero
vmax.vx v18, v18, zero
vsetvli zero, zero, e8, m1, ta, ma
vnclipu.wx v8, v16, a3
vse8.v v8, (a0)
vnclipu.wx v9, v18, a3
vse8.v v8, (a0)
vse8.v v9, (t0)
sh1add a0, a1, a0
add a0, a0, a1
bnez a2, 1b