lavc/vp8dsp: add R-V V vp7_idct_dc_add4y

As with idct_dc_add, most of the code is shared with, and replaces, the
previous VP8 function. To improve performance, we break the 16x4 matrix
down into 4 rows of 16 elements rather than into four 4x4 squares. This
avoids strided loads and stores, and lets the 4 DC calculations be
vectorised. Unfortunately this requires a vector gather to splat the DC
values, but overall this is still a performance win:
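
For illustration, a rough scalar sketch of this row-wise layout (not part
of the patch; the helper name and the dc[] parameter are invented here,
and clipping uses FFmpeg's av_clip_uint8()):

#include <stddef.h>
#include <stdint.h>
#include "libavutil/common.h"   /* av_clip_uint8() */

/* Hypothetical scalar model of the shared add4y tail: dc[0..3] hold the
 * four per-block DC values. Each 16-pixel row spans all four 4x4 blocks,
 * so pixel x takes dc[x >> 2]; hence the vector gather that splats each
 * DC four times in the RVV version. */
static void idct_dc_add4y_rows(uint8_t *dst, int16_t block[4][16],
                               ptrdiff_t stride, const int16_t dc[4])
{
    for (int y = 0; y < 4; y++) {
        for (int x = 0; x < 16; x++)
            dst[x] = av_clip_uint8(dst[x] + dc[x >> 2]);
        dst += stride;
    }
    for (int j = 0; j < 4; j++)
        block[j][0] = 0;   /* the DC coefficients are consumed */
}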

T-Head C908:
vp7_idct_dc_add4y_c:       7.2
vp7_idct_dc_add4y_rvv_i32: 2.2
vp8_idct_dc_add4y_c:       6.2
vp8_idct_dc_add4y_rvv_i32: 2.2 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

SpacemiT X60:
vp7_idct_dc_add4y_c:       6.2
vp7_idct_dc_add4y_rvv_i32: 2.0
vp8_idct_dc_add4y_c:       5.5
vp8_idct_dc_add4y_rvv_i32: 2.5 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

I also tried to provision the DC values using indexed loads. It ends up
slower overall, especially for VP7, as we then have to compute 16 DCs
instead of just 4.
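
For reference, the per-block DC terms that the new code vectorises 4-wide
(a sketch restating the scalar formulas from the patch; the helper names
below are made up for illustration):

/* VP7 uses the heavier rounding formula; VP8 uses a simple (x + 4) >> 3.
 * With indexed per-pixel loads, each of the 16 lanes would need its own
 * DC, so the VP7 formula would run 16 times instead of 4. */
static inline int vp7_dc_value(int coef)
{
    return (23170 * (23170 * coef >> 14) + 0x20000) >> 18;
}

static inline int vp8_dc_value(int coef)
{
    return (coef + 4) >> 3;
}
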
Author: Rémi Denis-Courmont
Date:   2024-06-01 21:32:56 +03:00
Commit: 4e120fbbbd (parent 30797e4ff6)
3 changed files with 54 additions and 10 deletions

View file

@@ -28,6 +28,7 @@
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);

static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
                                   ptrdiff_t stride)
@@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
        c->vp8_idct_add = ff_vp7_idct_add_rvv;
#endif
        c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
        c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
    }
#endif
}

View file

@@ -127,3 +127,19 @@ func ff_vp7_idct_add_rvv, zve32x
        ret
endfunc
#endif

func ff_vp7_idct_dc_add4y_rvv, zve32x
        li          t0, 32
        vsetivli    zero, 4, e16, mf2, ta, ma
        li          t1, 23170
        vlse16.v    v8, (a1), t0       # block[0..3][0]
        vwmul.vx    v0, v8, t1         # 23170 * block[i][0], widened to 32 bits
        li          t2, 0x20000
        vsetvli     zero, zero, e32, m1, ta, ma
        vsra.vi     v0, v0, 14
        vmul.vx     v0, v0, t1         # 23170 * (23170 * block[i][0] >> 14)
        vadd.vx     v0, v0, t2         # ... + 0x20000
        vsetvli     zero, zero, e16, mf2, ta, ma
        vnsra.wi    v8, v0, 18         # (...) >> 18 = 4x DC
        tail        ff_vp78_idct_dc_add4y_rvv
endfunc

View file

@@ -105,6 +105,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
        # fall through
endfunc

# a3 = DC
func ff_vp78_idct_dc_add_rvv, zve32x
        csrwi       vxrm, 0
        vsetivli    zero, 4, e8, mf4, ta, ma
@@ -121,6 +122,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
        ret
endfunc

func ff_vp8_idct_dc_add4y_rvv, zve32x
        li          t0, 32
        vsetivli    zero, 4, e16, mf2, ta, ma
        vlse16.v    v8, (a1), t0       # block[0..3][0]
        vadd.vi     v8, v8, 4
        vsra.vi     v8, v8, 3          # dc = (block[i][0] + 4) >> 3
        # fall through
endfunc

        .variant_cc ff_vp78_idct_dc_add4y_rvv
# v8 = [dc0, dc1, dc2, dc3]
func ff_vp78_idct_dc_add4y_rvv, zve32x
        vsetivli    zero, 16, e16, m2, ta, ma
        vid.v       v4
        vsrl.vi     v4, v4, 2          # v4 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3]
        vrgather.vv v0, v8, v4         # replicate each DC four times
        vsetvli     zero, zero, e8, m1, ta, ma
        li          a4, 4              # 4 rows of 16 pixels
1:
        vle8.v      v8, (a0)           # load one row of dst
        addi        a4, a4, -1
        vwaddu.wv   v16, v0, v8        # dst + DC, widened to 16 bits
        sh          zero, (a1)         # clear block[i][0]
        vsetvli     zero, zero, e16, m2, ta, ma
        vmax.vx     v16, v16, zero     # clamp negative sums to 0
        addi        a1, a1, 32         # next block[]
        vsetvli     zero, zero, e8, m1, ta, ma
        vnclipu.wi  v8, v16, 0         # narrow with unsigned saturation (<= 255)
        vse8.v      v8, (a0)           # store the row back
        add         a0, a0, a2
        bnez        a4, 1b
        ret
endfunc

.macro vp8_idct_dc_add
        vlse32.v    v0, (a0), a2
        lh          a5, 0(a1)
@@ -143,16 +179,6 @@ endfunc
        addi        a1, a1, 32
.endm

func ff_vp8_idct_dc_add4y_rvv, zve32x
        vsetivli    zero, 4, e8, mf4, ta, ma
        .rept 3
        vp8_idct_dc_addy
        .endr
        vp8_idct_dc_add
        ret
endfunc

func ff_vp8_idct_dc_add4uv_rvv, zve32x
        vsetivli    zero, 4, e8, mf4, ta, ma
        vp8_idct_dc_addy