forked from FFmpeg/FFmpeg
lavc/vp8dsp: add R-V V vp7_idct_dc_add4y
As with idct_dc_add, most of the code is shared with, and replaces, the previous VP8 function. To improve performance, we break down the 16x4 matrix into 4 rows, rather than 4 squares. Thus strided loads and stores are avoided, and the 4 DC calculations are vectorised. Unfortunately this requires a vector gather to splat the DC values, but overall this is still a win for performance:

T-Head C908:
vp7_idct_dc_add4y_c: 7.2
vp7_idct_dc_add4y_rvv_i32: 2.2
vp8_idct_dc_add4y_c: 6.2
vp8_idct_dc_add4y_rvv_i32: 2.2 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

SpacemiT X60:
vp7_idct_dc_add4y_c: 6.2
vp7_idct_dc_add4y_rvv_i32: 2.0
vp8_idct_dc_add4y_c: 5.5
vp8_idct_dc_add4y_rvv_i32: 2.5 (before)
vp8_idct_dc_add4y_rvv_i32: 1.7

I also tried to provision the DC values using indexed loads. It ends up slower overall, especially for VP7, as we then have to compute 16 DCs instead of just 4.
This commit is contained in:
parent
30797e4ff6
commit
4e120fbbbd
3 changed files with 54 additions and 10 deletions
|
@ -28,6 +28,7 @@
|
|||
void ff_vp7_luma_dc_wht_rvv(int16_t block[4][4][16], int16_t dc[16]);
|
||||
void ff_vp7_idct_add_rvv(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
|
||||
void ff_vp78_idct_dc_add_rvv(uint8_t *, int16_t block[16], ptrdiff_t, int dc);
|
||||
void ff_vp7_idct_dc_add4y_rvv(uint8_t *dst, int16_t block[4][16], ptrdiff_t);
|
||||
|
||||
static void ff_vp7_idct_dc_add_rvv(uint8_t *dst, int16_t block[16],
|
||||
ptrdiff_t stride)
|
||||
|
@ -49,6 +50,7 @@ av_cold void ff_vp7dsp_init_riscv(VP8DSPContext *c)
|
|||
c->vp8_idct_add = ff_vp7_idct_add_rvv;
|
||||
#endif
|
||||
c->vp8_idct_dc_add = ff_vp7_idct_dc_add_rvv;
|
||||
c->vp8_idct_dc_add4y = ff_vp7_idct_dc_add4y_rvv;
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
|
|
@ -127,3 +127,19 @@ func ff_vp7_idct_add_rvv, zve32x
|
|||
ret
|
||||
endfunc
|
||||
#endif
|
||||
|
||||
# VP7 front end of the shared 4-block DC-add routine.
# Going by the C prototype (dst, block[4][16], stride): a0 = dst,
# a1 = block, a2 = stride. None of these registers is modified here.
# For each of block[0..3][0] this computes
#   dc = (23170 * ((23170 * coeff) >> 14) + 0x20000) >> 18
# and then tail-calls ff_vp78_idct_dc_add4y_rvv with the four results in v8.
func ff_vp7_idct_dc_add4y_rvv, zve32x
|
||||
li t0, 32 # byte stride between block[i][0] and block[i+1][0] (16 x int16_t)
|
||||
vsetivli zero, 4, e16, mf2, ta, ma # vl = 4, 16-bit elements
|
||||
li t1, 23170 # multiplier, applied twice below
|
||||
vlse16.v v8, (a1), t0 # block[0..3][0]
|
||||
vwmul.vx v0, v8, t1 # widening multiply: 32-bit products in v0
|
||||
li t2, 0x20000 # rounding bias, half of 1 << 18
|
||||
vsetvli zero, zero, e32, m1, ta, ma # work on the 32-bit products, vl unchanged
|
||||
vsra.vi v0, v0, 14 # (coeff * 23170) >> 14
|
||||
vmul.vx v0, v0, t1 # second multiplication by 23170
|
||||
vadd.vx v0, v0, t2 # add rounding bias
|
||||
vsetvli zero, zero, e16, mf2, ta, ma # back to 16-bit elements for the narrowing shift
|
||||
vnsra.wi v8, v0, 18 # 4x DC
|
||||
tail ff_vp78_idct_dc_add4y_rvv # shared code expects the four DCs in v8
|
||||
endfunc
|
||||
|
|
|
@ -105,6 +105,7 @@ func ff_vp8_idct_dc_add_rvv, zve32x
|
|||
# fall through
|
||||
endfunc
|
||||
|
||||
# a3 = DC
|
||||
func ff_vp78_idct_dc_add_rvv, zve32x
|
||||
csrwi vxrm, 0
|
||||
vsetivli zero, 4, e8, mf4, ta, ma
|
||||
|
@ -121,6 +122,41 @@ func ff_vp78_idct_dc_add_rvv, zve32x
|
|||
ret
|
||||
endfunc
|
||||
|
||||
# VP8 front end of the shared 4-block DC-add routine.
# Loads block[0..3][0] through a1 and turns each coefficient into a DC
# value with (coeff + 4) >> 3, leaving the four results in v8.
# There is no ret: execution continues into the function that follows.
func ff_vp8_idct_dc_add4y_rvv, zve32x
|
||||
li t0, 32 # byte stride between block[i][0] and block[i+1][0]
|
||||
vsetivli zero, 4, e16, mf2, ta, ma # vl = 4, 16-bit elements
|
||||
vlse16.v v8, (a1), t0 # strided load of the four first coefficients
|
||||
vadd.vi v8, v8, 4 # rounding bias for the shift by 3
|
||||
vsra.vi v8, v8, 3 # arithmetic shift keeps the sign of the DC
|
||||
# fall through
|
||||
endfunc
|
||||
|
||||
# Shared tail of the VP7 and VP8 add4y entry points.
# Register use as seen in the body: a0 = pixel row pointer (advanced by a2
# per row), a1 = coefficient pointer (advanced by 32 bytes per iteration),
# a2 = row stride, v8 = four 16-bit DC values on entry.
# Each DC is added to four consecutive pixels of a 16-pixel row, for four
# rows; sums are clamped to the 0..255 range and stored back in place.
# One first coefficient, block[i][0], is cleared on each loop iteration.
# The symbol is marked .variant_cc because it takes an input in a vector
# register rather than following the standard calling convention.
.variant_cc ff_vp78_idct_dc_add4y_rvv
|
||||
# v8 = [dc0, dc1, dc2, dc3]
|
||||
func ff_vp78_idct_dc_add4y_rvv, zve32x
|
||||
vsetivli zero, 16, e16, m2, ta, ma # vl = 16, 16-bit elements
|
||||
vid.v v4 # v4 = 0, 1, 2, ..., 15
|
||||
vsrl.vi v4, v4, 2 # v4 = 0,0,0,0, 1,1,1,1, 2,2,2,2, 3,3,3,3
|
||||
vrgather.vv v0, v8, v4 # replicate each DC four times
|
||||
vsetvli zero, zero, e8, m1, ta, ma # pixel width, vl still 16
|
||||
li a4, 4 # loop counter: four rows
|
||||
1: # one 16-pixel row per iteration
|
||||
vle8.v v8, (a0) # load the row (v8 is free, DCs now live in v0)
|
||||
addi a4, a4, -1
|
||||
vwaddu.wv v16, v0, v8 # v16 (16-bit) = DC + zero-extended pixel
|
||||
sh zero, (a1) # clear the first coefficient of the current block
|
||||
vsetvli zero, zero, e16, m2, ta, ma # 16-bit view for the signed clamp
|
||||
vmax.vx v16, v16, zero # negative sums become 0
|
||||
addi a1, a1, 32 # next block (16 x int16_t further)
|
||||
vsetvli zero, zero, e8, m1, ta, ma # back to 8-bit for the narrowing store
|
||||
vnclipu.wi v8, v16, 0 # narrow to 8 bits, saturating at 255 (no shift)
|
||||
vse8.v v8, (a0) # write the row back in place
|
||||
add a0, a0, a2 # advance to the next row
|
||||
bnez a4, 1b
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
.macro vp8_idct_dc_add
|
||||
vlse32.v v0, (a0), a2
|
||||
lh a5, 0(a1)
|
||||
|
@ -143,16 +179,6 @@ endfunc
|
|||
addi a1, a1, 32
|
||||
.endm
|
||||
|
||||
# Macro-based version of ff_vp8_idct_dc_add4y_rvv: with vl = 4 at e8/mf4,
# it expands vp8_idct_dc_addy three times and vp8_idct_dc_add once, then
# returns.
# NOTE(review): the commit message says the previous VP8 function is
# replaced by the row-based implementation; confirm that this hunk shows
# the removed lines.
func ff_vp8_idct_dc_add4y_rvv, zve32x
|
||||
vsetivli zero, 4, e8, mf4, ta, ma # vl = 4, 8-bit elements
|
||||
.rept 3
|
||||
vp8_idct_dc_addy
|
||||
.endr
|
||||
vp8_idct_dc_add
|
||||
|
||||
ret
|
||||
endfunc
|
||||
|
||||
func ff_vp8_idct_dc_add4uv_rvv, zve32x
|
||||
vsetivli zero, 4, e8, mf4, ta, ma
|
||||
vp8_idct_dc_addy
|
||||
|
|
Loading…
Add table
Reference in a new issue