forked from FFmpeg/FFmpeg
lavc/h264dsp: unroll R-V V weight16
As VLSE128.V does not exist, we have no other way to deal with latency.

T-Head C908:
h264_weight16_8_c: 989.4 ( 1.00x)
h264_weight16_8_rvv_i32: 193.2 ( 5.12x)

SpacemiT X60:
h264_weight16_8_c: 874.1 ( 1.00x)
h264_weight16_8_rvv_i32: 196.9 ( 4.44x)
This commit is contained in:
parent
4936bb2508
commit
459a1512f1
2 changed files with 16 additions and 7 deletions
|
@ -97,7 +97,7 @@ av_cold void ff_h264dsp_init_riscv(H264DSPContext *dsp, const int bit_depth,
|
|||
const bool zvl128b = ff_rv_vlen_least(128);
|
||||
|
||||
if (bit_depth == 8) {
|
||||
if (zvl128b)
|
||||
if (zvl128b && (flags & AV_CPU_FLAG_RVB))
|
||||
dsp->weight_h264_pixels_tab[0] =
|
||||
ff_h264_weight_funcs_8_rvv[0].weight;
|
||||
if (flags & AV_CPU_FLAG_RVV_I64)
|
||||
|
|
|
@ -29,20 +29,29 @@
|
|||
#include "libavutil/riscv/asm.S"
|
||||
|
||||
.variant_cc ff_h264_weight_pixels_simple_8_rvv
|
||||
func ff_h264_weight_pixels_simple_8_rvv, zve32x
|
||||
func ff_h264_weight_pixels_simple_8_rvv, zve32x, b
|
||||
csrwi vxrm, 0
|
||||
sll a5, a5, a3
|
||||
1:
|
||||
vsetvli zero, t6, e16, m2, ta, ma
|
||||
vle8.v v8, (a0)
|
||||
addi a2, a2, -1
|
||||
add t0, a0, a1
|
||||
vle8.v v8, (a0)
|
||||
addi a2, a2, -2
|
||||
vle8.v v9, (t0)
|
||||
vzext.vf2 v24, v8
|
||||
vzext.vf2 v26, v9
|
||||
vmul.vx v16, v24, a4
|
||||
vmul.vx v18, v26, a4
|
||||
vsadd.vx v16, v16, a5
|
||||
vmax.vx v16, v16, zero
|
||||
vsetvli zero, zero, e8, m1, ta, ma
|
||||
vsadd.vx v18, v18, a5
|
||||
vmax.vx v16, v16, zero
|
||||
vmax.vx v18, v18, zero
|
||||
vsetvli zero, zero, e8, m1, ta, ma
|
||||
vnclipu.wx v8, v16, a3
|
||||
vse8.v v8, (a0)
|
||||
vnclipu.wx v9, v18, a3
|
||||
vse8.v v8, (a0)
|
||||
vse8.v v9, (t0)
|
||||
sh1add a0, a1, a0
|
||||
add a0, a0, a1
|
||||
bnez a2, 1b
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue