forked from FFmpeg/FFmpeg
aarch64/vvc: Add put_qpel_vx
put_luma_v_8_4x4_c: 1.0 ( 1.00x) put_luma_v_8_4x4_neon: 0.0 ( 0.00x) put_luma_v_8_8x8_c: 3.5 ( 1.00x) put_luma_v_8_8x8_neon: 0.5 ( 7.00x) put_luma_v_8_16x16_c: 13.8 ( 1.00x) put_luma_v_8_16x16_neon: 1.2 (11.00x) put_luma_v_8_32x32_c: 54.2 ( 1.00x) put_luma_v_8_32x32_neon: 5.0 (10.85x) put_luma_v_8_64x64_c: 217.5 ( 1.00x) put_luma_v_8_64x64_neon: 18.8 (11.60x) put_luma_v_8_128x128_c: 886.2 ( 1.00x) put_luma_v_8_128x128_neon: 74.0 (11.98x)
This commit is contained in:
parent
b051bc7cb8
commit
a0b52afd32
3 changed files with 115 additions and 0 deletions
|
@ -274,4 +274,12 @@ NEON8_FNPROTO_PARTIAL_6(qpel_h, (int16_t * dst,
|
|||
const uint8_t *_src, ptrdiff_t _srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width), _i8mm);
|
||||
|
||||
/* NEON vertical qpel "put" for 4-pixel-wide blocks of 8-bit samples.
 * The assembly implementation reads only dst, _src, _srcstride, height and
 * vf (the 8 vertical filter taps); hf and width are not referenced by it. */
void ff_vvc_put_qpel_v4_8_neon(int16_t *dst, const uint8_t *_src,
|
||||
ptrdiff_t _srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width);
|
||||
|
||||
/* NEON vertical qpel "put" for 8-bit samples, processed in 8-column strips.
 * The assembly implementation steps width down by 8 per strip and stops only
 * when it reaches exactly 0, so width must be a positive multiple of 8.
 * hf is not referenced by the implementation. */
void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src,
|
||||
ptrdiff_t _srcstride, int height,
|
||||
const int8_t *hf, const int8_t *vf, int width);
|
||||
|
||||
#endif
|
||||
|
|
|
@ -86,6 +86,11 @@ endconst
|
|||
sxtl v0.8h, v0.8b
|
||||
.endm
|
||||
|
||||
// Load the 8 signed 8-bit filter taps addressed by \freg and widen them to
// 16 bits, so that tap i ends up in v0.h[i] (the lanes calc_qpelh multiplies by).
// Clobbers: v0.
.macro vvc_load_qpel_filterh freg
|
||||
ld1 {v0.8b}, [\freg] // v0.8b = 8 taps read from [\freg]
|
||||
sxtl v0.8h, v0.8b // sign-extend each tap to a halfword
|
||||
.endm
|
||||
|
||||
.macro calc_qpelh dst, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
|
||||
smull \dst\().4s, \src0\().4h, v0.h[0]
|
||||
smlal \dst\().4s, \src1\().4h, v0.h[1]
|
||||
|
@ -95,11 +100,15 @@ endconst
|
|||
smlal \dst\().4s, \src5\().4h, v0.h[5]
|
||||
smlal \dst\().4s, \src6\().4h, v0.h[6]
|
||||
smlal \dst\().4s, \src7\().4h, v0.h[7]
|
||||
.ifc \op, sqxtn
|
||||
sqxtn \dst\().4h, \dst\().4s
|
||||
.else
|
||||
.ifc \op, sshr
|
||||
sshr \dst\().4s, \dst\().4s, \shift
|
||||
.else
|
||||
\op \dst\().4h, \dst\().4s, \shift
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro calc_qpelh2 dst, dstt, src0, src1, src2, src3, src4, src5, src6, src7, op, shift=6
|
||||
|
@ -111,11 +120,15 @@ endconst
|
|||
smlal2 \dstt\().4s, \src5\().8h, v0.h[5]
|
||||
smlal2 \dstt\().4s, \src6\().8h, v0.h[6]
|
||||
smlal2 \dstt\().4s, \src7\().8h, v0.h[7]
|
||||
.ifc \op, sqxtn2
|
||||
sqxtn2 \dst\().8h, \dstt\().4s
|
||||
.else
|
||||
.ifc \op, sshr
|
||||
sshr \dst\().4s, \dstt\().4s, \shift
|
||||
.else
|
||||
\op \dst\().8h, \dstt\().4s, \shift
|
||||
.endif
|
||||
.endif
|
||||
.endm
|
||||
|
||||
.macro calc_all
|
||||
|
@ -1000,6 +1013,93 @@ function ff_hevc_put_hevc_qpel_v64_8_neon, export=1
|
|||
ret
|
||||
endfunc
|
||||
|
||||
/* ff_hevc_put_hevc_qpel_vx requires the filter coefficients to have the signs
|
||||
* [-, +, -, +, +, -, +, -];
|
||||
* the vvc filters do not meet this requirement.
|
||||
*/
|
||||
// Vertical 8-tap filter over a 4-pixel-wide column of 8-bit samples, writing
// 16-bit results.
// Register use (C prototype arguments in AAPCS64 order):
//   x0 = dst (advanced by x9 after every stored row)
//   x1 = _src (rewound by 3 rows, then advanced one row per load)
//   x2 = _srcstride
//   w3 = height (row counter, decremented in calc)
//   x5 = vf (address of the 8 filter taps)
//   x4 (hf) and w6 (width) are not referenced here.
// Vector use: v0 = taps widened to 16 bit, v16-v23 = sliding window of source
// rows widened to 16 bit, v24 = filtered output row.
function ff_vvc_put_qpel_v4_8_neon, export=1
|
||||
vvc_load_qpel_filterh x5 // v0.h[0..7] = sign-extended taps from vf
|
||||
sub x1, x1, x2, lsl #1 // src -= 2 * stride
|
||||
mov x9, #(VVC_MAX_PB_SIZE * 2) // x9 = byte step applied to dst per output row
|
||||
sub x1, x1, x2 // src -= stride, i.e. 3 rows above the start in total
|
||||
// Preload 7 rows of 4 bytes each into v16..v22, zero-extending them to 16 bit.
// Loads and uxtl widenings are interleaved in the original scheduling.
ldr s16, [x1]
|
||||
ldr s17, [x1, x2]
|
||||
add x1, x1, x2, lsl #1
|
||||
ldr s18, [x1]
|
||||
ldr s19, [x1, x2]
|
||||
uxtl v16.8h, v16.8b
|
||||
uxtl v17.8h, v17.8b
|
||||
add x1, x1, x2, lsl #1
|
||||
ldr s20, [x1]
|
||||
ldr s21, [x1, x2]
|
||||
uxtl v18.8h, v18.8b
|
||||
uxtl v19.8h, v19.8b
|
||||
add x1, x1, x2, lsl #1
|
||||
ldr s22, [x1]
|
||||
add x1, x1, x2 // x1 now addresses the 8th row, the first one calc loads
|
||||
uxtl v20.8h, v20.8b
|
||||
uxtl v21.8h, v21.8b
|
||||
uxtl v22.8h, v22.8b
|
||||
// calc: produce one output row.
//   \tmp          register that receives the newly loaded source row
//   \src0..\src7  the 8 rows fed to the filter, in tap order
// The subs leaves the flags describing the remaining row count for the code
// that follows each expansion.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
|
||||
ld1 {\tmp\().s}[0], [x1], x2 // load 4 bytes into lane 0, then src += stride
|
||||
uxtl \tmp\().8h, \tmp\().8b // zero-extend the new row to 16 bit
|
||||
calc_qpelh v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn // 8-tap sum of the low 4 lanes, saturating narrow to 16 bit, no shift
|
||||
subs w3, w3, #1 // one row done; updates the flags
|
||||
st1 {v24.4h}, [x0], x9 // store 4 results, then dst += x9
|
||||
.endm
|
||||
1:
|
||||
// NOTE(review): calc_all is defined outside this view; it presumably expands
// calc with rotating row registers and branches to 1b / 2f on the flags set
// by subs - confirm against its definition.
calc_all
|
||||
.purgem calc // drop the local calc macro so the next function can define its own
|
||||
2:
|
||||
ret
|
||||
endfunc
|
||||
|
||||
// Vertical 8-tap filter over 8-bit samples, writing 16-bit results, processed
// as a sequence of 8-column strips.
// Register use (C prototype arguments in AAPCS64 order):
//   x0 = dst (start of the current strip, += 16 bytes per strip)
//   x1 = _src (rewound by 3 rows; start of the current strip, += 8 bytes per strip)
//   x2 = _srcstride
//   w3 = height (kept intact; copied to w11 for each strip)
//   x5 = vf (address of the 8 filter taps)
//   w6 = width (remaining columns, -= 8 per strip)
//   x4 (hf) is not referenced here.
// Scratch: x8 = source row pointer, x9 = dst row step in bytes, x10 = dst row
// pointer, w11 = rows left in the strip, v0 = taps, v16-v23 = sliding window
// of source rows, v24 = output row, v25 = accumulator for the upper 4 lanes.
function ff_vvc_put_qpel_v8_8_neon, export=1
|
||||
vvc_load_qpel_filterh x5 // v0.h[0..7] = sign-extended taps from vf
|
||||
sub x1, x1, x2, lsl #1 // src -= 2 * stride
|
||||
sub x1, x1, x2 // src -= stride, i.e. 3 rows above the start in total
|
||||
mov x9, #(VVC_MAX_PB_SIZE * 2) // x9 = byte step applied to dst per output row
|
||||
// Start of one 8-column strip.
0:
|
||||
mov x8, x1 // x8 = row pointer for this strip
|
||||
// Preload 7 rows of 8 bytes each into v16..v22, zero-extending them to 16 bit.
// Loads, pointer setup and uxtl widenings are interleaved in the original scheduling.
ldr d16, [x8]
|
||||
ldr d17, [x8, x2]
|
||||
mov x10, x0 // x10 = dst pointer for this strip
|
||||
mov w11, w3 // w11 = rows to produce in this strip
|
||||
add x8, x8, x2, lsl #1
|
||||
ldr d18, [x8]
|
||||
ldr d19, [x8, x2]
|
||||
uxtl v16.8h, v16.8b
|
||||
uxtl v17.8h, v17.8b
|
||||
add x8, x8, x2, lsl #1
|
||||
ldr d20, [x8]
|
||||
ldr d21, [x8, x2]
|
||||
uxtl v18.8h, v18.8b
|
||||
uxtl v19.8h, v19.8b
|
||||
add x8, x8, x2, lsl #1
|
||||
ldr d22, [x8]
|
||||
add x8, x8, x2 // x8 now addresses the 8th row, the first one calc loads
|
||||
uxtl v20.8h, v20.8b
|
||||
uxtl v21.8h, v21.8b
|
||||
uxtl v22.8h, v22.8b
|
||||
// calc: produce one 8-wide output row.
//   \tmp          register that receives the newly loaded source row
//   \src0..\src7  the 8 rows fed to the filter, in tap order
// The subs leaves the flags describing the remaining row count for the code
// that follows each expansion.
.macro calc tmp, src0, src1, src2, src3, src4, src5, src6, src7
|
||||
ld1 {\tmp\().8b}, [x8], x2 // load 8 bytes, then row pointer += stride
|
||||
uxtl \tmp\().8h, \tmp\().8b // zero-extend the new row to 16 bit
|
||||
calc_qpelh v24, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn // low 4 lanes: 8-tap sum, saturating narrow into v24.4h
|
||||
calc_qpelh2 v24, v25, \src0, \src1, \src2, \src3, \src4, \src5, \src6, \src7, sqxtn2 // high 4 lanes: sum in v25, saturating narrow into the upper half of v24
|
||||
subs w11, w11, #1 // one row done; updates the flags
|
||||
st1 {v24.8h}, [x10], x9 // store 8 results, then dst row pointer += x9
|
||||
.endm
|
||||
1:
|
||||
// NOTE(review): calc_all is defined outside this view; it presumably expands
// calc with rotating row registers and branches to 1b / 2f on the flags set
// by subs - confirm against its definition.
calc_all
|
||||
.purgem calc // drop the local calc macro so later code can define its own
|
||||
2:
|
||||
subs w6, w6, #8 // 8 columns finished; flags feed the b.ne below
|
||||
add x0, x0, #16 // next strip in dst: 8 results * 2 bytes (add leaves the flags alone)
|
||||
add x1, x1, #8 // next strip in src: 8 bytes
|
||||
b.ne 0b // repeat until w6 is exactly 0, so width must be a multiple of 8
|
||||
ret
|
||||
endfunc
|
||||
|
||||
function ff_hevc_put_hevc_qpel_bi_v4_8_neon, export=1
|
||||
load_qpel_filterb x7, x6
|
||||
sub x2, x2, x3, lsl #1
|
||||
|
|
|
@ -60,6 +60,13 @@ void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
|
|||
c->inter.put[0][5][0][1] =
|
||||
c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
|
||||
|
||||
c->inter.put[0][1][1][0] = ff_vvc_put_qpel_v4_8_neon;
|
||||
c->inter.put[0][2][1][0] =
|
||||
c->inter.put[0][3][1][0] =
|
||||
c->inter.put[0][4][1][0] =
|
||||
c->inter.put[0][5][1][0] =
|
||||
c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
|
||||
|
||||
c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
|
||||
c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
|
||||
c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
|
||||
|
|
Loading…
Add table
Reference in a new issue