swscale/aarch64: add neon {lum,chr}ConvertRange16

aarch64 A55:
chrRangeFromJpeg16_1920_c:    32684.2
chrRangeFromJpeg16_1920_neon:  8431.2 (3.88x)
chrRangeToJpeg16_1920_c:      24996.8
chrRangeToJpeg16_1920_neon:    9395.0 (2.66x)
lumRangeFromJpeg16_1920_c:    17305.2
lumRangeFromJpeg16_1920_neon:  4586.5 (3.77x)
lumRangeToJpeg16_1920_c:      21144.8
lumRangeToJpeg16_1920_neon:    5069.8 (4.17x)

aarch64 A76:
chrRangeFromJpeg16_1920_c:    11523.8
chrRangeFromJpeg16_1920_neon:  3367.5 (3.42x)
chrRangeToJpeg16_1920_c:      11655.2
chrRangeToJpeg16_1920_neon:    4087.2 (2.85x)
lumRangeFromJpeg16_1920_c:     5762.0
lumRangeFromJpeg16_1920_neon:  1815.8 (3.17x)
lumRangeToJpeg16_1920_c:       5946.2
lumRangeToJpeg16_1920_neon:    2148.2 (2.77x)
This commit is contained in:
Ramiro Polla 2024-09-22 16:31:54 +02:00
parent 87052c0933
commit ca889b1328
2 changed files with 116 additions and 18 deletions

View file

@ -20,12 +20,42 @@
#include "libavutil/aarch64/asm.S"
// NOTE(review): the next two lines repeat the macro/function header without
// the bit_depth parameter. They look like the removed side of the diff this
// text was taken from - confirm before trying to assemble this.
.macro lumConvertRange fromto
function ff_lumRange\fromto\()Jpeg_neon, export=1
// lumConvertRange fromto, bit_depth
// Emits one exported function named ff_lumRange<fromto>Jpeg<bit_depth>_neon.
// The function rewrites the buffer at x0 in place: each loop iteration loads
// 8 samples from [x0], transforms them and stores them back to [x0] with
// post-increment. The loop body always runs at least once and continues
// while the remaining width is > 0 after subtracting 8.
// NOTE(review): widths that are not a multiple of 8 are therefore processed
// rounded up to the next multiple of 8 - presumably callers pad; confirm.
.macro lumConvertRange fromto, bit_depth
function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1
// x0 int16_t *dst
// w1 int width
// w2 uint32_t coeff
// x3 int64_t offset
// 16-bit path: dst is accessed as 32-bit lanes (.4s), 32 bytes per iteration,
// even though the pointer is documented above as int16_t *.
.if \bit_depth == 16
.ifc \fromto, To
// Build the upper clamp used by smin below:
// v24 = (8 << 16) - 1 = (1 << 19) - 1 in every 32-bit lane.
// v25 only holds the constant 1 here; it is overwritten with coeff next.
movi v25.4s, #1
movi v24.4s, #1<<3, lsl #16
sub v24.4s, v24.4s, v25.4s
.endif
// v25 = coeff in every 32-bit lane, v26 = full 64-bit offset in both lanes.
dup v25.4s, w2
dup v26.2d, x3
1:
// Load 8 samples; x0 is not advanced here, the store below does that.
ld1 {v0.4s, v1.4s}, [x0]
// Seed four 64-bit accumulator pairs with offset.
mov v16.16b, v26.16b
mov v17.16b, v26.16b
mov v18.16b, v26.16b
mov v19.16b, v26.16b
// acc = offset + sample * coeff, signed 32x32 -> 64-bit widening multiply.
smlal v16.2d, v0.2s, v25.2s
smlal2 v17.2d, v0.4s, v25.4s
smlal v18.2d, v1.2s, v25.2s
smlal2 v19.2d, v1.4s, v25.4s
// result = acc >> 18, narrowed back to 32-bit lanes (truncating, no
// rounding or saturation).
shrn v0.2s, v16.2d, 18
shrn2 v0.4s, v17.2d, 18
shrn v1.2s, v18.2d, 18
shrn2 v1.4s, v19.2d, 18
// width -= 8; the flags set here are consumed by b.gt at the loop end.
subs w1, w1, #8
.ifc \fromto, To
// "To" variant only: clamp each result to at most (1 << 19) - 1 (signed min).
smin v0.4s, v0.4s, v24.4s
smin v1.4s, v1.4s, v24.4s
.endif
// Store the 8 results over the inputs and advance dst by 32 bytes.
st1 {v0.4s, v1.4s}, [x0], #32
b.gt 1b
.else
// Any other bit_depth: coeff and the low 32 bits of offset (w3) are
// broadcast to 32-bit lanes. The middle of this path is not visible in
// this view (hunk header below); the visible tail stores eight 16-bit
// samples (16 bytes) per iteration.
dup v25.4s, w2
dup v26.4s, w3
1:
@ -46,17 +76,64 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1
subs w1, w1, #8
st1 {v0.8h}, [x0], #16
b.gt 1b
.endif
// Shared return for both bit-depth paths.
ret
endfunc
.endm
// NOTE(review): the next two lines repeat the macro/function header without
// the bit_depth parameter. They look like the removed side of the diff this
// text was taken from - confirm before trying to assemble this.
.macro chrConvertRange fromto
function ff_chrRange\fromto\()Jpeg_neon, export=1
// chrConvertRange fromto, bit_depth
// Emits one exported function named ff_chrRange<fromto>Jpeg<bit_depth>_neon.
// The function rewrites the two buffers at x0 and x1 in place with the same
// coeff/offset: each loop iteration loads 8 samples from each buffer,
// transforms them and stores them back with post-increment. The loop body
// always runs at least once and continues while the remaining width is > 0
// after subtracting 8.
// NOTE(review): widths that are not a multiple of 8 are therefore processed
// rounded up to the next multiple of 8 - presumably callers pad; confirm.
.macro chrConvertRange fromto, bit_depth
function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1
// x0 int16_t *dstU
// x1 int16_t *dstV
// w2 int width
// w3 uint32_t coeff
// x4 int64_t offset
// 16-bit path: dstU/dstV are accessed as 32-bit lanes (.4s), 32 bytes per
// buffer per iteration, even though the pointers are documented above as
// int16_t *.
.if \bit_depth == 16
.ifc \fromto, To
// Build the upper clamp used by smin below:
// v24 = (8 << 16) - 1 = (1 << 19) - 1 in every 32-bit lane.
// v25 only holds the constant 1 here; it is overwritten with coeff next.
movi v25.4s, #1
movi v24.4s, #1<<3, lsl #16
sub v24.4s, v24.4s, v25.4s
.endif
// v25 = coeff in every 32-bit lane, v26 = full 64-bit offset in both lanes.
dup v25.4s, w3
dup v26.2d, x4
1:
// Load 8 U samples (v0, v1) and 8 V samples (v2, v3); the pointers are
// advanced by the stores below, not here.
ld1 {v0.4s, v1.4s}, [x0]
ld1 {v2.4s, v3.4s}, [x1]
// Seed eight 64-bit accumulator pairs with offset (v16-v19 for U,
// v20-v23 for V).
mov v16.16b, v26.16b
mov v17.16b, v26.16b
mov v18.16b, v26.16b
mov v19.16b, v26.16b
mov v20.16b, v26.16b
mov v21.16b, v26.16b
mov v22.16b, v26.16b
mov v23.16b, v26.16b
// acc = offset + sample * coeff, signed 32x32 -> 64-bit widening multiply.
smlal v16.2d, v0.2s, v25.2s
smlal2 v17.2d, v0.4s, v25.4s
smlal v18.2d, v1.2s, v25.2s
smlal2 v19.2d, v1.4s, v25.4s
smlal v20.2d, v2.2s, v25.2s
smlal2 v21.2d, v2.4s, v25.4s
smlal v22.2d, v3.2s, v25.2s
smlal2 v23.2d, v3.4s, v25.4s
// result = acc >> 18, narrowed back to 32-bit lanes (truncating, no
// rounding or saturation).
shrn v0.2s, v16.2d, 18
shrn2 v0.4s, v17.2d, 18
shrn v1.2s, v18.2d, 18
shrn2 v1.4s, v19.2d, 18
shrn v2.2s, v20.2d, 18
shrn2 v2.4s, v21.2d, 18
shrn v3.2s, v22.2d, 18
shrn2 v3.4s, v23.2d, 18
// width -= 8; the flags set here are consumed by b.gt at the loop end.
subs w2, w2, #8
.ifc \fromto, To
// "To" variant only: clamp each result to at most (1 << 19) - 1 (signed min).
smin v0.4s, v0.4s, v24.4s
smin v1.4s, v1.4s, v24.4s
smin v2.4s, v2.4s, v24.4s
smin v3.4s, v3.4s, v24.4s
.endif
// Store the results over the inputs and advance each pointer by 32 bytes.
st1 {v0.4s, v1.4s}, [x0], #32
st1 {v2.4s, v3.4s}, [x1], #32
b.gt 1b
.else
// Any other bit_depth: coeff and the low 32 bits of offset (w4) are
// broadcast to 32-bit lanes. The middle of this path is not visible in
// this view (hunk header below); the visible tail stores eight 16-bit
// samples (16 bytes) to each buffer per iteration.
dup v25.4s, w3
dup v26.4s, w4
1:
@ -89,11 +166,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1
st1 {v0.8h}, [x0], #16
st1 {v1.8h}, [x1], #16
b.gt 1b
.endif
// Shared return for both bit-depth paths.
ret
endfunc
.endm
// Macro instantiations. The two-argument forms emit the functions
// ff_{lum,chr}Range{To,From}Jpeg{8,16}_neon; bit_depth 16 selects the
// 32-bit-lane path of each macro, 8 selects the other path.
// NOTE(review): the four one-argument invocations below pass no bit_depth.
// They look like the removed side of the diff this text was taken from -
// confirm they are absent from the real file.
lumConvertRange To
chrConvertRange To
lumConvertRange From
chrConvertRange From
lumConvertRange To, 8
lumConvertRange To, 16
chrConvertRange To, 8
chrConvertRange To, 16
lumConvertRange From, 8
lumConvertRange From, 16
chrConvertRange From, 8
chrConvertRange From, 16

View file

@ -218,14 +218,22 @@ NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
/*
 * Prototypes for the NEON range-conversion routines. The names with an 8 or
 * 16 suffix match the functions emitted by the lumConvertRange and
 * chrConvertRange assembly macros (ff_{lum,chr}Range{To,From}Jpeg{8,16}_neon).
 * Each routine takes the destination plane(s), a width, a coefficient and an
 * offset.
 *
 * NOTE(review): the 16 variants are declared with int16_t pointers, but the
 * assembly reads and writes those buffers as 32-bit lanes - confirm that
 * callers pass buffers of 32-bit samples.
 * NOTE(review): the declarations without a bit-depth suffix
 * (ff_*Jpeg_neon), some of them split mid-declaration, look like the removed
 * side of the diff this text was taken from - confirm they are absent from
 * the real file.
 */
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
void ff_lumRangeToJpeg8_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
void ff_lumRangeToJpeg16_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
{
@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
if (have_neon(cpu_flags)) {
if (c->dstBpc <= 14) {
if (c->opts.src_range) {
c->lumConvertRange = ff_lumRangeFromJpeg_neon;
c->chrConvertRange = ff_chrRangeFromJpeg_neon;
c->lumConvertRange = ff_lumRangeFromJpeg8_neon;
c->chrConvertRange = ff_chrRangeFromJpeg8_neon;
} else {
c->lumConvertRange = ff_lumRangeToJpeg_neon;
c->chrConvertRange = ff_chrRangeToJpeg_neon;
c->lumConvertRange = ff_lumRangeToJpeg8_neon;
c->chrConvertRange = ff_chrRangeToJpeg8_neon;
}
} else {
if (c->opts.src_range) {
c->lumConvertRange = ff_lumRangeFromJpeg16_neon;
c->chrConvertRange = ff_chrRangeFromJpeg16_neon;
} else {
c->lumConvertRange = ff_lumRangeToJpeg16_neon;
c->chrConvertRange = ff_chrRangeToJpeg16_neon;
}
}
}