swscale/aarch64/range_convert: update neon range_convert functions to new API

aarch64 A55:
chrRangeFromJpeg8_1920_c:    28835.2 (1.00x)
chrRangeFromJpeg8_1920_neon:  5313.9 (5.43x)  5308.4 (5.43x)
chrRangeToJpeg8_1920_c:      23074.7 (1.00x)
chrRangeToJpeg8_1920_neon:    5551.3 (4.16x)  5549.2 (4.16x)
lumRangeFromJpeg8_1920_c:    15389.7 (1.00x)
lumRangeFromJpeg8_1920_neon:  3152.3 (4.88x)  3147.7 (4.89x)
lumRangeToJpeg8_1920_c:      19227.8 (1.00x)
lumRangeToJpeg8_1920_neon:    3628.7 (5.30x)  3630.2 (5.30x)

aarch64 A76:
chrRangeFromJpeg8_1920_c:    6324.4 (1.00x)
chrRangeFromJpeg8_1920_neon: 2344.5 (2.70x) 2304.2 (2.74x)
chrRangeToJpeg8_1920_c:      9656.0 (1.00x)
chrRangeToJpeg8_1920_neon:   2824.2 (3.42x) 2794.2 (3.46x)
lumRangeFromJpeg8_1920_c:    4422.0 (1.00x)
lumRangeFromJpeg8_1920_neon: 1104.5 (4.00x) 1106.2 (4.00x)
lumRangeToJpeg8_1920_c:      5949.1 (1.00x)
lumRangeToJpeg8_1920_neon:   1329.8 (4.47x) 1328.2 (4.48x)
This commit is contained in:
Ramiro Polla 2024-09-22 15:29:39 +02:00
parent be108ebcf4
commit 6fe4a4ffb6
2 changed files with 39 additions and 37 deletions

View file

@ -20,12 +20,13 @@
#include "libavutil/aarch64/asm.S"
.macro lumConvertRange name, fromto, mult, offset, shift
function ff_\name, export=1
mov w3, #\mult
dup v25.4s, w3
movz w3, #(\offset & 0xffff)
movk w3, #((\offset >> 16) & 0xffff), lsl #16
.macro lumConvertRange fromto
function ff_lumRange\fromto\()Jpeg_neon, export=1
// x0 int16_t *dst
// w1 int width
// w2 uint32_t coeff
// x3 int64_t offset (only the low 32 bits, w3, are broadcast into v26.4s)
dup v25.4s, w2
dup v26.4s, w3
1:
ld1 {v0.8h}, [x0]
@ -36,11 +37,11 @@ function ff_\name, export=1
mla v16.4s, v20.4s, v25.4s
mla v18.4s, v22.4s, v25.4s
.ifc \fromto, To
sqshrn v0.4h, v16.4s, #\shift
sqshrn2 v0.8h, v18.4s, #\shift
sqshrn v0.4h, v16.4s, 14
sqshrn2 v0.8h, v18.4s, 14
.else
shrn v0.4h, v16.4s, #\shift
shrn2 v0.8h, v18.4s, #\shift
shrn v0.4h, v16.4s, 14
shrn2 v0.8h, v18.4s, 14
.endif
subs w1, w1, #8
st1 {v0.8h}, [x0], #16
@ -49,13 +50,15 @@ function ff_\name, export=1
endfunc
.endm
.macro chrConvertRange name, fromto, mult, offset, shift
function ff_\name, export=1
mov w3, #\mult
.macro chrConvertRange fromto
function ff_chrRange\fromto\()Jpeg_neon, export=1
// x0 int16_t *dstU
// x1 int16_t *dstV
// w2 int width
// w3 uint32_t coeff
// x4 int64_t offset (only the low 32 bits, w4, are broadcast into v26.4s)
dup v25.4s, w3
movz w3, #(\offset & 0xffff)
movk w3, #((\offset >> 16) & 0xffff), lsl #16
dup v26.4s, w3
dup v26.4s, w4
1:
ld1 {v0.8h}, [x0]
ld1 {v1.8h}, [x1]
@ -72,15 +75,15 @@ function ff_\name, export=1
mla v18.4s, v22.4s, v25.4s
mla v19.4s, v23.4s, v25.4s
.ifc \fromto, To
sqshrn v0.4h, v16.4s, #\shift
sqshrn v1.4h, v17.4s, #\shift
sqshrn2 v0.8h, v18.4s, #\shift
sqshrn2 v1.8h, v19.4s, #\shift
sqshrn v0.4h, v16.4s, 14
sqshrn v1.4h, v17.4s, 14
sqshrn2 v0.8h, v18.4s, 14
sqshrn2 v1.8h, v19.4s, 14
.else
shrn v0.4h, v16.4s, #\shift
shrn v1.4h, v17.4s, #\shift
shrn2 v0.8h, v18.4s, #\shift
shrn2 v1.8h, v19.4s, #\shift
shrn v0.4h, v16.4s, 14
shrn v1.4h, v17.4s, 14
shrn2 v0.8h, v18.4s, 14
shrn2 v1.8h, v19.4s, 14
.endif
subs w2, w2, #8
st1 {v0.8h}, [x0], #16
@ -90,7 +93,7 @@ function ff_\name, export=1
endfunc
.endm
lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14
chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12
lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14
chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11
// Expand the macros once per direction. Each expansion emits one exported
// function named ff_{lum,chr}Range{To,From}Jpeg_neon:
//   ff_lumRangeToJpeg_neon,   ff_chrRangeToJpeg_neon,
//   ff_lumRangeFromJpeg_neon, ff_chrRangeFromJpeg_neon
// The "To" variants narrow with saturation (sqshrn); the "From" variants
// use a plain narrowing shift (shrn).
lumConvertRange To
chrConvertRange To
lumConvertRange From
chrConvertRange From

View file

@ -218,17 +218,17 @@ NEON_INPUT(bgra32);
NEON_INPUT(rgb24);
NEON_INPUT(rgba32);
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
/* NEON range-conversion routines implemented in assembly.
 *
 * dst / dstU / dstV point to int16_t sample buffers that are read and
 * written in place; width counts samples (the assembly consumes 8 per
 * loop iteration). coeff and offset are the conversion parameters passed
 * in by the caller.
 *
 * NOTE(review): the assembly broadcasts only the low 32 bits of offset
 * into its vector register -- confirm callers never need a wider value. */
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
uint32_t coeff, int64_t offset);
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
uint32_t coeff, int64_t offset);
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
{
/* This code is currently disabled because of changes in the base
* implementation of these functions. This code should be enabled
* again once those changes are ported to this architecture. */
#if 0
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags)) {
@ -242,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
}
}
}
#endif
}
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)