forked from FFmpeg/FFmpeg
swscale/aarch64/range_convert: update neon range_convert functions to new API
aarch64 A55:
chrRangeFromJpeg8_1920_c:    28835.2 (1.00x)
chrRangeFromJpeg8_1920_neon:  5313.9 (5.43x)    5308.4 (5.43x)
chrRangeToJpeg8_1920_c:      23074.7 (1.00x)
chrRangeToJpeg8_1920_neon:    5551.3 (4.16x)    5549.2 (4.16x)
lumRangeFromJpeg8_1920_c:    15389.7 (1.00x)
lumRangeFromJpeg8_1920_neon:  3152.3 (4.88x)    3147.7 (4.89x)
lumRangeToJpeg8_1920_c:      19227.8 (1.00x)
lumRangeToJpeg8_1920_neon:    3628.7 (5.30x)    3630.2 (5.30x)

aarch64 A76:
chrRangeFromJpeg8_1920_c:     6324.4 (1.00x)
chrRangeFromJpeg8_1920_neon:  2344.5 (2.70x)    2304.2 (2.74x)
chrRangeToJpeg8_1920_c:       9656.0 (1.00x)
chrRangeToJpeg8_1920_neon:    2824.2 (3.42x)    2794.2 (3.46x)
lumRangeFromJpeg8_1920_c:     4422.0 (1.00x)
lumRangeFromJpeg8_1920_neon:  1104.5 (4.00x)    1106.2 (4.00x)
lumRangeToJpeg8_1920_c:       5949.1 (1.00x)
lumRangeToJpeg8_1920_neon:    1329.8 (4.47x)    1328.2 (4.48x)
This commit is contained in:
parent
be108ebcf4
commit
6fe4a4ffb6
2 changed files with 39 additions and 37 deletions
|
@ -20,12 +20,13 @@
|
|||
|
||||
#include "libavutil/aarch64/asm.S"
|
||||
|
||||
.macro lumConvertRange name, fromto, mult, offset, shift
|
||||
function ff_\name, export=1
|
||||
mov w3, #\mult
|
||||
dup v25.4s, w3
|
||||
movz w3, #(\offset & 0xffff)
|
||||
movk w3, #((\offset >> 16) & 0xffff), lsl #16
|
||||
.macro lumConvertRange fromto
|
||||
function ff_lumRange\fromto\()Jpeg_neon, export=1
|
||||
// x0 int16_t *dst
|
||||
// w1 int width
|
||||
// w2 uint32_t coeff
|
||||
// x3 int64_t offset
|
||||
dup v25.4s, w2
|
||||
dup v26.4s, w3
|
||||
1:
|
||||
ld1 {v0.8h}, [x0]
|
||||
|
@ -36,11 +37,11 @@ function ff_\name, export=1
|
|||
mla v16.4s, v20.4s, v25.4s
|
||||
mla v18.4s, v22.4s, v25.4s
|
||||
.ifc \fromto, To
|
||||
sqshrn v0.4h, v16.4s, #\shift
|
||||
sqshrn2 v0.8h, v18.4s, #\shift
|
||||
sqshrn v0.4h, v16.4s, 14
|
||||
sqshrn2 v0.8h, v18.4s, 14
|
||||
.else
|
||||
shrn v0.4h, v16.4s, #\shift
|
||||
shrn2 v0.8h, v18.4s, #\shift
|
||||
shrn v0.4h, v16.4s, 14
|
||||
shrn2 v0.8h, v18.4s, 14
|
||||
.endif
|
||||
subs w1, w1, #8
|
||||
st1 {v0.8h}, [x0], #16
|
||||
|
@ -49,13 +50,15 @@ function ff_\name, export=1
|
|||
endfunc
|
||||
.endm
|
||||
|
||||
.macro chrConvertRange name, fromto, mult, offset, shift
|
||||
function ff_\name, export=1
|
||||
mov w3, #\mult
|
||||
.macro chrConvertRange fromto
|
||||
function ff_chrRange\fromto\()Jpeg_neon, export=1
|
||||
// x0 int16_t *dstU
|
||||
// x1 int16_t *dstV
|
||||
// w2 int width
|
||||
// w3 uint32_t coeff
|
||||
// x4 int64_t offset
|
||||
dup v25.4s, w3
|
||||
movz w3, #(\offset & 0xffff)
|
||||
movk w3, #((\offset >> 16) & 0xffff), lsl #16
|
||||
dup v26.4s, w3
|
||||
dup v26.4s, w4
|
||||
1:
|
||||
ld1 {v0.8h}, [x0]
|
||||
ld1 {v1.8h}, [x1]
|
||||
|
@ -72,15 +75,15 @@ function ff_\name, export=1
|
|||
mla v18.4s, v22.4s, v25.4s
|
||||
mla v19.4s, v23.4s, v25.4s
|
||||
.ifc \fromto, To
|
||||
sqshrn v0.4h, v16.4s, #\shift
|
||||
sqshrn v1.4h, v17.4s, #\shift
|
||||
sqshrn2 v0.8h, v18.4s, #\shift
|
||||
sqshrn2 v1.8h, v19.4s, #\shift
|
||||
sqshrn v0.4h, v16.4s, 14
|
||||
sqshrn v1.4h, v17.4s, 14
|
||||
sqshrn2 v0.8h, v18.4s, 14
|
||||
sqshrn2 v1.8h, v19.4s, 14
|
||||
.else
|
||||
shrn v0.4h, v16.4s, #\shift
|
||||
shrn v1.4h, v17.4s, #\shift
|
||||
shrn2 v0.8h, v18.4s, #\shift
|
||||
shrn2 v1.8h, v19.4s, #\shift
|
||||
shrn v0.4h, v16.4s, 14
|
||||
shrn v1.4h, v17.4s, 14
|
||||
shrn2 v0.8h, v18.4s, 14
|
||||
shrn2 v1.8h, v19.4s, 14
|
||||
.endif
|
||||
subs w2, w2, #8
|
||||
st1 {v0.8h}, [x0], #16
|
||||
|
@ -90,7 +93,7 @@ function ff_\name, export=1
|
|||
endfunc
|
||||
.endm
|
||||
|
||||
lumConvertRange lumRangeToJpeg_neon, To, 19077, -39057361, 14
|
||||
chrConvertRange chrRangeToJpeg_neon, To, 4663, -9289992, 12
|
||||
lumConvertRange lumRangeFromJpeg_neon, From, 14071, 33561947, 14
|
||||
chrConvertRange chrRangeFromJpeg_neon, From, 1799, 4081085, 11
|
||||
lumConvertRange To
|
||||
chrConvertRange To
|
||||
lumConvertRange From
|
||||
chrConvertRange From
|
||||
|
|
|
@ -218,17 +218,17 @@ NEON_INPUT(bgra32);
|
|||
NEON_INPUT(rgb24);
|
||||
NEON_INPUT(rgba32);
|
||||
|
||||
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width);
|
||||
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
|
||||
void ff_lumRangeToJpeg_neon(int16_t *dst, int width);
|
||||
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width);
|
||||
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
|
||||
uint32_t coeff, int64_t offset);
|
||||
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||
uint32_t coeff, int64_t offset);
|
||||
void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
|
||||
uint32_t coeff, int64_t offset);
|
||||
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||
uint32_t coeff, int64_t offset);
|
||||
|
||||
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
|
||||
{
|
||||
/* This code is currently disabled because of changes in the base
|
||||
* implementation of these functions. This code should be enabled
|
||||
* again once those changes are ported to this architecture. */
|
||||
#if 0
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
|
||||
if (have_neon(cpu_flags)) {
|
||||
|
@ -242,7 +242,6 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
|
|||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
av_cold void ff_sws_init_swscale_aarch64(SwsInternal *c)
|
||||
|
|
Loading…
Add table
Reference in a new issue