forked from FFmpeg/FFmpeg
swscale/aarch64: add neon {lum,chr}ConvertRange16
aarch64 A55:
chrRangeFromJpeg16_1920_c:    32684.2
chrRangeFromJpeg16_1920_neon:  8431.2 (3.88x)
chrRangeToJpeg16_1920_c:      24996.8
chrRangeToJpeg16_1920_neon:    9395.0 (2.66x)
lumRangeFromJpeg16_1920_c:    17305.2
lumRangeFromJpeg16_1920_neon:  4586.5 (3.77x)
lumRangeToJpeg16_1920_c:      21144.8
lumRangeToJpeg16_1920_neon:    5069.8 (4.17x)

aarch64 A76:
chrRangeFromJpeg16_1920_c:    11523.8
chrRangeFromJpeg16_1920_neon:  3367.5 (3.42x)
chrRangeToJpeg16_1920_c:      11655.2
chrRangeToJpeg16_1920_neon:    4087.2 (2.85x)
lumRangeFromJpeg16_1920_c:     5762.0
lumRangeFromJpeg16_1920_neon:  1815.8 (3.17x)
lumRangeToJpeg16_1920_c:       5946.2
lumRangeToJpeg16_1920_neon:    2148.2 (2.77x)
This commit is contained in:
parent
87052c0933
commit
ca889b1328
2 changed files with 116 additions and 18 deletions
|
@ -20,12 +20,42 @@
|
||||||
|
|
||||||
#include "libavutil/aarch64/asm.S"
|
#include "libavutil/aarch64/asm.S"
|
||||||
|
|
||||||
.macro lumConvertRange fromto
|
.macro lumConvertRange fromto, bit_depth
|
||||||
function ff_lumRange\fromto\()Jpeg_neon, export=1
|
function ff_lumRange\fromto\()Jpeg\bit_depth\()_neon, export=1
|
||||||
// x0 int16_t *dst
|
// x0 int16_t *dst
|
||||||
// w1 int width
|
// w1 int width
|
||||||
// w2 uint32_t coeff
|
// w2 uint32_t coeff
|
||||||
// x3 int64_t offset
|
// x3 int64_t offset
|
||||||
|
.if \bit_depth == 16
|
||||||
|
.ifc \fromto, To
|
||||||
|
movi v25.4s, #1
|
||||||
|
movi v24.4s, #1<<3, lsl #16
|
||||||
|
sub v24.4s, v24.4s, v25.4s
|
||||||
|
.endif
|
||||||
|
dup v25.4s, w2
|
||||||
|
dup v26.2d, x3
|
||||||
|
1:
|
||||||
|
ld1 {v0.4s, v1.4s}, [x0]
|
||||||
|
mov v16.16b, v26.16b
|
||||||
|
mov v17.16b, v26.16b
|
||||||
|
mov v18.16b, v26.16b
|
||||||
|
mov v19.16b, v26.16b
|
||||||
|
smlal v16.2d, v0.2s, v25.2s
|
||||||
|
smlal2 v17.2d, v0.4s, v25.4s
|
||||||
|
smlal v18.2d, v1.2s, v25.2s
|
||||||
|
smlal2 v19.2d, v1.4s, v25.4s
|
||||||
|
shrn v0.2s, v16.2d, 18
|
||||||
|
shrn2 v0.4s, v17.2d, 18
|
||||||
|
shrn v1.2s, v18.2d, 18
|
||||||
|
shrn2 v1.4s, v19.2d, 18
|
||||||
|
subs w1, w1, #8
|
||||||
|
.ifc \fromto, To
|
||||||
|
smin v0.4s, v0.4s, v24.4s
|
||||||
|
smin v1.4s, v1.4s, v24.4s
|
||||||
|
.endif
|
||||||
|
st1 {v0.4s, v1.4s}, [x0], #32
|
||||||
|
b.gt 1b
|
||||||
|
.else
|
||||||
dup v25.4s, w2
|
dup v25.4s, w2
|
||||||
dup v26.4s, w3
|
dup v26.4s, w3
|
||||||
1:
|
1:
|
||||||
|
@ -46,17 +76,64 @@ function ff_lumRange\fromto\()Jpeg_neon, export=1
|
||||||
subs w1, w1, #8
|
subs w1, w1, #8
|
||||||
st1 {v0.8h}, [x0], #16
|
st1 {v0.8h}, [x0], #16
|
||||||
b.gt 1b
|
b.gt 1b
|
||||||
|
.endif
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
.macro chrConvertRange fromto
|
.macro chrConvertRange fromto, bit_depth
|
||||||
function ff_chrRange\fromto\()Jpeg_neon, export=1
|
function ff_chrRange\fromto\()Jpeg\bit_depth\()_neon, export=1
|
||||||
// x0 int16_t *dstU
|
// x0 int16_t *dstU
|
||||||
// x1 int16_t *dstV
|
// x1 int16_t *dstV
|
||||||
// w2 int width
|
// w2 int width
|
||||||
// w3 uint32_t coeff
|
// w3 uint32_t coeff
|
||||||
// x4 int64_t offset
|
// x4 int64_t offset
|
||||||
|
.if \bit_depth == 16
|
||||||
|
.ifc \fromto, To
|
||||||
|
movi v25.4s, #1
|
||||||
|
movi v24.4s, #1<<3, lsl #16
|
||||||
|
sub v24.4s, v24.4s, v25.4s
|
||||||
|
.endif
|
||||||
|
dup v25.4s, w3
|
||||||
|
dup v26.2d, x4
|
||||||
|
1:
|
||||||
|
ld1 {v0.4s, v1.4s}, [x0]
|
||||||
|
ld1 {v2.4s, v3.4s}, [x1]
|
||||||
|
mov v16.16b, v26.16b
|
||||||
|
mov v17.16b, v26.16b
|
||||||
|
mov v18.16b, v26.16b
|
||||||
|
mov v19.16b, v26.16b
|
||||||
|
mov v20.16b, v26.16b
|
||||||
|
mov v21.16b, v26.16b
|
||||||
|
mov v22.16b, v26.16b
|
||||||
|
mov v23.16b, v26.16b
|
||||||
|
smlal v16.2d, v0.2s, v25.2s
|
||||||
|
smlal2 v17.2d, v0.4s, v25.4s
|
||||||
|
smlal v18.2d, v1.2s, v25.2s
|
||||||
|
smlal2 v19.2d, v1.4s, v25.4s
|
||||||
|
smlal v20.2d, v2.2s, v25.2s
|
||||||
|
smlal2 v21.2d, v2.4s, v25.4s
|
||||||
|
smlal v22.2d, v3.2s, v25.2s
|
||||||
|
smlal2 v23.2d, v3.4s, v25.4s
|
||||||
|
shrn v0.2s, v16.2d, 18
|
||||||
|
shrn2 v0.4s, v17.2d, 18
|
||||||
|
shrn v1.2s, v18.2d, 18
|
||||||
|
shrn2 v1.4s, v19.2d, 18
|
||||||
|
shrn v2.2s, v20.2d, 18
|
||||||
|
shrn2 v2.4s, v21.2d, 18
|
||||||
|
shrn v3.2s, v22.2d, 18
|
||||||
|
shrn2 v3.4s, v23.2d, 18
|
||||||
|
subs w2, w2, #8
|
||||||
|
.ifc \fromto, To
|
||||||
|
smin v0.4s, v0.4s, v24.4s
|
||||||
|
smin v1.4s, v1.4s, v24.4s
|
||||||
|
smin v2.4s, v2.4s, v24.4s
|
||||||
|
smin v3.4s, v3.4s, v24.4s
|
||||||
|
.endif
|
||||||
|
st1 {v0.4s, v1.4s}, [x0], #32
|
||||||
|
st1 {v2.4s, v3.4s}, [x1], #32
|
||||||
|
b.gt 1b
|
||||||
|
.else
|
||||||
dup v25.4s, w3
|
dup v25.4s, w3
|
||||||
dup v26.4s, w4
|
dup v26.4s, w4
|
||||||
1:
|
1:
|
||||||
|
@ -89,11 +166,16 @@ function ff_chrRange\fromto\()Jpeg_neon, export=1
|
||||||
st1 {v0.8h}, [x0], #16
|
st1 {v0.8h}, [x0], #16
|
||||||
st1 {v1.8h}, [x1], #16
|
st1 {v1.8h}, [x1], #16
|
||||||
b.gt 1b
|
b.gt 1b
|
||||||
|
.endif
|
||||||
ret
|
ret
|
||||||
endfunc
|
endfunc
|
||||||
.endm
|
.endm
|
||||||
|
|
||||||
lumConvertRange To
|
lumConvertRange To, 8
|
||||||
chrConvertRange To
|
lumConvertRange To, 16
|
||||||
lumConvertRange From
|
chrConvertRange To, 8
|
||||||
chrConvertRange From
|
chrConvertRange To, 16
|
||||||
|
lumConvertRange From, 8
|
||||||
|
lumConvertRange From, 16
|
||||||
|
chrConvertRange From, 8
|
||||||
|
chrConvertRange From, 16
|
||||||
|
|
|
@ -218,14 +218,22 @@ NEON_INPUT(bgra32);
|
||||||
NEON_INPUT(rgb24);
|
NEON_INPUT(rgb24);
|
||||||
NEON_INPUT(rgba32);
|
NEON_INPUT(rgba32);
|
||||||
|
|
||||||
void ff_lumRangeFromJpeg_neon(int16_t *dst, int width,
|
void ff_lumRangeFromJpeg8_neon(int16_t *dst, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_chrRangeFromJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_lumRangeToJpeg8_neon(int16_t *dst, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_chrRangeToJpeg8_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_lumRangeFromJpeg16_neon(int16_t *dst, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_chrRangeFromJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||||
|
uint32_t coeff, int64_t offset);
|
||||||
|
void ff_lumRangeToJpeg16_neon(int16_t *dst, int width,
|
||||||
uint32_t coeff, int64_t offset);
|
uint32_t coeff, int64_t offset);
|
||||||
void ff_chrRangeFromJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
|
void ff_chrRangeToJpeg16_neon(int16_t *dstU, int16_t *dstV, int width,
|
||||||
uint32_t coeff, int64_t offset);
|
uint32_t coeff, int64_t offset);
|
||||||
void ff_lumRangeToJpeg_neon(int16_t *dst, int width,
|
|
||||||
uint32_t coeff, int64_t offset);
|
|
||||||
void ff_chrRangeToJpeg_neon(int16_t *dstU, int16_t *dstV, int width,
|
|
||||||
uint32_t coeff, int64_t offset);
|
|
||||||
|
|
||||||
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
|
av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
|
||||||
{
|
{
|
||||||
|
@ -234,11 +242,19 @@ av_cold void ff_sws_init_range_convert_aarch64(SwsInternal *c)
|
||||||
if (have_neon(cpu_flags)) {
|
if (have_neon(cpu_flags)) {
|
||||||
if (c->dstBpc <= 14) {
|
if (c->dstBpc <= 14) {
|
||||||
if (c->opts.src_range) {
|
if (c->opts.src_range) {
|
||||||
c->lumConvertRange = ff_lumRangeFromJpeg_neon;
|
c->lumConvertRange = ff_lumRangeFromJpeg8_neon;
|
||||||
c->chrConvertRange = ff_chrRangeFromJpeg_neon;
|
c->chrConvertRange = ff_chrRangeFromJpeg8_neon;
|
||||||
} else {
|
} else {
|
||||||
c->lumConvertRange = ff_lumRangeToJpeg_neon;
|
c->lumConvertRange = ff_lumRangeToJpeg8_neon;
|
||||||
c->chrConvertRange = ff_chrRangeToJpeg_neon;
|
c->chrConvertRange = ff_chrRangeToJpeg8_neon;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (c->opts.src_range) {
|
||||||
|
c->lumConvertRange = ff_lumRangeFromJpeg16_neon;
|
||||||
|
c->chrConvertRange = ff_chrRangeFromJpeg16_neon;
|
||||||
|
} else {
|
||||||
|
c->lumConvertRange = ff_lumRangeToJpeg16_neon;
|
||||||
|
c->chrConvertRange = ff_chrRangeToJpeg16_neon;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue