swscale/x86: add sse4 and avx2 {lum,chr}ConvertRange16

chrRangeFromJpeg16_1920_c:    3153.9
chrRangeFromJpeg16_1920_sse4: 1770.0 (1.78x)
chrRangeFromJpeg16_1920_avx2:  891.5 (3.54x)
chrRangeToJpeg16_1920_c:      3165.0
chrRangeToJpeg16_1920_sse4:   1953.2 (1.62x)
chrRangeToJpeg16_1920_avx2:    973.0 (3.25x)
lumRangeFromJpeg16_1920_c:    1298.5
lumRangeFromJpeg16_1920_sse4:  886.5 (1.46x)
lumRangeFromJpeg16_1920_avx2:  447.7 (2.90x)
lumRangeToJpeg16_1920_c:      1905.0
lumRangeToJpeg16_1920_sse4:    993.0 (1.92x)
lumRangeToJpeg16_1920_avx2:    498.9 (3.82x)
This commit is contained in:
Ramiro Polla 2024-09-22 13:30:03 +02:00
parent 6fe4a4ffb6
commit 87052c0933
2 changed files with 121 additions and 45 deletions

View file

@ -20,21 +20,24 @@
%include "libavutil/x86/x86util.asm"
SECTION_RODATA
pack19: times 4 dd (1 << 19) - 1
SECTION .text
;-----------------------------------------------------------------------------
; lumConvertRange
;
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
; uint32_t coeff, int64_t offset);
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
; uint32_t coeff, int64_t offset);
; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
; uint32_t coeff, int64_t offset);
; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
%macro LUMCONVERTRANGE 1
cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
shl widthd, 1
%macro LUMCONVERTRANGE 2
cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
shl widthd, %2 >> 3
movd xm2, coeffd
VBROADCASTSS m2, xm2
%if ARCH_X86_64
@ -42,12 +45,34 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
%else
movq xm3, offsetm
%endif
%if %2 == 16
VBROADCASTSD m3, xm3
%ifidni %1,To
VBROADCASTI128 m4, [pack19]
%endif
%elif %2 == 8
VBROADCASTSS m3, xm3
pxor m4, m4
%endif ; %2 == 8/16
add dstq, widthq
neg widthq
.loop:
movu m0, [dstq+widthq]
%if %2 == 16
pshufd m1, m0, 0xb1
pmuldq m0, m2
pmuldq m1, m2
paddq m0, m3
paddq m1, m3
psrlq m0, 18
psrlq m1, 18
pshufd m0, m0, 0xd8
pshufd m1, m1, 0xd8
punpckldq m0, m1
%ifidni %1,To
PMINSD m0, m4, m1
%endif
%elif %2 == 8
punpckhwd m1, m0, m4
punpcklwd m0, m4
pmaddwd m0, m2
@ -57,6 +82,7 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
psrad m0, 14
psrad m1, 14
packssdw m0, m1
%endif ; %2 == 8/16
movu [dstq+widthq], m0
add widthq, mmsize
jl .loop
@ -66,16 +92,16 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
;-----------------------------------------------------------------------------
; chrConvertRange
;
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
; uint32_t coeff, int64_t offset);
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
; uint32_t coeff, int64_t offset);
; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
; uint32_t coeff, int64_t offset);
; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
; uint32_t coeff, int64_t offset);
;
;-----------------------------------------------------------------------------
%macro CHRCONVERTRANGE 1
cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
shl widthd, 1
%macro CHRCONVERTRANGE 2
cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
shl widthd, %2 >> 3
movd xm4, coeffd
VBROADCASTSS m4, xm4
%if ARCH_X86_64
@ -83,14 +109,47 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%else
movq xm5, offsetm
%endif
%if %2 == 16
VBROADCASTSD m5, xm5
%ifidni %1,To
VBROADCASTI128 m6, [pack19]
%endif
%elif %2 == 8
VBROADCASTSS m5, xm5
pxor m6, m6
%endif ; %2 == 8/16
add dstUq, widthq
add dstVq, widthq
neg widthq
.loop:
movu m0, [dstUq+widthq]
movu m2, [dstVq+widthq]
%if %2 == 16
pshufd m1, m0, 0xb1
pshufd m3, m2, 0xb1
pmuldq m0, m4
pmuldq m1, m4
pmuldq m2, m4
pmuldq m3, m4
paddq m0, m5
paddq m1, m5
paddq m2, m5
paddq m3, m5
psrlq m0, 18
psrlq m1, 18
psrlq m2, 18
psrlq m3, 18
pshufd m0, m0, 0xd8
pshufd m1, m1, 0xd8
pshufd m2, m2, 0xd8
pshufd m3, m3, 0xd8
punpckldq m0, m1
punpckldq m2, m3
%ifidni %1,To
PMINSD m0, m6, m1
PMINSD m2, m6, m3
%endif
%elif %2 == 8
punpckhwd m1, m0, m6
punpckhwd m3, m2, m6
punpcklwd m0, m6
@ -109,6 +168,7 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
psrad m3, 14
packssdw m0, m1
packssdw m2, m3
%endif ; %2 == 8/16
movu [dstUq+widthq], m0
movu [dstVq+widthq], m2
add widthq, mmsize
@ -117,15 +177,25 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
%endmacro
INIT_XMM sse2
LUMCONVERTRANGE To
CHRCONVERTRANGE To
LUMCONVERTRANGE From
CHRCONVERTRANGE From
LUMCONVERTRANGE To, 8
CHRCONVERTRANGE To, 8
LUMCONVERTRANGE From, 8
CHRCONVERTRANGE From, 8
INIT_XMM sse4
LUMCONVERTRANGE To, 16
CHRCONVERTRANGE To, 16
LUMCONVERTRANGE From, 16
CHRCONVERTRANGE From, 16
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
LUMCONVERTRANGE To
CHRCONVERTRANGE To
LUMCONVERTRANGE From
CHRCONVERTRANGE From
LUMCONVERTRANGE To, 8
LUMCONVERTRANGE To, 16
CHRCONVERTRANGE To, 8
CHRCONVERTRANGE To, 16
LUMCONVERTRANGE From, 8
LUMCONVERTRANGE From, 16
CHRCONVERTRANGE From, 8
CHRCONVERTRANGE From, 16
%endif

View file

@ -451,38 +451,44 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
#endif
#define RANGE_CONVERT_FUNCS(opt) do { \
if (c->dstBpc <= 14) { \
if (c->opts.src_range) { \
c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
} else { \
c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
} \
#define RANGE_CONVERT_FUNCS(opt, bpc) do { \
if (c->opts.src_range) { \
c->lumConvertRange = ff_lumRangeFromJpeg##bpc##_##opt; \
c->chrConvertRange = ff_chrRangeFromJpeg##bpc##_##opt; \
} else { \
c->lumConvertRange = ff_lumRangeToJpeg##bpc##_##opt; \
c->chrConvertRange = ff_chrRangeToJpeg##bpc##_##opt; \
} \
} while (0)
#define RANGE_CONVERT_FUNCS_DECL(opt) \
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
#define RANGE_CONVERT_FUNCS_DECL(opt, bpc) \
void ff_lumRangeFromJpeg##bpc##_##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
void ff_chrRangeFromJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
void ff_lumRangeToJpeg##bpc##_##opt(int16_t *dst, int width, \
uint32_t coeff, int64_t offset); \
void ff_chrRangeToJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
uint32_t coeff, int64_t offset); \
RANGE_CONVERT_FUNCS_DECL(sse2);
RANGE_CONVERT_FUNCS_DECL(avx2);
RANGE_CONVERT_FUNCS_DECL(sse2, 8)
RANGE_CONVERT_FUNCS_DECL(sse4, 16)
RANGE_CONVERT_FUNCS_DECL(avx2, 8)
RANGE_CONVERT_FUNCS_DECL(avx2, 16)
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
{
int cpu_flags = av_get_cpu_flags();
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
RANGE_CONVERT_FUNCS(avx2);
} else if (EXTERNAL_SSE2(cpu_flags)) {
RANGE_CONVERT_FUNCS(sse2);
if (c->dstBpc <= 14) {
RANGE_CONVERT_FUNCS(avx2, 8);
} else {
RANGE_CONVERT_FUNCS(avx2, 16);
}
} else if (EXTERNAL_SSE2(cpu_flags) && c->dstBpc <= 14) {
RANGE_CONVERT_FUNCS(sse2, 8);
} else if (EXTERNAL_SSE4(cpu_flags) && c->dstBpc > 14) {
RANGE_CONVERT_FUNCS(sse4, 16);
}
}