forked from FFmpeg/FFmpeg
swscale/x86: add sse4 and avx2 {lum,chr}ConvertRange16
chrRangeFromJpeg16_1920_c:    3153.9
chrRangeFromJpeg16_1920_sse4: 1770.0 (1.78x)
chrRangeFromJpeg16_1920_avx2:  891.5 (3.54x)
chrRangeToJpeg16_1920_c:      3165.0
chrRangeToJpeg16_1920_sse4:   1953.2 (1.62x)
chrRangeToJpeg16_1920_avx2:    973.0 (3.25x)
lumRangeFromJpeg16_1920_c:    1298.5
lumRangeFromJpeg16_1920_sse4:  886.5 (1.46x)
lumRangeFromJpeg16_1920_avx2:  447.7 (2.90x)
lumRangeToJpeg16_1920_c:      1905.0
lumRangeToJpeg16_1920_sse4:    993.0 (1.92x)
lumRangeToJpeg16_1920_avx2:    498.9 (3.82x)
This commit is contained in:
parent
6fe4a4ffb6
commit
87052c0933
2 changed files with 121 additions and 45 deletions
|
@ -20,21 +20,24 @@
|
|||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
pack19: times 4 dd (1 << 19) - 1
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; lumConvertRange
|
||||
;
|
||||
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_lumRangeToJpeg{8,16}_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_lumRangeFromJpeg{8,16}_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
;
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro LUMCONVERTRANGE 1
|
||||
cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
|
||||
shl widthd, 1
|
||||
%macro LUMCONVERTRANGE 2
|
||||
cglobal lumRange%1Jpeg%2, 4, 4, 5, dst, width, coeff, offset
|
||||
shl widthd, %2 >> 3
|
||||
movd xm2, coeffd
|
||||
VBROADCASTSS m2, xm2
|
||||
%if ARCH_X86_64
|
||||
|
@ -42,12 +45,34 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
|
|||
%else
|
||||
movq xm3, offsetm
|
||||
%endif
|
||||
%if %2 == 16
|
||||
VBROADCASTSD m3, xm3
|
||||
%ifidni %1,To
|
||||
VBROADCASTI128 m4, [pack19]
|
||||
%endif
|
||||
%elif %2 == 8
|
||||
VBROADCASTSS m3, xm3
|
||||
pxor m4, m4
|
||||
%endif ; %2 == 8/16
|
||||
add dstq, widthq
|
||||
neg widthq
|
||||
.loop:
|
||||
movu m0, [dstq+widthq]
|
||||
%if %2 == 16
|
||||
pshufd m1, m0, 0xb1
|
||||
pmuldq m0, m2
|
||||
pmuldq m1, m2
|
||||
paddq m0, m3
|
||||
paddq m1, m3
|
||||
psrlq m0, 18
|
||||
psrlq m1, 18
|
||||
pshufd m0, m0, 0xd8
|
||||
pshufd m1, m1, 0xd8
|
||||
punpckldq m0, m1
|
||||
%ifidni %1,To
|
||||
PMINSD m0, m4, m1
|
||||
%endif
|
||||
%elif %2 == 8
|
||||
punpckhwd m1, m0, m4
|
||||
punpcklwd m0, m4
|
||||
pmaddwd m0, m2
|
||||
|
@ -57,6 +82,7 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
|
|||
psrad m0, 14
|
||||
psrad m1, 14
|
||||
packssdw m0, m1
|
||||
%endif ; %2 == 8/16
|
||||
movu [dstq+widthq], m0
|
||||
add widthq, mmsize
|
||||
jl .loop
|
||||
|
@ -66,16 +92,16 @@ cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
|
|||
;-----------------------------------------------------------------------------
|
||||
; chrConvertRange
|
||||
;
|
||||
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_chrRangeToJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_chrRangeFromJpeg{8,16}_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
;
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro CHRCONVERTRANGE 1
|
||||
cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
|
||||
shl widthd, 1
|
||||
%macro CHRCONVERTRANGE 2
|
||||
cglobal chrRange%1Jpeg%2, 5, 5, 7, dstU, dstV, width, coeff, offset
|
||||
shl widthd, %2 >> 3
|
||||
movd xm4, coeffd
|
||||
VBROADCASTSS m4, xm4
|
||||
%if ARCH_X86_64
|
||||
|
@ -83,14 +109,47 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
|
|||
%else
|
||||
movq xm5, offsetm
|
||||
%endif
|
||||
%if %2 == 16
|
||||
VBROADCASTSD m5, xm5
|
||||
%ifidni %1,To
|
||||
VBROADCASTI128 m6, [pack19]
|
||||
%endif
|
||||
%elif %2 == 8
|
||||
VBROADCASTSS m5, xm5
|
||||
pxor m6, m6
|
||||
%endif ; %2 == 8/16
|
||||
add dstUq, widthq
|
||||
add dstVq, widthq
|
||||
neg widthq
|
||||
.loop:
|
||||
movu m0, [dstUq+widthq]
|
||||
movu m2, [dstVq+widthq]
|
||||
%if %2 == 16
|
||||
pshufd m1, m0, 0xb1
|
||||
pshufd m3, m2, 0xb1
|
||||
pmuldq m0, m4
|
||||
pmuldq m1, m4
|
||||
pmuldq m2, m4
|
||||
pmuldq m3, m4
|
||||
paddq m0, m5
|
||||
paddq m1, m5
|
||||
paddq m2, m5
|
||||
paddq m3, m5
|
||||
psrlq m0, 18
|
||||
psrlq m1, 18
|
||||
psrlq m2, 18
|
||||
psrlq m3, 18
|
||||
pshufd m0, m0, 0xd8
|
||||
pshufd m1, m1, 0xd8
|
||||
pshufd m2, m2, 0xd8
|
||||
pshufd m3, m3, 0xd8
|
||||
punpckldq m0, m1
|
||||
punpckldq m2, m3
|
||||
%ifidni %1,To
|
||||
PMINSD m0, m6, m1
|
||||
PMINSD m2, m6, m3
|
||||
%endif
|
||||
%elif %2 == 8
|
||||
punpckhwd m1, m0, m6
|
||||
punpckhwd m3, m2, m6
|
||||
punpcklwd m0, m6
|
||||
|
@ -109,6 +168,7 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
|
|||
psrad m3, 14
|
||||
packssdw m0, m1
|
||||
packssdw m2, m3
|
||||
%endif ; %2 == 8/16
|
||||
movu [dstUq+widthq], m0
|
||||
movu [dstVq+widthq], m2
|
||||
add widthq, mmsize
|
||||
|
@ -117,15 +177,25 @@ cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
|
|||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
LUMCONVERTRANGE To
|
||||
CHRCONVERTRANGE To
|
||||
LUMCONVERTRANGE From
|
||||
CHRCONVERTRANGE From
|
||||
LUMCONVERTRANGE To, 8
|
||||
CHRCONVERTRANGE To, 8
|
||||
LUMCONVERTRANGE From, 8
|
||||
CHRCONVERTRANGE From, 8
|
||||
|
||||
INIT_XMM sse4
|
||||
LUMCONVERTRANGE To, 16
|
||||
CHRCONVERTRANGE To, 16
|
||||
LUMCONVERTRANGE From, 16
|
||||
CHRCONVERTRANGE From, 16
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
LUMCONVERTRANGE To
|
||||
CHRCONVERTRANGE To
|
||||
LUMCONVERTRANGE From
|
||||
CHRCONVERTRANGE From
|
||||
LUMCONVERTRANGE To, 8
|
||||
LUMCONVERTRANGE To, 16
|
||||
CHRCONVERTRANGE To, 8
|
||||
CHRCONVERTRANGE To, 16
|
||||
LUMCONVERTRANGE From, 8
|
||||
LUMCONVERTRANGE From, 16
|
||||
CHRCONVERTRANGE From, 8
|
||||
CHRCONVERTRANGE From, 16
|
||||
%endif
|
||||
|
|
|
@ -451,38 +451,44 @@ INPUT_PLANAR_RGB_UV_ALL_DECL(avx2);
|
|||
INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
|
||||
#endif
|
||||
|
||||
#define RANGE_CONVERT_FUNCS(opt) do { \
|
||||
if (c->dstBpc <= 14) { \
|
||||
if (c->opts.src_range) { \
|
||||
c->lumConvertRange = ff_lumRangeFromJpeg_ ##opt; \
|
||||
c->chrConvertRange = ff_chrRangeFromJpeg_ ##opt; \
|
||||
} else { \
|
||||
c->lumConvertRange = ff_lumRangeToJpeg_ ##opt; \
|
||||
c->chrConvertRange = ff_chrRangeToJpeg_ ##opt; \
|
||||
} \
|
||||
#define RANGE_CONVERT_FUNCS(opt, bpc) do { \
|
||||
if (c->opts.src_range) { \
|
||||
c->lumConvertRange = ff_lumRangeFromJpeg##bpc##_##opt; \
|
||||
c->chrConvertRange = ff_chrRangeFromJpeg##bpc##_##opt; \
|
||||
} else { \
|
||||
c->lumConvertRange = ff_lumRangeToJpeg##bpc##_##opt; \
|
||||
c->chrConvertRange = ff_chrRangeToJpeg##bpc##_##opt; \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
#define RANGE_CONVERT_FUNCS_DECL(opt) \
|
||||
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
#define RANGE_CONVERT_FUNCS_DECL(opt, bpc) \
|
||||
void ff_lumRangeFromJpeg##bpc##_##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeFromJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_lumRangeToJpeg##bpc##_##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeToJpeg##bpc##_##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
|
||||
RANGE_CONVERT_FUNCS_DECL(sse2);
|
||||
RANGE_CONVERT_FUNCS_DECL(avx2);
|
||||
RANGE_CONVERT_FUNCS_DECL(sse2, 8)
|
||||
RANGE_CONVERT_FUNCS_DECL(sse4, 16)
|
||||
RANGE_CONVERT_FUNCS_DECL(avx2, 8)
|
||||
RANGE_CONVERT_FUNCS_DECL(avx2, 16)
|
||||
|
||||
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
|
||||
{
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
RANGE_CONVERT_FUNCS(avx2);
|
||||
} else if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
RANGE_CONVERT_FUNCS(sse2);
|
||||
if (c->dstBpc <= 14) {
|
||||
RANGE_CONVERT_FUNCS(avx2, 8);
|
||||
} else {
|
||||
RANGE_CONVERT_FUNCS(avx2, 16);
|
||||
}
|
||||
} else if (EXTERNAL_SSE2(cpu_flags) && c->dstBpc <= 14) {
|
||||
RANGE_CONVERT_FUNCS(sse2, 8);
|
||||
} else if (EXTERNAL_SSE4(cpu_flags) && c->dstBpc > 14) {
|
||||
RANGE_CONVERT_FUNCS(sse4, 16);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Reference in a new issue