forked from FFmpeg/FFmpeg
swscale/x86/range_convert: update sse2 and avx2 range_convert functions to new API
chrRangeFromJpeg8_1920_c: 2127.4 (1.00x)
chrRangeFromJpeg8_1920_sse2: 816.0 (2.61x) 813.5 (2.62x)
chrRangeFromJpeg8_1920_avx2: 408.9 (5.20x) 405.4 (5.25x)
chrRangeToJpeg8_1920_c: 3166.9 (1.00x)
chrRangeToJpeg8_1920_sse2: 815.0 (3.89x) 815.0 (3.89x)
chrRangeToJpeg8_1920_avx2: 404.5 (7.83x) 405.5 (7.81x)
lumRangeFromJpeg8_1920_c: 1263.0 (1.00x)
lumRangeFromJpeg8_1920_sse2: 411.0 (3.07x) 413.2 (3.06x)
lumRangeFromJpeg8_1920_avx2: 200.5 (6.30x) 201.9 (6.26x)
lumRangeToJpeg8_1920_c: 1886.8 (1.00x)
lumRangeToJpeg8_1920_sse2: 412.0 (4.58x) 408.9 (4.61x)
lumRangeToJpeg8_1920_avx2: 208.5 (9.05x) 205.7 (9.17x)
This commit is contained in:
parent
384fe39623
commit
be108ebcf4
2 changed files with 50 additions and 53 deletions
|
@ -20,39 +20,29 @@
|
|||
|
||||
%include "libavutil/x86/x86util.asm"
|
||||
|
||||
SECTION_RODATA
|
||||
|
||||
chr_to_mult: times 4 dw 4663, 0
|
||||
chr_to_offset: times 4 dd -9289992
|
||||
%define chr_to_shift 12
|
||||
|
||||
chr_from_mult: times 4 dw 1799, 0
|
||||
chr_from_offset: times 4 dd 4081085
|
||||
%define chr_from_shift 11
|
||||
|
||||
lum_to_mult: times 4 dw 19077, 0
|
||||
lum_to_offset: times 4 dd -39057361
|
||||
%define lum_to_shift 14
|
||||
|
||||
lum_from_mult: times 4 dw 14071, 0
|
||||
lum_from_offset: times 4 dd 33561947
|
||||
%define lum_from_shift 14
|
||||
|
||||
SECTION .text
|
||||
|
||||
;-----------------------------------------------------------------------------
|
||||
; lumConvertRange
|
||||
;
|
||||
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width);
|
||||
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width);
|
||||
; void ff_lumRangeToJpeg_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_lumRangeFromJpeg_<opt>(int16_t *dst, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
;
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro LUMCONVERTRANGE 4
|
||||
cglobal %1, 2, 2, 5, dst, width
|
||||
%macro LUMCONVERTRANGE 1
|
||||
cglobal lumRange%1Jpeg, 4, 4, 5, dst, width, coeff, offset
|
||||
shl widthd, 1
|
||||
VBROADCASTI128 m2, [%2]
|
||||
VBROADCASTI128 m3, [%3]
|
||||
movd xm2, coeffd
|
||||
VBROADCASTSS m2, xm2
|
||||
%if ARCH_X86_64
|
||||
movq xm3, offsetq
|
||||
%else
|
||||
movq xm3, offsetm
|
||||
%endif
|
||||
VBROADCASTSS m3, xm3
|
||||
pxor m4, m4
|
||||
add dstq, widthq
|
||||
neg widthq
|
||||
|
@ -64,8 +54,8 @@ cglobal %1, 2, 2, 5, dst, width
|
|||
pmaddwd m1, m2
|
||||
paddd m0, m3
|
||||
paddd m1, m3
|
||||
psrad m0, %4
|
||||
psrad m1, %4
|
||||
psrad m0, 14
|
||||
psrad m1, 14
|
||||
packssdw m0, m1
|
||||
movu [dstq+widthq], m0
|
||||
add widthq, mmsize
|
||||
|
@ -76,16 +66,24 @@ cglobal %1, 2, 2, 5, dst, width
|
|||
;-----------------------------------------------------------------------------
|
||||
; chrConvertRange
|
||||
;
|
||||
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
|
||||
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width);
|
||||
; void ff_chrRangeToJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
; void ff_chrRangeFromJpeg_<opt>(int16_t *dstU, int16_t *dstV, int width,
|
||||
; uint32_t coeff, int64_t offset);
|
||||
;
|
||||
;-----------------------------------------------------------------------------
|
||||
|
||||
%macro CHRCONVERTRANGE 4
|
||||
cglobal %1, 3, 3, 7, dstU, dstV, width
|
||||
%macro CHRCONVERTRANGE 1
|
||||
cglobal chrRange%1Jpeg, 5, 5, 7, dstU, dstV, width, coeff, offset
|
||||
shl widthd, 1
|
||||
VBROADCASTI128 m4, [%2]
|
||||
VBROADCASTI128 m5, [%3]
|
||||
movd xm4, coeffd
|
||||
VBROADCASTSS m4, xm4
|
||||
%if ARCH_X86_64
|
||||
movq xm5, offsetq
|
||||
%else
|
||||
movq xm5, offsetm
|
||||
%endif
|
||||
VBROADCASTSS m5, xm5
|
||||
pxor m6, m6
|
||||
add dstUq, widthq
|
||||
add dstVq, widthq
|
||||
|
@ -105,10 +103,10 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
|
|||
paddd m1, m5
|
||||
paddd m2, m5
|
||||
paddd m3, m5
|
||||
psrad m0, %4
|
||||
psrad m1, %4
|
||||
psrad m2, %4
|
||||
psrad m3, %4
|
||||
psrad m0, 14
|
||||
psrad m1, 14
|
||||
psrad m2, 14
|
||||
psrad m3, 14
|
||||
packssdw m0, m1
|
||||
packssdw m2, m3
|
||||
movu [dstUq+widthq], m0
|
||||
|
@ -119,15 +117,15 @@ cglobal %1, 3, 3, 7, dstU, dstV, width
|
|||
%endmacro
|
||||
|
||||
INIT_XMM sse2
|
||||
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
|
||||
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
|
||||
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
|
||||
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
|
||||
LUMCONVERTRANGE To
|
||||
CHRCONVERTRANGE To
|
||||
LUMCONVERTRANGE From
|
||||
CHRCONVERTRANGE From
|
||||
|
||||
%if HAVE_AVX2_EXTERNAL
|
||||
INIT_YMM avx2
|
||||
LUMCONVERTRANGE lumRangeToJpeg, lum_to_mult, lum_to_offset, lum_to_shift
|
||||
CHRCONVERTRANGE chrRangeToJpeg, chr_to_mult, chr_to_offset, chr_to_shift
|
||||
LUMCONVERTRANGE lumRangeFromJpeg, lum_from_mult, lum_from_offset, lum_from_shift
|
||||
CHRCONVERTRANGE chrRangeFromJpeg, chr_from_mult, chr_from_offset, chr_from_shift
|
||||
LUMCONVERTRANGE To
|
||||
CHRCONVERTRANGE To
|
||||
LUMCONVERTRANGE From
|
||||
CHRCONVERTRANGE From
|
||||
%endif
|
||||
|
|
|
@ -464,27 +464,26 @@ INPUT_PLANAR_RGB_A_ALL_DECL(avx2);
|
|||
} while (0)
|
||||
|
||||
#define RANGE_CONVERT_FUNCS_DECL(opt) \
|
||||
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width); \
|
||||
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
|
||||
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width); \
|
||||
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width); \
|
||||
void ff_lumRangeFromJpeg_ ##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeFromJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_lumRangeToJpeg_ ##opt(int16_t *dst, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
void ff_chrRangeToJpeg_ ##opt(int16_t *dstU, int16_t *dstV, int width, \
|
||||
uint32_t coeff, int64_t offset); \
|
||||
|
||||
RANGE_CONVERT_FUNCS_DECL(sse2);
|
||||
RANGE_CONVERT_FUNCS_DECL(avx2);
|
||||
|
||||
av_cold void ff_sws_init_range_convert_x86(SwsInternal *c)
|
||||
{
|
||||
/* This code is currently disabled because of changes in the base
|
||||
* implementation of these functions. This code should be enabled
|
||||
* again once those changes are ported to this architecture. */
|
||||
#if 0
|
||||
int cpu_flags = av_get_cpu_flags();
|
||||
if (EXTERNAL_AVX2_FAST(cpu_flags)) {
|
||||
RANGE_CONVERT_FUNCS(avx2);
|
||||
} else if (EXTERNAL_SSE2(cpu_flags)) {
|
||||
RANGE_CONVERT_FUNCS(sse2);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
av_cold void ff_sws_init_swscale_x86(SwsInternal *c)
|
||||
|
|
Loading…
Add table
Reference in a new issue